src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "glsl/ir_uniform.h"
  26 extern "C" {
  27 #include "main/context.h"
  28 #include "main/macros.h"
  29 #include "program/prog_parameter.h"
  30 #include "program/sampler.h"
  31 }
  32
  33 namespace brw {
  34
  35 vec4_instruction::vec4_instruction(vec4_visitor *v,
  36                                    enum opcode opcode, dst_reg dst,
  37                                    src_reg src0, src_reg src1, src_reg src2)
  38 {
  39    this->opcode = opcode;
  40    this->dst = dst;
  41    this->src[0] = src0;
  42    this->src[1] = src1;
  43    this->src[2] = src2;
  44    this->ir = v->base_ir;
  45    this->annotation = v->current_annotation;
  46 }
  47
  48 vec4_instruction *
  49 vec4_visitor::emit(vec4_instruction *inst)
  50 {
  51    this->instructions.push_tail(inst);
  52
  53    return inst;
  54 }
  55
  56 vec4_instruction *
  57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
  58 {
  59    new_inst->ir = inst->ir;
  60    new_inst->annotation = inst->annotation;
  61
  62    inst->insert_before(new_inst);
  63
  64    return inst;
  65 }
  66
  67 vec4_instruction *
  68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
  69                    src_reg src0, src_reg src1, src_reg src2)
  70 {
  71    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
  72                                              src0, src1, src2));
  73 }
  74
  75
  76 vec4_instruction *
  77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
  78 {
  79    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
  80 }
  81
  82 vec4_instruction *
  83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
  84 {
  85    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
  86 }
  87
  88 vec4_instruction *
  89 vec4_visitor::emit(enum opcode opcode)
  90 {
  91    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
  92 }
  93
  94 #define ALU1(op)                                                        \
  95    vec4_instruction *                                                   \
  96    vec4_visitor::op(dst_reg dst, src_reg src0)                          \
  97    {                                                                    \
  98       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
  99                                            src0);                       \
 100    }
 101
 102 #define ALU2(op)                                                        \
 103    vec4_instruction *                                                   \
 104    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)            \
 105    {                                                                    \
 106       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 107                                            src0, src1);                 \
 108    }
 109
 110 #define ALU3(op)                                                        \
 111    vec4_instruction *                                                   \
 112    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
 113    {                                                                    \
 114       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 115                                            src0, src1, src2);           \
 116    }
 117
 118 ALU1(NOT)
 119 ALU1(MOV)
 120 ALU1(FRC)
 121 ALU1(RNDD)
 122 ALU1(RNDE)
 123 ALU1(RNDZ)
 124 ALU1(F32TO16)
 125 ALU1(F16TO32)
 126 ALU2(ADD)
 127 ALU2(MUL)
 128 ALU2(MACH)
 129 ALU2(AND)
 130 ALU2(OR)
 131 ALU2(XOR)
 132 ALU2(DP3)
 133 ALU2(DP4)
 134 ALU2(DPH)
 135 ALU2(SHL)
 136 ALU2(SHR)
 137 ALU2(ASR)
 138 ALU3(LRP)
 139 ALU1(BFREV)
 140 ALU3(BFE)
 141 ALU2(BFI1)
 142 ALU3(BFI2)
 143 ALU1(FBH)
 144 ALU1(FBL)
 145 ALU1(CBIT)
 146
 147 /** Gen4 predicated IF. */
 148 vec4_instruction *
 149 vec4_visitor::IF(uint32_t predicate)
 150 {
 151    vec4_instruction *inst;
 152
 153    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
 154    inst->predicate = predicate;
 155
 156    return inst;
 157 }
 158
 159 /** Gen6+ IF with embedded comparison. */
 160 vec4_instruction *
 161 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
 162 {
 163    assert(intel->gen >= 6);
 164
 165    vec4_instruction *inst;
 166
 167    resolve_ud_negate(&src0);
 168    resolve_ud_negate(&src1);
 169
 170    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
 171                                         src0, src1);
 172    inst->conditional_mod = condition;
 173
 174    return inst;
 175 }
 176
 177 /**
 178  * CMP: Sets the low bit of the destination channels with the result
 179  * of the comparison, while the upper bits are undefined, and updates
 180  * the flag register with the packed 16 bits of the result.
 181  */
 182 vec4_instruction *
 183 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
 184 {
 185    vec4_instruction *inst;
 186
 187    /* original gen4 does type conversion to the destination type
 188     * before before comparison, producing garbage results for floating
 189     * point comparisons.
 190     */
 191    if (intel->gen == 4) {
 192       dst.type = src0.type;
 193       if (dst.file == HW_REG)
 194          dst.fixed_hw_reg.type = dst.type;
 195    }
 196
 197    resolve_ud_negate(&src0);
 198    resolve_ud_negate(&src1);
 199
 200    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
 201    inst->conditional_mod = condition;
 202
 203    return inst;
 204 }
 205
 206 vec4_instruction *
 207 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
 208 {
 209    vec4_instruction *inst;
 210
 211    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
 212                                         dst, index);
 213    inst->base_mrf = 14;
 214    inst->mlen = 2;
 215
 216    return inst;
 217 }
 218
 219 vec4_instruction *
 220 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
 221 {
 222    vec4_instruction *inst;
 223
 224    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
 225                                         dst, src, index);
 226    inst->base_mrf = 13;
 227    inst->mlen = 3;
 228
 229    return inst;
 230 }
 231
 232 void
 233 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 234 {
 235    static enum opcode dot_opcodes[] = {
 236       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 237    };
 238
 239    emit(dot_opcodes[elements - 2], dst, src0, src1);
 240 }
 241
 242 src_reg
 243 vec4_visitor::fix_3src_operand(src_reg src)
 244 {
 245    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 246     * able to use vertical stride of zero to replicate the vec4 uniform, like
 247     *
 248     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 249     *
 250     * But you can't, since vertical stride is always four in three-source
 251     * instructions. Instead, insert a MOV instruction to do the replication so
 252     * that the three-source instruction can consume it.
 253     */
 254
 255    /* The MOV is only needed if the source is a uniform or immediate. */
 256    if (src.file != UNIFORM && src.file != IMM)
 257       return src;
 258
 259    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 260    expanded.type = src.type;
 261    emit(MOV(expanded, src));
 262    return src_reg(expanded);
 263 }
 264
 265 src_reg
 266 vec4_visitor::fix_math_operand(src_reg src)
 267 {
 268    /* The gen6 math instruction ignores the source modifiers --
 269     * swizzle, abs, negate, and at least some parts of the register
 270     * region description.
 271     *
 272     * Rather than trying to enumerate all these cases, *always* expand the
 273     * operand to a temp GRF for gen6.
 274     *
 275     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 276     * can't use.
 277     */
 278
 279    if (intel->gen == 7 && src.file != IMM)
 280       return src;
 281
 282    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 283    expanded.type = src.type;
 284    emit(MOV(expanded, src));
 285    return src_reg(expanded);
 286 }
 287
 288 void
 289 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
 290 {
 291    src = fix_math_operand(src);
 292
 293    if (dst.writemask != WRITEMASK_XYZW) {
 294       /* The gen6 math instruction must be align1, so we can't do
 295        * writemasks.
 296        */
 297       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 298
 299       emit(opcode, temp_dst, src);
 300
 301       emit(MOV(dst, src_reg(temp_dst)));
 302    } else {
 303       emit(opcode, dst, src);
 304    }
 305 }
 306
 307 void
 308 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
 309 {
 310    vec4_instruction *inst = emit(opcode, dst, src);
 311    inst->base_mrf = 1;
 312    inst->mlen = 1;
 313 }
 314
 315 void
 316 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
 317 {
 318    switch (opcode) {
 319    case SHADER_OPCODE_RCP:
 320    case SHADER_OPCODE_RSQ:
 321    case SHADER_OPCODE_SQRT:
 322    case SHADER_OPCODE_EXP2:
 323    case SHADER_OPCODE_LOG2:
 324    case SHADER_OPCODE_SIN:
 325    case SHADER_OPCODE_COS:
 326       break;
 327    default:
 328       assert(!"not reached: bad math opcode");
 329       return;
 330    }
 331
 332    if (intel->gen >= 6) {
 333       return emit_math1_gen6(opcode, dst, src);
 334    } else {
 335       return emit_math1_gen4(opcode, dst, src);
 336    }
 337 }
 338
 339 void
 340 vec4_visitor::emit_math2_gen6(enum opcode opcode,
 341                               dst_reg dst, src_reg src0, src_reg src1)
 342 {
 343    src0 = fix_math_operand(src0);
 344    src1 = fix_math_operand(src1);
 345
 346    if (dst.writemask != WRITEMASK_XYZW) {
 347       /* The gen6 math instruction must be align1, so we can't do
 348        * writemasks.
 349        */
 350       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 351       temp_dst.type = dst.type;
 352
 353       emit(opcode, temp_dst, src0, src1);
 354
 355       emit(MOV(dst, src_reg(temp_dst)));
 356    } else {
 357       emit(opcode, dst, src0, src1);
 358    }
 359 }
 360
 361 void
 362 vec4_visitor::emit_math2_gen4(enum opcode opcode,
 363                               dst_reg dst, src_reg src0, src_reg src1)
 364 {
 365    vec4_instruction *inst = emit(opcode, dst, src0, src1);
 366    inst->base_mrf = 1;
 367    inst->mlen = 2;
 368 }
 369
 370 void
 371 vec4_visitor::emit_math(enum opcode opcode,
 372                         dst_reg dst, src_reg src0, src_reg src1)
 373 {
 374    switch (opcode) {
 375    case SHADER_OPCODE_POW:
 376    case SHADER_OPCODE_INT_QUOTIENT:
 377    case SHADER_OPCODE_INT_REMAINDER:
 378       break;
 379    default:
 380       assert(!"not reached: unsupported binary math opcode");
 381       return;
 382    }
 383
 384    if (intel->gen >= 6) {
 385       return emit_math2_gen6(opcode, dst, src0, src1);
 386    } else {
 387       return emit_math2_gen4(opcode, dst, src0, src1);
 388    }
 389 }
 390
 391 void
 392 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 393 {
 394    if (intel->gen < 7)
 395       assert(!"ir_unop_pack_half_2x16 should be lowered");
 396
 397    assert(dst.type == BRW_REGISTER_TYPE_UD);
 398    assert(src0.type == BRW_REGISTER_TYPE_F);
 399
 400    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 401     *
 402     *   Because this instruction does not have a 16-bit floating-point type,
 403     *   the destination data type must be Word (W).
 404     *
 405     *   The destination must be DWord-aligned and specify a horizontal stride
 406     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 407     *   each destination channel and the upper word is not modified.
 408     *
 409     * The above restriction implies that the f32to16 instruction must use
 410     * align1 mode, because only in align1 mode is it possible to specify
 411     * horizontal stride.  We choose here to defy the hardware docs and emit
 412     * align16 instructions.
 413     *
 414     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 415     * instructions. I was partially successful in that the code passed all
 416     * tests.  However, the code was dubiously correct and fragile, and the
 417     * tests were not harsh enough to probe that frailty. Not trusting the
 418     * code, I chose instead to remain in align16 mode in defiance of the hw
 419     * docs).
 420     *
 421     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 422     * simulator, emitting a f32to16 in align16 mode with UD as destination
 423     * data type is safe. The behavior differs from that specified in the PRM
 424     * in that the upper word of each destination channel is cleared to 0.
 425     */
 426
 427    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 428    src_reg tmp_src(tmp_dst);
 429
 430 #if 0
 431    /* Verify the undocumented behavior on which the following instructions
 432     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 433     * then the result of the bit-or instruction below will be incorrect.
 434     *
 435     * You should inspect the disasm output in order to verify that the MOV is
 436     * not optimized away.
 437     */
 438    emit(MOV(tmp_dst, src_reg(0x12345678u)));
 439 #endif
 440
 441    /* Give tmp the form below, where "." means untouched.
 442     *
 443     *     w z          y          x w z          y          x
 444     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 445     *
 446     * That the upper word of each write-channel be 0 is required for the
 447     * following bit-shift and bit-or instructions to work. Note that this
 448     * relies on the undocumented hardware behavior mentioned above.
 449     */
 450    tmp_dst.writemask = WRITEMASK_XY;
 451    emit(F32TO16(tmp_dst, src0));
 452
 453    /* Give the write-channels of dst the form:
 454     *   0xhhhh0000
 455     */
 456    tmp_src.swizzle = SWIZZLE_Y;
 457    emit(SHL(dst, tmp_src, src_reg(16u)));
 458
 459    /* Finally, give the write-channels of dst the form of packHalf2x16's
 460     * output:
 461     *   0xhhhhllll
 462     */
 463    tmp_src.swizzle = SWIZZLE_X;
 464    emit(OR(dst, src_reg(dst), tmp_src));
 465 }
 466
 467 void
 468 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 469 {
 470    if (intel->gen < 7)
 471       assert(!"ir_unop_unpack_half_2x16 should be lowered");
 472
 473    assert(dst.type == BRW_REGISTER_TYPE_F);
 474    assert(src0.type == BRW_REGISTER_TYPE_UD);
 475
 476    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 477     *
 478     *   Because this instruction does not have a 16-bit floating-point type,
 479     *   the source data type must be Word (W). The destination type must be
 480     *   F (Float).
 481     *
 482     * To use W as the source data type, we must adjust horizontal strides,
 483     * which is only possible in align1 mode. All my [chadv] attempts at
 484     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 485     * Piglit tests, so I gave up.
 486     *
 487     * I've verified that, on gen7 hardware and the simulator, it is safe to
 488     * emit f16to32 in align16 mode with UD as source data type.
 489     */
 490
 491    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 492    src_reg tmp_src(tmp_dst);
 493
 494    tmp_dst.writemask = WRITEMASK_X;
 495    emit(AND(tmp_dst, src0, src_reg(0xffffu)));
 496
 497    tmp_dst.writemask = WRITEMASK_Y;
 498    emit(SHR(tmp_dst, src0, src_reg(16u)));
 499
 500    dst.writemask = WRITEMASK_XY;
 501    emit(F16TO32(dst, tmp_src));
 502 }
 503
 504 void
 505 vec4_visitor::visit_instructions(const exec_list *list)
 506 {
 507    foreach_list(node, list) {
 508       ir_instruction *ir = (ir_instruction *)node;
 509
 510       base_ir = ir;
 511       ir->accept(this);
 512    }
 513 }
 514
 515
 516 static int
 517 type_size(const struct glsl_type *type)
 518 {
 519    unsigned int i;
 520    int size;
 521
 522    switch (type->base_type) {
 523    case GLSL_TYPE_UINT:
 524    case GLSL_TYPE_INT:
 525    case GLSL_TYPE_FLOAT:
 526    case GLSL_TYPE_BOOL:
 527       if (type->is_matrix()) {
 528          return type->matrix_columns;
 529       } else {
 530          /* Regardless of size of vector, it gets a vec4. This is bad
 531           * packing for things like floats, but otherwise arrays become a
 532           * mess.  Hopefully a later pass over the code can pack scalars
 533           * down if appropriate.
 534           */
 535          return 1;
 536       }
 537    case GLSL_TYPE_ARRAY:
 538       assert(type->length > 0);
 539       return type_size(type->fields.array) * type->length;
 540    case GLSL_TYPE_STRUCT:
 541       size = 0;
 542       for (i = 0; i < type->length; i++) {
 543          size += type_size(type->fields.structure[i].type);
 544       }
 545       return size;
 546    case GLSL_TYPE_SAMPLER:
 547       /* Samplers take up one slot in UNIFORMS[], but they're baked in
 548        * at link time.
 549        */
 550       return 1;
 551    case GLSL_TYPE_VOID:
 552    case GLSL_TYPE_ERROR:
 553    case GLSL_TYPE_INTERFACE:
 554       assert(0);
 555       break;
 556    }
 557
 558    return 0;
 559 }
 560
 561 int
 562 vec4_visitor::virtual_grf_alloc(int size)
 563 {
 564    if (virtual_grf_array_size <= virtual_grf_count) {
 565       if (virtual_grf_array_size == 0)
 566          virtual_grf_array_size = 16;
 567       else
 568          virtual_grf_array_size *= 2;
 569       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 570                                    virtual_grf_array_size);
 571       virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
 572                                      virtual_grf_array_size);
 573    }
 574    virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
 575    virtual_grf_reg_count += size;
 576    virtual_grf_sizes[virtual_grf_count] = size;
 577    return virtual_grf_count++;
 578 }
 579
 580 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 581 {
 582    init();
 583
 584    this->file = GRF;
 585    this->reg = v->virtual_grf_alloc(type_size(type));
 586
 587    if (type->is_array() || type->is_record()) {
 588       this->swizzle = BRW_SWIZZLE_NOOP;
 589    } else {
 590       this->swizzle = swizzle_for_size(type->vector_elements);
 591    }
 592
 593    this->type = brw_type_for_base_type(type);
 594 }
 595
 596 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 597 {
 598    init();
 599
 600    this->file = GRF;
 601    this->reg = v->virtual_grf_alloc(type_size(type));
 602
 603    if (type->is_array() || type->is_record()) {
 604       this->writemask = WRITEMASK_XYZW;
 605    } else {
 606       this->writemask = (1 << type->vector_elements) - 1;
 607    }
 608
 609    this->type = brw_type_for_base_type(type);
 610 }
 611
 612 /* Our support for uniforms is piggy-backed on the struct
 613  * gl_fragment_program, because that's where the values actually
 614  * get stored, rather than in some global gl_shader_program uniform
 615  * store.
 616  */
 617 void
 618 vec4_visitor::setup_uniform_values(ir_variable *ir)
 619 {
 620    int namelen = strlen(ir->name);
 621
 622    /* The data for our (non-builtin) uniforms is stored in a series of
 623     * gl_uniform_driver_storage structs for each subcomponent that
 624     * glGetUniformLocation() could name.  We know it's been set up in the same
 625     * order we'd walk the type, so walk the list of storage and find anything
 626     * with our name, or the prefix of a component that starts with our name.
 627     */
 628    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
 629       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 630
 631       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 632           (storage->name[namelen] != 0 &&
 633            storage->name[namelen] != '.' &&
 634            storage->name[namelen] != '[')) {
 635          continue;
 636       }
 637
 638       gl_constant_value *components = storage->storage;
 639       unsigned vector_count = (MAX2(storage->array_elements, 1) *
 640                                storage->type->matrix_columns);
 641
 642       for (unsigned s = 0; s < vector_count; s++) {
 643          uniform_vector_size[uniforms] = storage->type->vector_elements;
 644
 645          int i;
 646          for (i = 0; i < uniform_vector_size[uniforms]; i++) {
 647             prog_data->param[uniforms * 4 + i] = &components->f;
 648             components++;
 649          }
 650          for (; i < 4; i++) {
 651             static float zero = 0;
 652             prog_data->param[uniforms * 4 + i] = &zero;
 653          }
 654
 655          uniforms++;
 656       }
 657    }
 658 }
 659
 660 void
 661 vec4_visitor::setup_uniform_clipplane_values()
 662 {
 663    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
 664
 665    if (intel->gen < 6) {
 666       /* Pre-Gen6, we compact clip planes.  For example, if the user
 667        * enables just clip planes 0, 1, and 3, we will enable clip planes
 668        * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
 669        * plane 2.  This simplifies the implementation of the Gen6 clip
 670        * thread.
 671        */
 672       int compacted_clipplane_index = 0;
 673       for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
 674          if (!(key->userclip_planes_enabled_gen_4_5 & (1 << i)))
 675             continue;
 676
 677          this->uniform_vector_size[this->uniforms] = 4;
 678          this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
 679          this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
 680          for (int j = 0; j < 4; ++j) {
 681             prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
 682          }
 683          ++compacted_clipplane_index;
 684          ++this->uniforms;
 685       }
 686    } else {
 687       /* In Gen6 and later, we don't compact clip planes, because this
 688        * simplifies the implementation of gl_ClipDistance.
 689        */
 690       for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
 691          this->uniform_vector_size[this->uniforms] = 4;
 692          this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
 693          this->userplane[i].type = BRW_REGISTER_TYPE_F;
 694          for (int j = 0; j < 4; ++j) {
 695             prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
 696          }
 697          ++this->uniforms;
 698       }
 699    }
 700 }
 701
 702 /* Our support for builtin uniforms is even scarier than non-builtin.
 703  * It sits on top of the PROG_STATE_VAR parameters that are
 704  * automatically updated from GL context state.
 705  */
 706 void
 707 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
 708 {
 709    const ir_state_slot *const slots = ir->state_slots;
 710    assert(ir->state_slots != NULL);
 711
 712    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 713       /* This state reference has already been setup by ir_to_mesa,
 714        * but we'll get the same index back here.  We can reference
 715        * ParameterValues directly, since unlike brw_fs.cpp, we never
 716        * add new state references during compile.
 717        */
 718       int index = _mesa_add_state_reference(this->prog->Parameters,
 719                                             (gl_state_index *)slots[i].tokens);
 720       float *values = &this->prog->Parameters->ParameterValues[index][0].f;
 721
 722       this->uniform_vector_size[this->uniforms] = 0;
 723       /* Add each of the unique swizzled channels of the element.
 724        * This will end up matching the size of the glsl_type of this field.
 725        */
 726       int last_swiz = -1;
 727       for (unsigned int j = 0; j < 4; j++) {
 728          int swiz = GET_SWZ(slots[i].swizzle, j);
 729          last_swiz = swiz;
 730
 731          prog_data->param[this->uniforms * 4 + j] = &values[swiz];
 732          if (swiz <= last_swiz)
 733             this->uniform_vector_size[this->uniforms]++;
 734       }
 735       this->uniforms++;
 736    }
 737 }
 738
 739 dst_reg *
 740 vec4_visitor::variable_storage(ir_variable *var)
 741 {
 742    return (dst_reg *)hash_table_find(this->variable_ht, var);
 743 }
 744
 745 void
 746 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
 747 {
 748    ir_expression *expr = ir->as_expression();
 749
 750    *predicate = BRW_PREDICATE_NORMAL;
 751
 752    if (expr) {
 753       src_reg op[2];
 754       vec4_instruction *inst;
 755
 756       assert(expr->get_num_operands() <= 2);
 757       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 758          expr->operands[i]->accept(this);
 759          op[i] = this->result;
 760
 761          resolve_ud_negate(&op[i]);
 762       }
 763
 764       switch (expr->operation) {
 765       case ir_unop_logic_not:
 766          inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
 767          inst->conditional_mod = BRW_CONDITIONAL_Z;
 768          break;
 769
 770       case ir_binop_logic_xor:
 771          inst = emit(XOR(dst_null_d(), op[0], op[1]));
 772          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 773          break;
 774
 775       case ir_binop_logic_or:
 776          inst = emit(OR(dst_null_d(), op[0], op[1]));
 777          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 778          break;
 779
 780       case ir_binop_logic_and:
 781          inst = emit(AND(dst_null_d(), op[0], op[1]));
 782          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 783          break;
 784
 785       case ir_unop_f2b:
 786          if (intel->gen >= 6) {
 787             emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
 788          } else {
 789             inst = emit(MOV(dst_null_f(), op[0]));
 790             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 791          }
 792          break;
 793
 794       case ir_unop_i2b:
 795          if (intel->gen >= 6) {
 796             emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 797          } else {
 798             inst = emit(MOV(dst_null_d(), op[0]));
 799             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 800          }
 801          break;
 802
 803       case ir_binop_all_equal:
 804          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 805          *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
 806          break;
 807
 808       case ir_binop_any_nequal:
 809          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 810          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 811          break;
 812
 813       case ir_unop_any:
 814          inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 815          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 816          break;
 817
 818       case ir_binop_greater:
 819       case ir_binop_gequal:
 820       case ir_binop_less:
 821       case ir_binop_lequal:
 822       case ir_binop_equal:
 823       case ir_binop_nequal:
 824          emit(CMP(dst_null_d(), op[0], op[1],
 825                   brw_conditional_for_comparison(expr->operation)));
 826          break;
 827
 828       default:
 829          assert(!"not reached");
 830          break;
 831       }
 832       return;
 833    }
 834
 835    ir->accept(this);
 836
 837    resolve_ud_negate(&this->result);
 838
 839    if (intel->gen >= 6) {
 840       vec4_instruction *inst = emit(AND(dst_null_d(),
 841                                         this->result, src_reg(1)));
 842       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 843    } else {
 844       vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
 845       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 846    }
 847 }
 848
 849 /**
 850  * Emit a gen6 IF statement with the comparison folded into the IF
 851  * instruction.
 852  */
 853 void
 854 vec4_visitor::emit_if_gen6(ir_if *ir)
 855 {
 856    ir_expression *expr = ir->condition->as_expression();
 857
 858    if (expr) {
 859       src_reg op[2];
 860       dst_reg temp;
 861
 862       assert(expr->get_num_operands() <= 2);
 863       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 864          expr->operands[i]->accept(this);
 865          op[i] = this->result;
 866       }
 867
 868       switch (expr->operation) {
 869       case ir_unop_logic_not:
 870          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
 871          return;
 872
 873       case ir_binop_logic_xor:
 874          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
 875          return;
 876
 877       case ir_binop_logic_or:
 878          temp = dst_reg(this, glsl_type::bool_type);
 879          emit(OR(temp, op[0], op[1]));
 880          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 881          return;
 882
 883       case ir_binop_logic_and:
 884          temp = dst_reg(this, glsl_type::bool_type);
 885          emit(AND(temp, op[0], op[1]));
 886          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 887          return;
 888
 889       case ir_unop_f2b:
 890          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 891          return;
 892
 893       case ir_unop_i2b:
 894          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 895          return;
 896
 897       case ir_binop_greater:
 898       case ir_binop_gequal:
 899       case ir_binop_less:
 900       case ir_binop_lequal:
 901       case ir_binop_equal:
 902       case ir_binop_nequal:
 903          emit(IF(op[0], op[1],
 904                  brw_conditional_for_comparison(expr->operation)));
 905          return;
 906
 907       case ir_binop_all_equal:
 908          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 909          emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
 910          return;
 911
 912       case ir_binop_any_nequal:
 913          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 914          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 915          return;
 916
 917       case ir_unop_any:
 918          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 919          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 920          return;
 921
 922       default:
 923          assert(!"not reached");
 924          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 925          return;
 926       }
 927       return;
 928    }
 929
 930    ir->condition->accept(this);
 931
 932    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
 933 }
 934
 935 static dst_reg
 936 with_writemask(dst_reg const & r, int mask)
 937 {
 938    dst_reg result = r;
 939    result.writemask = mask;
 940    return result;
 941 }
 942
 943 void
 944 vec4_vs_visitor::emit_prolog()
 945 {
 946    dst_reg sign_recovery_shift;
 947    dst_reg normalize_factor;
 948    dst_reg es3_normalize_factor;
 949
 950    for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
 951       if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
 952          uint8_t wa_flags = vs_compile->key.gl_attrib_wa_flags[i];
 953          dst_reg reg(ATTR, i);
 954          dst_reg reg_d = reg;
 955          reg_d.type = BRW_REGISTER_TYPE_D;
 956          dst_reg reg_ud = reg;
 957          reg_ud.type = BRW_REGISTER_TYPE_UD;
 958
 959          /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
 960           * come in as floating point conversions of the integer values.
 961           */
 962          if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
 963             dst_reg dst = reg;
 964             dst.type = brw_type_for_base_type(glsl_type::vec4_type);
 965             dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
 966             emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
 967          }
 968
 969          /* Do sign recovery for 2101010 formats if required. */
 970          if (wa_flags & BRW_ATTRIB_WA_SIGN) {
 971             if (sign_recovery_shift.file == BAD_FILE) {
 972                /* shift constant: <22,22,22,30> */
 973                sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
 974                emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
 975                emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
 976             }
 977
 978             emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
 979             emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
 980          }
 981
 982          /* Apply BGRA swizzle if required. */
 983          if (wa_flags & BRW_ATTRIB_WA_BGRA) {
 984             src_reg temp = src_reg(reg);
 985             temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
 986             emit(MOV(reg, temp));
 987          }
 988
 989          if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
 990             /* ES 3.0 has different rules for converting signed normalized
 991              * fixed-point numbers than desktop GL.
 992              */
 993             if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
 994                /* According to equation 2.2 of the ES 3.0 specification,
 995                 * signed normalization conversion is done by:
 996                 *
 997                 * f = c / (2^(b-1)-1)
 998                 */
 999                if (es3_normalize_factor.file == BAD_FILE) {
1000                   /* mul constant: 1 / (2^(b-1) - 1) */
1001                   es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
1002                   emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
1003                            src_reg(1.0f / ((1<<9) - 1))));
1004                   emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
1005                            src_reg(1.0f / ((1<<1) - 1))));
1006                }
1007
1008                dst_reg dst = reg;
1009                dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1010                emit(MOV(dst, src_reg(reg_d)));
1011                emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
1012                emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
1013             } else {
1014                /* The following equations are from the OpenGL 3.2 specification:
1015                 *
1016                 * 2.1 unsigned normalization
1017                 * f = c/(2^n-1)
1018                 *
1019                 * 2.2 signed normalization
1020                 * f = (2c+1)/(2^n-1)
1021                 *
1022                 * Both of these share a common divisor, which is represented by
1023                 * "normalize_factor" in the code below.
1024                 */
1025                if (normalize_factor.file == BAD_FILE) {
1026                   /* 1 / (2^b - 1) for b=<10,10,10,2> */
1027                   normalize_factor = dst_reg(this, glsl_type::vec4_type);
1028                   emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
1029                            src_reg(1.0f / ((1<<10) - 1))));
1030                   emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
1031                            src_reg(1.0f / ((1<<2) - 1))));
1032                }
1033
1034                dst_reg dst = reg;
1035                dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1036                emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1037
1038                /* For signed normalization, we want the numerator to be 2c+1. */
1039                if (wa_flags & BRW_ATTRIB_WA_SIGN) {
1040                   emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
1041                   emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
1042                }
1043
1044                emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
1045             }
1046          }
1047
1048          if (wa_flags & BRW_ATTRIB_WA_SCALE) {
1049             dst_reg dst = reg;
1050             dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1051             emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1052          }
1053       }
1054    }
1055 }
1056
1057
1058 dst_reg *
1059 vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir)
1060 {
1061    /* VertexID is stored by the VF as the last vertex element, but
1062     * we don't represent it with a flag in inputs_read, so we call
1063     * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
1064     */
1065    dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
1066    vs_prog_data->uses_vertexid = true;
1067
1068    switch (ir->location) {
1069    case SYSTEM_VALUE_VERTEX_ID:
1070       reg->writemask = WRITEMASK_X;
1071       break;
1072    case SYSTEM_VALUE_INSTANCE_ID:
1073       reg->writemask = WRITEMASK_Y;
1074       break;
1075    default:
1076       assert(!"not reached");
1077       break;
1078    }
1079
1080    return reg;
1081 }
1082
1083
1084 void
1085 vec4_visitor::visit(ir_variable *ir)
1086 {
1087    dst_reg *reg = NULL;
1088
1089    if (variable_storage(ir))
1090       return;
1091
1092    switch (ir->mode) {
1093    case ir_var_shader_in:
1094       reg = new(mem_ctx) dst_reg(ATTR, ir->location);
1095       break;
1096
1097    case ir_var_shader_out:
1098       reg = new(mem_ctx) dst_reg(this, ir->type);
1099
1100       for (int i = 0; i < type_size(ir->type); i++) {
1101          output_reg[ir->location + i] = *reg;
1102          output_reg[ir->location + i].reg_offset = i;
1103          output_reg[ir->location + i].type =
1104             brw_type_for_base_type(ir->type->get_scalar_type());
1105          output_reg_annotation[ir->location + i] = ir->name;
1106       }
1107       break;
1108
1109    case ir_var_auto:
1110    case ir_var_temporary:
1111       reg = new(mem_ctx) dst_reg(this, ir->type);
1112       break;
1113
1114    case ir_var_uniform:
1115       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1116
1117       /* Thanks to the lower_ubo_reference pass, we will see only
1118        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1119        * variables, so no need for them to be in variable_ht.
1120        */
1121       if (ir->is_in_uniform_block())
1122          return;
1123
1124       /* Track how big the whole uniform variable is, in case we need to put a
1125        * copy of its data into pull constants for array access.
1126        */
1127       this->uniform_size[this->uniforms] = type_size(ir->type);
1128
1129       if (!strncmp(ir->name, "gl_", 3)) {
1130          setup_builtin_uniform_values(ir);
1131       } else {
1132          setup_uniform_values(ir);
1133       }
1134       break;
1135
1136    case ir_var_system_value:
1137       reg = make_reg_for_system_value(ir);
1138       break;
1139
1140    default:
1141       assert(!"not reached");
1142    }
1143
1144    reg->type = brw_type_for_base_type(ir->type);
1145    hash_table_insert(this->variable_ht, reg, ir);
1146 }
1147
1148 void
1149 vec4_visitor::visit(ir_loop *ir)
1150 {
1151    dst_reg counter;
1152
1153    /* We don't want debugging output to print the whole body of the
1154     * loop as the annotation.
1155     */
1156    this->base_ir = NULL;
1157
1158    if (ir->counter != NULL) {
1159       this->base_ir = ir->counter;
1160       ir->counter->accept(this);
1161       counter = *(variable_storage(ir->counter));
1162
1163       if (ir->from != NULL) {
1164          this->base_ir = ir->from;
1165          ir->from->accept(this);
1166
1167          emit(MOV(counter, this->result));
1168       }
1169    }
1170
1171    emit(BRW_OPCODE_DO);
1172
1173    if (ir->to) {
1174       this->base_ir = ir->to;
1175       ir->to->accept(this);
1176
1177       emit(CMP(dst_null_d(), src_reg(counter), this->result,
1178                brw_conditional_for_comparison(ir->cmp)));
1179
1180       vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1181       inst->predicate = BRW_PREDICATE_NORMAL;
1182    }
1183
1184    visit_instructions(&ir->body_instructions);
1185
1186
1187    if (ir->increment) {
1188       this->base_ir = ir->increment;
1189       ir->increment->accept(this);
1190       emit(ADD(counter, src_reg(counter), this->result));
1191    }
1192
1193    emit(BRW_OPCODE_WHILE);
1194 }
1195
1196 void
1197 vec4_visitor::visit(ir_loop_jump *ir)
1198 {
1199    switch (ir->mode) {
1200    case ir_loop_jump::jump_break:
1201       emit(BRW_OPCODE_BREAK);
1202       break;
1203    case ir_loop_jump::jump_continue:
1204       emit(BRW_OPCODE_CONTINUE);
1205       break;
1206    }
1207 }
1208
1209
1210 void
1211 vec4_visitor::visit(ir_function_signature *ir)
1212 {
1213    assert(0);
1214    (void)ir;
1215 }
1216
1217 void
1218 vec4_visitor::visit(ir_function *ir)
1219 {
1220    /* Ignore function bodies other than main() -- we shouldn't see calls to
1221     * them since they should all be inlined.
1222     */
1223    if (strcmp(ir->name, "main") == 0) {
1224       const ir_function_signature *sig;
1225       exec_list empty;
1226
1227       sig = ir->matching_signature(&empty);
1228
1229       assert(sig);
1230
1231       visit_instructions(&sig->body);
1232    }
1233 }
1234
1235 bool
1236 vec4_visitor::try_emit_sat(ir_expression *ir)
1237 {
1238    ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1239    if (!sat_src)
1240       return false;
1241
1242    sat_src->accept(this);
1243    src_reg src = this->result;
1244
1245    this->result = src_reg(this, ir->type);
1246    vec4_instruction *inst;
1247    inst = emit(MOV(dst_reg(this->result), src));
1248    inst->saturate = true;
1249
1250    return true;
1251 }
1252
1253 void
1254 vec4_visitor::emit_bool_comparison(unsigned int op,
1255                                  dst_reg dst, src_reg src0, src_reg src1)
1256 {
1257    /* original gen4 does destination conversion before comparison. */
1258    if (intel->gen < 5)
1259       dst.type = src0.type;
1260
1261    emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1262
1263    dst.type = BRW_REGISTER_TYPE_D;
1264    emit(AND(dst, src_reg(dst), src_reg(0x1)));
1265 }
1266
1267 void
1268 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1269                           src_reg src0, src_reg src1)
1270 {
1271    vec4_instruction *inst;
1272
1273    if (intel->gen >= 6) {
1274       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1275       inst->conditional_mod = conditionalmod;
1276    } else {
1277       emit(CMP(dst, src0, src1, conditionalmod));
1278
1279       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1280       inst->predicate = BRW_PREDICATE_NORMAL;
1281    }
1282 }
1283
1284 void
1285 vec4_visitor::visit(ir_expression *ir)
1286 {
1287    unsigned int operand;
1288    src_reg op[Elements(ir->operands)];
1289    src_reg result_src;
1290    dst_reg result_dst;
1291    vec4_instruction *inst;
1292
1293    if (try_emit_sat(ir))
1294       return;
1295
1296    for (operand = 0; operand < ir->get_num_operands(); operand++) {
1297       this->result.file = BAD_FILE;
1298       ir->operands[operand]->accept(this);
1299       if (this->result.file == BAD_FILE) {
1300          printf("Failed to get tree for expression operand:\n");
1301          ir->operands[operand]->print();
1302          exit(1);
1303       }
1304       op[operand] = this->result;
1305
1306       /* Matrix expression operands should have been broken down to vector
1307        * operations already.
1308        */
1309       assert(!ir->operands[operand]->type->is_matrix());
1310    }
1311
1312    int vector_elements = ir->operands[0]->type->vector_elements;
1313    if (ir->operands[1]) {
1314       vector_elements = MAX2(vector_elements,
1315                              ir->operands[1]->type->vector_elements);
1316    }
1317
1318    this->result.file = BAD_FILE;
1319
1320    /* Storage for our result.  Ideally for an assignment we'd be using
1321     * the actual storage for the result here, instead.
1322     */
1323    result_src = src_reg(this, ir->type);
1324    /* convenience for the emit functions below. */
1325    result_dst = dst_reg(result_src);
1326    /* If nothing special happens, this is the result. */
1327    this->result = result_src;
1328    /* Limit writes to the channels that will be used by result_src later.
1329     * This does limit this temp's use as a temporary for multi-instruction
1330     * sequences.
1331     */
1332    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1333
1334    switch (ir->operation) {
1335    case ir_unop_logic_not:
1336       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1337        * ones complement of the whole register, not just bit 0.
1338        */
1339       emit(XOR(result_dst, op[0], src_reg(1)));
1340       break;
1341    case ir_unop_neg:
1342       op[0].negate = !op[0].negate;
1343       this->result = op[0];
1344       break;
1345    case ir_unop_abs:
1346       op[0].abs = true;
1347       op[0].negate = false;
1348       this->result = op[0];
1349       break;
1350
1351    case ir_unop_sign:
1352       emit(MOV(result_dst, src_reg(0.0f)));
1353
1354       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1355       inst = emit(MOV(result_dst, src_reg(1.0f)));
1356       inst->predicate = BRW_PREDICATE_NORMAL;
1357
1358       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1359       inst = emit(MOV(result_dst, src_reg(-1.0f)));
1360       inst->predicate = BRW_PREDICATE_NORMAL;
1361
1362       break;
1363
1364    case ir_unop_rcp:
1365       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1366       break;
1367
1368    case ir_unop_exp2:
1369       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1370       break;
1371    case ir_unop_log2:
1372       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1373       break;
1374    case ir_unop_exp:
1375    case ir_unop_log:
1376       assert(!"not reached: should be handled by ir_explog_to_explog2");
1377       break;
1378    case ir_unop_sin:
1379    case ir_unop_sin_reduced:
1380       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1381       break;
1382    case ir_unop_cos:
1383    case ir_unop_cos_reduced:
1384       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1385       break;
1386
1387    case ir_unop_dFdx:
1388    case ir_unop_dFdy:
1389       assert(!"derivatives not valid in vertex shader");
1390       break;
1391
1392    case ir_unop_bitfield_reverse:
1393       emit(BFREV(result_dst, op[0]));
1394       break;
1395    case ir_unop_bit_count:
1396       emit(CBIT(result_dst, op[0]));
1397       break;
1398    case ir_unop_find_msb: {
1399       src_reg temp = src_reg(this, glsl_type::uint_type);
1400
1401       inst = emit(FBH(dst_reg(temp), op[0]));
1402       inst->dst.writemask = WRITEMASK_XYZW;
1403
1404       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1405        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1406        * subtract the result from 31 to convert the MSB count into an LSB count.
1407        */
1408
1409       /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1410       temp.swizzle = BRW_SWIZZLE_NOOP;
1411       emit(MOV(result_dst, temp));
1412
1413       src_reg src_tmp = src_reg(result_dst);
1414       emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1415
1416       src_tmp.negate = true;
1417       inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1418       inst->predicate = BRW_PREDICATE_NORMAL;
1419       break;
1420    }
1421    case ir_unop_find_lsb:
1422       emit(FBL(result_dst, op[0]));
1423       break;
1424
1425    case ir_unop_noise:
1426       assert(!"not reached: should be handled by lower_noise");
1427       break;
1428
1429    case ir_binop_add:
1430       emit(ADD(result_dst, op[0], op[1]));
1431       break;
1432    case ir_binop_sub:
1433       assert(!"not reached: should be handled by ir_sub_to_add_neg");
1434       break;
1435
1436    case ir_binop_mul:
1437       if (ir->type->is_integer()) {
1438          /* For integer multiplication, the MUL uses the low 16 bits
1439           * of one of the operands (src0 on gen6, src1 on gen7).  The
1440           * MACH accumulates in the contribution of the upper 16 bits
1441           * of that operand.
1442           *
1443           * FINISHME: Emit just the MUL if we know an operand is small
1444           * enough.
1445           */
1446          struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1447
1448          emit(MUL(acc, op[0], op[1]));
1449          emit(MACH(dst_null_d(), op[0], op[1]));
1450          emit(MOV(result_dst, src_reg(acc)));
1451       } else {
1452          emit(MUL(result_dst, op[0], op[1]));
1453       }
1454       break;
1455    case ir_binop_div:
1456       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1457       assert(ir->type->is_integer());
1458       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1459       break;
1460    case ir_binop_mod:
1461       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1462       assert(ir->type->is_integer());
1463       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1464       break;
1465
1466    case ir_binop_less:
1467    case ir_binop_greater:
1468    case ir_binop_lequal:
1469    case ir_binop_gequal:
1470    case ir_binop_equal:
1471    case ir_binop_nequal: {
1472       emit(CMP(result_dst, op[0], op[1],
1473                brw_conditional_for_comparison(ir->operation)));
1474       emit(AND(result_dst, result_src, src_reg(0x1)));
1475       break;
1476    }
1477
1478    case ir_binop_all_equal:
1479       /* "==" operator producing a scalar boolean. */
1480       if (ir->operands[0]->type->is_vector() ||
1481           ir->operands[1]->type->is_vector()) {
1482          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1483          emit(MOV(result_dst, src_reg(0)));
1484          inst = emit(MOV(result_dst, src_reg(1)));
1485          inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1486       } else {
1487          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1488          emit(AND(result_dst, result_src, src_reg(0x1)));
1489       }
1490       break;
1491    case ir_binop_any_nequal:
1492       /* "!=" operator producing a scalar boolean. */
1493       if (ir->operands[0]->type->is_vector() ||
1494           ir->operands[1]->type->is_vector()) {
1495          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1496
1497          emit(MOV(result_dst, src_reg(0)));
1498          inst = emit(MOV(result_dst, src_reg(1)));
1499          inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1500       } else {
1501          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1502          emit(AND(result_dst, result_src, src_reg(0x1)));
1503       }
1504       break;
1505
1506    case ir_unop_any:
1507       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1508       emit(MOV(result_dst, src_reg(0)));
1509
1510       inst = emit(MOV(result_dst, src_reg(1)));
1511       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1512       break;
1513
1514    case ir_binop_logic_xor:
1515       emit(XOR(result_dst, op[0], op[1]));
1516       break;
1517
1518    case ir_binop_logic_or:
1519       emit(OR(result_dst, op[0], op[1]));
1520       break;
1521
1522    case ir_binop_logic_and:
1523       emit(AND(result_dst, op[0], op[1]));
1524       break;
1525
1526    case ir_binop_dot:
1527       assert(ir->operands[0]->type->is_vector());
1528       assert(ir->operands[0]->type == ir->operands[1]->type);
1529       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1530       break;
1531
1532    case ir_unop_sqrt:
1533       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1534       break;
1535    case ir_unop_rsq:
1536       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1537       break;
1538
1539    case ir_unop_bitcast_i2f:
1540    case ir_unop_bitcast_u2f:
1541       this->result = op[0];
1542       this->result.type = BRW_REGISTER_TYPE_F;
1543       break;
1544
1545    case ir_unop_bitcast_f2i:
1546       this->result = op[0];
1547       this->result.type = BRW_REGISTER_TYPE_D;
1548       break;
1549
1550    case ir_unop_bitcast_f2u:
1551       this->result = op[0];
1552       this->result.type = BRW_REGISTER_TYPE_UD;
1553       break;
1554
1555    case ir_unop_i2f:
1556    case ir_unop_i2u:
1557    case ir_unop_u2i:
1558    case ir_unop_u2f:
1559    case ir_unop_b2f:
1560    case ir_unop_b2i:
1561    case ir_unop_f2i:
1562    case ir_unop_f2u:
1563       emit(MOV(result_dst, op[0]));
1564       break;
1565    case ir_unop_f2b:
1566    case ir_unop_i2b: {
1567       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1568       emit(AND(result_dst, result_src, src_reg(1)));
1569       break;
1570    }
1571
1572    case ir_unop_trunc:
1573       emit(RNDZ(result_dst, op[0]));
1574       break;
1575    case ir_unop_ceil:
1576       op[0].negate = !op[0].negate;
1577       inst = emit(RNDD(result_dst, op[0]));
1578       this->result.negate = true;
1579       break;
1580    case ir_unop_floor:
1581       inst = emit(RNDD(result_dst, op[0]));
1582       break;
1583    case ir_unop_fract:
1584       inst = emit(FRC(result_dst, op[0]));
1585       break;
1586    case ir_unop_round_even:
1587       emit(RNDE(result_dst, op[0]));
1588       break;
1589
1590    case ir_binop_min:
1591       emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1592       break;
1593    case ir_binop_max:
1594       emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1595       break;
1596
1597    case ir_binop_pow:
1598       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1599       break;
1600
1601    case ir_unop_bit_not:
1602       inst = emit(NOT(result_dst, op[0]));
1603       break;
1604    case ir_binop_bit_and:
1605       inst = emit(AND(result_dst, op[0], op[1]));
1606       break;
1607    case ir_binop_bit_xor:
1608       inst = emit(XOR(result_dst, op[0], op[1]));
1609       break;
1610    case ir_binop_bit_or:
1611       inst = emit(OR(result_dst, op[0], op[1]));
1612       break;
1613
1614    case ir_binop_lshift:
1615       inst = emit(SHL(result_dst, op[0], op[1]));
1616       break;
1617
1618    case ir_binop_rshift:
1619       if (ir->type->base_type == GLSL_TYPE_INT)
1620          inst = emit(ASR(result_dst, op[0], op[1]));
1621       else
1622          inst = emit(SHR(result_dst, op[0], op[1]));
1623       break;
1624
1625    case ir_binop_bfm:
1626       emit(BFI1(result_dst, op[0], op[1]));
1627       break;
1628
1629    case ir_binop_ubo_load: {
1630       ir_constant *uniform_block = ir->operands[0]->as_constant();
1631       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1632       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1633       src_reg offset = op[1];
1634
1635       /* Now, load the vector from that offset. */
1636       assert(ir->type->is_vector() || ir->type->is_scalar());
1637
1638       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1639       packed_consts.type = result.type;
1640       src_reg surf_index =
1641          src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1642       if (const_offset_ir) {
1643          offset = src_reg(const_offset / 16);
1644       } else {
1645          emit(SHR(dst_reg(offset), offset, src_reg(4)));
1646       }
1647
1648       vec4_instruction *pull =
1649          emit(new(mem_ctx) vec4_instruction(this,
1650                                             VS_OPCODE_PULL_CONSTANT_LOAD,
1651                                             dst_reg(packed_consts),
1652                                             surf_index,
1653                                             offset));
1654       pull->base_mrf = 14;
1655       pull->mlen = 1;
1656
1657       packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1658       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1659                                             const_offset % 16 / 4,
1660                                             const_offset % 16 / 4,
1661                                             const_offset % 16 / 4);
1662
1663       /* UBO bools are any nonzero int.  We store bools as either 0 or 1. */
1664       if (ir->type->base_type == GLSL_TYPE_BOOL) {
1665          emit(CMP(result_dst, packed_consts, src_reg(0u),
1666                   BRW_CONDITIONAL_NZ));
1667          emit(AND(result_dst, result, src_reg(0x1)));
1668       } else {
1669          emit(MOV(result_dst, packed_consts));
1670       }
1671       break;
1672    }
1673
1674    case ir_triop_lrp:
1675       op[0] = fix_3src_operand(op[0]);
1676       op[1] = fix_3src_operand(op[1]);
1677       op[2] = fix_3src_operand(op[2]);
1678       /* Note that the instruction's argument order is reversed from GLSL
1679        * and the IR.
1680        */
1681       emit(LRP(result_dst, op[2], op[1], op[0]));
1682       break;
1683
1684    case ir_triop_bfi:
1685       op[0] = fix_3src_operand(op[0]);
1686       op[1] = fix_3src_operand(op[1]);
1687       op[2] = fix_3src_operand(op[2]);
1688       emit(BFI2(result_dst, op[0], op[1], op[2]));
1689       break;
1690
1691    case ir_triop_bitfield_extract:
1692       op[0] = fix_3src_operand(op[0]);
1693       op[1] = fix_3src_operand(op[1]);
1694       op[2] = fix_3src_operand(op[2]);
1695       /* Note that the instruction's argument order is reversed from GLSL
1696        * and the IR.
1697        */
1698       emit(BFE(result_dst, op[2], op[1], op[0]));
1699       break;
1700
1701    case ir_quadop_bitfield_insert:
1702       assert(!"not reached: should be handled by "
1703               "bitfield_insert_to_bfm_bfi\n");
1704       break;
1705
1706    case ir_quadop_vector:
1707       assert(!"not reached: should be handled by lower_quadop_vector");
1708       break;
1709
1710    case ir_unop_pack_half_2x16:
1711       emit_pack_half_2x16(result_dst, op[0]);
1712       break;
1713    case ir_unop_unpack_half_2x16:
1714       emit_unpack_half_2x16(result_dst, op[0]);
1715       break;
1716    case ir_unop_pack_snorm_2x16:
1717    case ir_unop_pack_snorm_4x8:
1718    case ir_unop_pack_unorm_2x16:
1719    case ir_unop_pack_unorm_4x8:
1720    case ir_unop_unpack_snorm_2x16:
1721    case ir_unop_unpack_snorm_4x8:
1722    case ir_unop_unpack_unorm_2x16:
1723    case ir_unop_unpack_unorm_4x8:
1724       assert(!"not reached: should be handled by lower_packing_builtins");
1725       break;
1726    case ir_unop_unpack_half_2x16_split_x:
1727    case ir_unop_unpack_half_2x16_split_y:
1728    case ir_binop_pack_half_2x16_split:
1729       assert(!"not reached: should not occur in vertex shader");
1730       break;
1731    }
1732 }
1733
1734
1735 void
1736 vec4_visitor::visit(ir_swizzle *ir)
1737 {
1738    src_reg src;
1739    int i = 0;
1740    int swizzle[4];
1741
1742    /* Note that this is only swizzles in expressions, not those on the left
1743     * hand side of an assignment, which do write masking.  See ir_assignment
1744     * for that.
1745     */
1746
1747    ir->val->accept(this);
1748    src = this->result;
1749    assert(src.file != BAD_FILE);
1750
1751    for (i = 0; i < ir->type->vector_elements; i++) {
1752       switch (i) {
1753       case 0:
1754          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1755          break;
1756       case 1:
1757          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1758          break;
1759       case 2:
1760          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1761          break;
1762       case 3:
1763          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1764             break;
1765       }
1766    }
1767    for (; i < 4; i++) {
1768       /* Replicate the last channel out. */
1769       swizzle[i] = swizzle[ir->type->vector_elements - 1];
1770    }
1771
1772    src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1773
1774    this->result = src;
1775 }
1776
1777 void
1778 vec4_visitor::visit(ir_dereference_variable *ir)
1779 {
1780    const struct glsl_type *type = ir->type;
1781    dst_reg *reg = variable_storage(ir->var);
1782
1783    if (!reg) {
1784       fail("Failed to find variable storage for %s\n", ir->var->name);
1785       this->result = src_reg(brw_null_reg());
1786       return;
1787    }
1788
1789    this->result = src_reg(*reg);
1790
1791    /* System values get their swizzle from the dst_reg writemask */
1792    if (ir->var->mode == ir_var_system_value)
1793       return;
1794
1795    if (type->is_scalar() || type->is_vector() || type->is_matrix())
1796       this->result.swizzle = swizzle_for_size(type->vector_elements);
1797 }
1798
1799
1800 int
1801 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1802 {
1803    /* Under normal circumstances array elements are stored consecutively, so
1804     * the stride is equal to the size of the array element.
1805     */
1806    return type_size(ir->type);
1807 }
1808
1809
1810 void
1811 vec4_visitor::visit(ir_dereference_array *ir)
1812 {
1813    ir_constant *constant_index;
1814    src_reg src;
1815    int array_stride = compute_array_stride(ir);
1816
1817    constant_index = ir->array_index->constant_expression_value();
1818
1819    ir->array->accept(this);
1820    src = this->result;
1821
1822    if (constant_index) {
1823       src.reg_offset += constant_index->value.i[0] * array_stride;
1824    } else {
1825       /* Variable index array dereference.  It eats the "vec4" of the
1826        * base of the array and an index that offsets the Mesa register
1827        * index.
1828        */
1829       ir->array_index->accept(this);
1830
1831       src_reg index_reg;
1832
1833       if (array_stride == 1) {
1834          index_reg = this->result;
1835       } else {
1836          index_reg = src_reg(this, glsl_type::int_type);
1837
1838          emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1839       }
1840
1841       if (src.reladdr) {
1842          src_reg temp = src_reg(this, glsl_type::int_type);
1843
1844          emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1845
1846          index_reg = temp;
1847       }
1848
1849       src.reladdr = ralloc(mem_ctx, src_reg);
1850       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1851    }
1852
1853    /* If the type is smaller than a vec4, replicate the last channel out. */
1854    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1855       src.swizzle = swizzle_for_size(ir->type->vector_elements);
1856    else
1857       src.swizzle = BRW_SWIZZLE_NOOP;
1858    src.type = brw_type_for_base_type(ir->type);
1859
1860    this->result = src;
1861 }
1862
1863 void
1864 vec4_visitor::visit(ir_dereference_record *ir)
1865 {
1866    unsigned int i;
1867    const glsl_type *struct_type = ir->record->type;
1868    int offset = 0;
1869
1870    ir->record->accept(this);
1871
1872    for (i = 0; i < struct_type->length; i++) {
1873       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1874          break;
1875       offset += type_size(struct_type->fields.structure[i].type);
1876    }
1877
1878    /* If the type is smaller than a vec4, replicate the last channel out. */
1879    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1880       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1881    else
1882       this->result.swizzle = BRW_SWIZZLE_NOOP;
1883    this->result.type = brw_type_for_base_type(ir->type);
1884
1885    this->result.reg_offset += offset;
1886 }
1887
1888 /**
1889  * We want to be careful in assignment setup to hit the actual storage
1890  * instead of potentially using a temporary like we might with the
1891  * ir_dereference handler.
1892  */
1893 static dst_reg
1894 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1895 {
1896    /* The LHS must be a dereference.  If the LHS is a variable indexed array
1897     * access of a vector, it must be separated into a series conditional moves
1898     * before reaching this point (see ir_vec_index_to_cond_assign).
1899     */
1900    assert(ir->as_dereference());
1901    ir_dereference_array *deref_array = ir->as_dereference_array();
1902    if (deref_array) {
1903       assert(!deref_array->array->type->is_vector());
1904    }
1905
1906    /* Use the rvalue deref handler for the most part.  We'll ignore
1907     * swizzles in it and write swizzles using writemask, though.
1908     */
1909    ir->accept(v);
1910    return dst_reg(v->result);
1911 }
1912
1913 void
1914 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1915                               const struct glsl_type *type, uint32_t predicate)
1916 {
1917    if (type->base_type == GLSL_TYPE_STRUCT) {
1918       for (unsigned int i = 0; i < type->length; i++) {
1919          emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1920       }
1921       return;
1922    }
1923
1924    if (type->is_array()) {
1925       for (unsigned int i = 0; i < type->length; i++) {
1926          emit_block_move(dst, src, type->fields.array, predicate);
1927       }
1928       return;
1929    }
1930
1931    if (type->is_matrix()) {
1932       const struct glsl_type *vec_type;
1933
1934       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1935                                          type->vector_elements, 1);
1936
1937       for (int i = 0; i < type->matrix_columns; i++) {
1938          emit_block_move(dst, src, vec_type, predicate);
1939       }
1940       return;
1941    }
1942
1943    assert(type->is_scalar() || type->is_vector());
1944
1945    dst->type = brw_type_for_base_type(type);
1946    src->type = dst->type;
1947
1948    dst->writemask = (1 << type->vector_elements) - 1;
1949
1950    src->swizzle = swizzle_for_size(type->vector_elements);
1951
1952    vec4_instruction *inst = emit(MOV(*dst, *src));
1953    inst->predicate = predicate;
1954
1955    dst->reg_offset++;
1956    src->reg_offset++;
1957 }
1958
1959
1960 /* If the RHS processing resulted in an instruction generating a
1961  * temporary value, and it would be easy to rewrite the instruction to
1962  * generate its result right into the LHS instead, do so.  This ends
1963  * up reliably removing instructions where it can be tricky to do so
1964  * later without real UD chain information.
1965  */
1966 bool
1967 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1968                                      dst_reg dst,
1969                                      src_reg src,
1970                                      vec4_instruction *pre_rhs_inst,
1971                                      vec4_instruction *last_rhs_inst)
1972 {
1973    /* This could be supported, but it would take more smarts. */
1974    if (ir->condition)
1975       return false;
1976
1977    if (pre_rhs_inst == last_rhs_inst)
1978       return false; /* No instructions generated to work with. */
1979
1980    /* Make sure the last instruction generated our source reg. */
1981    if (src.file != GRF ||
1982        src.file != last_rhs_inst->dst.file ||
1983        src.reg != last_rhs_inst->dst.reg ||
1984        src.reg_offset != last_rhs_inst->dst.reg_offset ||
1985        src.reladdr ||
1986        src.abs ||
1987        src.negate ||
1988        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1989       return false;
1990
1991    /* Check that that last instruction fully initialized the channels
1992     * we want to use, in the order we want to use them.  We could
1993     * potentially reswizzle the operands of many instructions so that
1994     * we could handle out of order channels, but don't yet.
1995     */
1996
1997    for (unsigned i = 0; i < 4; i++) {
1998       if (dst.writemask & (1 << i)) {
1999          if (!(last_rhs_inst->dst.writemask & (1 << i)))
2000             return false;
2001
2002          if (BRW_GET_SWZ(src.swizzle, i) != i)
2003             return false;
2004       }
2005    }
2006
2007    /* Success!  Rewrite the instruction. */
2008    last_rhs_inst->dst.file = dst.file;
2009    last_rhs_inst->dst.reg = dst.reg;
2010    last_rhs_inst->dst.reg_offset = dst.reg_offset;
2011    last_rhs_inst->dst.reladdr = dst.reladdr;
2012    last_rhs_inst->dst.writemask &= dst.writemask;
2013
2014    return true;
2015 }
2016
2017 void
2018 vec4_visitor::visit(ir_assignment *ir)
2019 {
2020    dst_reg dst = get_assignment_lhs(ir->lhs, this);
2021    uint32_t predicate = BRW_PREDICATE_NONE;
2022
2023    if (!ir->lhs->type->is_scalar() &&
2024        !ir->lhs->type->is_vector()) {
2025       ir->rhs->accept(this);
2026       src_reg src = this->result;
2027
2028       if (ir->condition) {
2029          emit_bool_to_cond_code(ir->condition, &predicate);
2030       }
2031
2032       /* emit_block_move doesn't account for swizzles in the source register.
2033        * This should be ok, since the source register is a structure or an
2034        * array, and those can't be swizzled.  But double-check to be sure.
2035        */
2036       assert(src.swizzle ==
2037              (ir->rhs->type->is_matrix()
2038               ? swizzle_for_size(ir->rhs->type->vector_elements)
2039               : BRW_SWIZZLE_NOOP));
2040
2041       emit_block_move(&dst, &src, ir->rhs->type, predicate);
2042       return;
2043    }
2044
2045    /* Now we're down to just a scalar/vector with writemasks. */
2046    int i;
2047
2048    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2049    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2050
2051    ir->rhs->accept(this);
2052
2053    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2054
2055    src_reg src = this->result;
2056
2057    int swizzles[4];
2058    int first_enabled_chan = 0;
2059    int src_chan = 0;
2060
2061    assert(ir->lhs->type->is_vector() ||
2062           ir->lhs->type->is_scalar());
2063    dst.writemask = ir->write_mask;
2064
2065    for (int i = 0; i < 4; i++) {
2066       if (dst.writemask & (1 << i)) {
2067          first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2068          break;
2069       }
2070    }
2071
2072    /* Swizzle a small RHS vector into the channels being written.
2073     *
2074     * glsl ir treats write_mask as dictating how many channels are
2075     * present on the RHS while in our instructions we need to make
2076     * those channels appear in the slots of the vec4 they're written to.
2077     */
2078    for (int i = 0; i < 4; i++) {
2079       if (dst.writemask & (1 << i))
2080          swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2081       else
2082          swizzles[i] = first_enabled_chan;
2083    }
2084    src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2085                               swizzles[2], swizzles[3]);
2086
2087    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2088       return;
2089    }
2090
2091    if (ir->condition) {
2092       emit_bool_to_cond_code(ir->condition, &predicate);
2093    }
2094
2095    for (i = 0; i < type_size(ir->lhs->type); i++) {
2096       vec4_instruction *inst = emit(MOV(dst, src));
2097       inst->predicate = predicate;
2098
2099       dst.reg_offset++;
2100       src.reg_offset++;
2101    }
2102 }
2103
2104 void
2105 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2106 {
2107    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2108       foreach_list(node, &ir->components) {
2109          ir_constant *field_value = (ir_constant *)node;
2110
2111          emit_constant_values(dst, field_value);
2112       }
2113       return;
2114    }
2115
2116    if (ir->type->is_array()) {
2117       for (unsigned int i = 0; i < ir->type->length; i++) {
2118          emit_constant_values(dst, ir->array_elements[i]);
2119       }
2120       return;
2121    }
2122
2123    if (ir->type->is_matrix()) {
2124       for (int i = 0; i < ir->type->matrix_columns; i++) {
2125          float *vec = &ir->value.f[i * ir->type->vector_elements];
2126
2127          for (int j = 0; j < ir->type->vector_elements; j++) {
2128             dst->writemask = 1 << j;
2129             dst->type = BRW_REGISTER_TYPE_F;
2130
2131             emit(MOV(*dst, src_reg(vec[j])));
2132          }
2133          dst->reg_offset++;
2134       }
2135       return;
2136    }
2137
2138    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2139
2140    for (int i = 0; i < ir->type->vector_elements; i++) {
2141       if (!(remaining_writemask & (1 << i)))
2142          continue;
2143
2144       dst->writemask = 1 << i;
2145       dst->type = brw_type_for_base_type(ir->type);
2146
2147       /* Find other components that match the one we're about to
2148        * write.  Emits fewer instructions for things like vec4(0.5,
2149        * 1.5, 1.5, 1.5).
2150        */
2151       for (int j = i + 1; j < ir->type->vector_elements; j++) {
2152          if (ir->type->base_type == GLSL_TYPE_BOOL) {
2153             if (ir->value.b[i] == ir->value.b[j])
2154                dst->writemask |= (1 << j);
2155          } else {
2156             /* u, i, and f storage all line up, so no need for a
2157              * switch case for comparing each type.
2158              */
2159             if (ir->value.u[i] == ir->value.u[j])
2160                dst->writemask |= (1 << j);
2161          }
2162       }
2163
2164       switch (ir->type->base_type) {
2165       case GLSL_TYPE_FLOAT:
2166          emit(MOV(*dst, src_reg(ir->value.f[i])));
2167          break;
2168       case GLSL_TYPE_INT:
2169          emit(MOV(*dst, src_reg(ir->value.i[i])));
2170          break;
2171       case GLSL_TYPE_UINT:
2172          emit(MOV(*dst, src_reg(ir->value.u[i])));
2173          break;
2174       case GLSL_TYPE_BOOL:
2175          emit(MOV(*dst, src_reg(ir->value.b[i])));
2176          break;
2177       default:
2178          assert(!"Non-float/uint/int/bool constant");
2179          break;
2180       }
2181
2182       remaining_writemask &= ~dst->writemask;
2183    }
2184    dst->reg_offset++;
2185 }
2186
2187 void
2188 vec4_visitor::visit(ir_constant *ir)
2189 {
2190    dst_reg dst = dst_reg(this, ir->type);
2191    this->result = src_reg(dst);
2192
2193    emit_constant_values(&dst, ir);
2194 }
2195
2196 void
2197 vec4_visitor::visit(ir_call *ir)
2198 {
2199    assert(!"not reached");
2200 }
2201
/**
 * Emit the sampler message for a texturing operation.
 *
 * All operand subexpressions (coordinate, shadow comparitor, and the
 * per-opcode LOD / sample index / gradients) are evaluated first.  The
 * message payload is then written into MRFs starting at base_mrf 2, the
 * sampler instruction is emitted, and its destination is passed through
 * swizzle_result(), which leaves the final value in this->result.
 */
void
vec4_visitor::visit(ir_texture *ir)
{
   int sampler =
      _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   /* Generate code to compute all the subexpression trees.  This has to be
    * done before loading any values into MRFs for the sampler message since
    * generating these values may involve SEND messages that need the MRFs.
    */
   src_reg coordinate;
   if (ir->coordinate) {
      ir->coordinate->accept(this);
      coordinate = this->result;
   }

   src_reg shadow_comparitor;
   if (ir->shadow_comparitor) {
      ir->shadow_comparitor->accept(this);
      shadow_comparitor = this->result;
   }

   /* Evaluate the extra operand(s) each opcode carries.  lod_type and
    * sample_index_type remember the GLSL type so the payload MOVs below can
    * use matching MRF register types.
    */
   const glsl_type *lod_type = NULL, *sample_index_type = NULL;
   src_reg lod, dPdx, dPdy, sample_index;
   switch (ir->op) {
   case ir_tex:
      /* Plain ir_tex is sent as TXL (see below) with an explicit LOD of 0. */
      lod = src_reg(0.0f);
      lod_type = glsl_type::float_type;
      break;
   case ir_txf:
   case ir_txl:
   case ir_txs:
      ir->lod_info.lod->accept(this);
      lod = this->result;
      lod_type = ir->lod_info.lod->type;
      break;
   case ir_txf_ms:
      ir->lod_info.sample_index->accept(this);
      sample_index = this->result;
      sample_index_type = ir->lod_info.sample_index->type;
      break;
   case ir_txd:
      ir->lod_info.grad.dPdx->accept(this);
      dPdx = this->result;

      ir->lod_info.grad.dPdy->accept(this);
      dPdy = this->result;

      lod_type = ir->lod_info.grad.dPdx->type;
      break;
   case ir_txb:
   case ir_lod:
      break;
   }

   /* Create the sampler instruction now, but emit it only after its payload
    * has been set up.
    *
    * NOTE(review): for ir_txb and ir_lod only an assert fires and inst stays
    * NULL; with asserts compiled out the inst-> accesses below would
    * dereference a NULL pointer.
    */
   vec4_instruction *inst = NULL;
   switch (ir->op) {
   case ir_tex:
   case ir_txl:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
      break;
   case ir_txd:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
      break;
   case ir_txf:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
      break;
   case ir_txf_ms:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
      break;
   case ir_txs:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
      break;
   case ir_txb:
      assert(!"TXB is not valid for vertex shaders.");
      break;
   case ir_lod:
      assert(!"LOD is not valid for vertex shaders.");
      break;
   }

   /* For txf the offset is folded into the coordinate instead (see below),
    * so it is not treated as a header texel offset.
    */
   bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;

   /* Texel offsets go in the message header; Gen4 also requires headers. */
   inst->header_present = use_texture_offset || intel->gen < 5;
   inst->base_mrf = 2;
   inst->mlen = inst->header_present + 1; /* always at least one */
   inst->sampler = sampler;
   inst->dst = dst_reg(this, ir->type);
   inst->dst.writemask = WRITEMASK_XYZW;
   inst->shadow_compare = ir->shadow_comparitor != NULL;

   if (use_texture_offset)
      inst->texture_offset = brw_texture_offset(ir->offset->as_constant());

   /* MRF for the first parameter */
   int param_base = inst->base_mrf + inst->header_present;

   if (ir->op == ir_txs) {
      /* The only payload for txs is the LOD; which channel it is written to
       * depends on the hardware generation.
       */
      int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
      emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
   } else {
      int i, coord_mask = 0, zero_mask = 0;
      /* Load the coordinate */
      /* FINISHME: gl_clamp_mask and saturate */
      /* coord_mask covers the channels the coordinate provides; zero_mask
       * covers the remaining channels, which get zero-filled below.
       */
      for (i = 0; i < ir->coordinate->type->vector_elements; i++)
         coord_mask |= (1 << i);
      for (; i < 4; i++)
         zero_mask |= (1 << i);

      if (ir->offset && ir->op == ir_txf) {
         /* It appears that the ld instruction used for txf does its
          * address bounds check before adding in the offset.  To work
          * around this, just add the integer offset to the integer
          * texel coordinate, and don't put the offset in the header.
          */
         ir_constant *offset = ir->offset->as_constant();
         assert(offset);

         /* One ADD per coordinate channel: the source is swizzled to
          * broadcast channel j and only channel j of the MRF is written.
          */
         for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
            src_reg src = coordinate;
            src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
                                       BRW_GET_SWZ(src.swizzle, j),
                                       BRW_GET_SWZ(src.swizzle, j),
                                       BRW_GET_SWZ(src.swizzle, j));
            emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
                     src, offset->value.i[j]));
         }
      } else {
         emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
                  coordinate));
      }
      /* Zero the channels the coordinate does not cover. */
      emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
               src_reg(0)));
      /* Load the shadow comparitor */
      /* (For txd it is written together with the gradients further down.) */
      if (ir->shadow_comparitor && ir->op != ir_txd) {
         emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
                          WRITEMASK_X),
                  shadow_comparitor));
         inst->mlen++;
      }

      /* Load the LOD info */
      if (ir->op == ir_tex || ir->op == ir_txl) {
         int mrf, writemask;
         if (intel->gen >= 5) {
            /* LOD shares the second parameter MRF with the shadow
             * comparitor when there is one.
             */
            mrf = param_base + 1;
            if (ir->shadow_comparitor) {
               writemask = WRITEMASK_Y;
               /* mlen already incremented */
            } else {
               writemask = WRITEMASK_X;
               inst->mlen++;
            }
         } else /* intel->gen == 4 */ {
            mrf = param_base;
            writemask = WRITEMASK_Z;
         }
         emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
      } else if (ir->op == ir_txf) {
         /* For txf the LOD goes in the W channel of the coordinate MRF. */
         emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
      } else if (ir->op == ir_txf_ms) {
         emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
                  sample_index));
         inst->mlen++;

         /* on Gen7, there is an additional MCS parameter here after SI,
          * but we don't bother to emit it since it's always zero. If
          * we start supporting texturing from CMS surfaces, this will have
          * to change
          */
      } else if (ir->op == ir_txd) {
         const glsl_type *type = lod_type;

         if (intel->gen >= 5) {
            /* The X and Y components of both gradients are interleaved in
             * one MRF: dPdx lands in the X/Z channels, dPdy in Y/W.
             */
            dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
            inst->mlen++;

            /* A further MRF carries the Z components of the gradients and,
             * when present, the shadow comparitor.
             */
            if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
               dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
               dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
               inst->mlen++;

               if (ir->shadow_comparitor) {
                  emit(MOV(dst_reg(MRF, param_base + 2,
                                   ir->shadow_comparitor->type, WRITEMASK_Z),
                           shadow_comparitor));
               }
            }
         } else /* intel->gen == 4 */ {
            /* Each gradient gets an MRF of its own. */
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
            emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
            inst->mlen += 2;
         }
      }
   }

   /* The payload is complete; emit the sampler instruction itself. */
   emit(inst);

   /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
    * spec requires layers.
    */
   if (ir->op == ir_txs) {
      glsl_type const *type = ir->sampler->type;
      if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
          type->sampler_array) {
         emit_math(SHADER_OPCODE_INT_QUOTIENT,
                   with_writemask(inst->dst, WRITEMASK_Z),
                   src_reg(inst->dst), src_reg(6));
      }
   }

   swizzle_result(ir, src_reg(inst->dst), sampler);
}
2424
2425 void
2426 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2427 {
2428    int s = key->tex.swizzles[sampler];
2429
2430    this->result = src_reg(this, ir->type);
2431    dst_reg swizzled_result(this->result);
2432
2433    if (ir->op == ir_txs || ir->type == glsl_type::float_type
2434                         || s == SWIZZLE_NOOP) {
2435       emit(MOV(swizzled_result, orig_val));
2436       return;
2437    }
2438
2439    int zero_mask = 0, one_mask = 0, copy_mask = 0;
2440    int swizzle[4];
2441
2442    for (int i = 0; i < 4; i++) {
2443       switch (GET_SWZ(s, i)) {
2444       case SWIZZLE_ZERO:
2445          zero_mask |= (1 << i);
2446          break;
2447       case SWIZZLE_ONE:
2448          one_mask |= (1 << i);
2449          break;
2450       default:
2451          copy_mask |= (1 << i);
2452          swizzle[i] = GET_SWZ(s, i);
2453          break;
2454       }
2455    }
2456
2457    if (copy_mask) {
2458       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2459       swizzled_result.writemask = copy_mask;
2460       emit(MOV(swizzled_result, orig_val));
2461    }
2462
2463    if (zero_mask) {
2464       swizzled_result.writemask = zero_mask;
2465       emit(MOV(swizzled_result, src_reg(0.0f)));
2466    }
2467
2468    if (one_mask) {
2469       swizzled_result.writemask = one_mask;
2470       emit(MOV(swizzled_result, src_reg(1.0f)));
2471    }
2472 }
2473
2474 void
2475 vec4_visitor::visit(ir_return *ir)
2476 {
2477    assert(!"not reached");
2478 }
2479
2480 void
2481 vec4_visitor::visit(ir_discard *ir)
2482 {
2483    assert(!"not reached");
2484 }
2485
2486 void
2487 vec4_visitor::visit(ir_if *ir)
2488 {
2489    /* Don't point the annotation at the if statement, because then it plus
2490     * the then and else blocks get printed.
2491     */
2492    this->base_ir = ir->condition;
2493
2494    if (intel->gen == 6) {
2495       emit_if_gen6(ir);
2496    } else {
2497       uint32_t predicate;
2498       emit_bool_to_cond_code(ir->condition, &predicate);
2499       emit(IF(predicate));
2500    }
2501
2502    visit_instructions(&ir->then_instructions);
2503
2504    if (!ir->else_instructions.is_empty()) {
2505       this->base_ir = ir->condition;
2506       emit(BRW_OPCODE_ELSE);
2507
2508       visit_instructions(&ir->else_instructions);
2509    }
2510
2511    this->base_ir = ir->condition;
2512    emit(BRW_OPCODE_ENDIF);
2513 }
2514
2515 void
2516 vec4_visitor::emit_ndc_computation()
2517 {
2518    /* Get the position */
2519    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2520
2521    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2522    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2523    output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2524
2525    current_annotation = "NDC";
2526    dst_reg ndc_w = ndc;
2527    ndc_w.writemask = WRITEMASK_W;
2528    src_reg pos_w = pos;
2529    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2530    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2531
2532    dst_reg ndc_xyz = ndc;
2533    ndc_xyz.writemask = WRITEMASK_XYZ;
2534
2535    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2536 }
2537
2538 void
2539 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2540 {
2541    if (intel->gen < 6 &&
2542        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2543         key->userclip_active || brw->has_negative_rhw_bug)) {
2544       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2545       dst_reg header1_w = header1;
2546       header1_w.writemask = WRITEMASK_W;
2547       GLuint i;
2548
2549       emit(MOV(header1, 0u));
2550
2551       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2552          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2553
2554          current_annotation = "Point size";
2555          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2556          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2557       }
2558
2559       current_annotation = "Clipping flags";
2560       for (i = 0; i < key->nr_userclip_plane_consts; i++) {
2561          vec4_instruction *inst;
2562
2563          inst = emit(DP4(dst_null_f(), src_reg(output_reg[VARYING_SLOT_POS]),
2564                          src_reg(this->userplane[i])));
2565          inst->conditional_mod = BRW_CONDITIONAL_L;
2566
2567          inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2568          inst->predicate = BRW_PREDICATE_NORMAL;
2569       }
2570
2571       /* i965 clipping workaround:
2572        * 1) Test for -ve rhw
2573        * 2) If set,
2574        *      set ndc = (0,0,0,0)
2575        *      set ucp[6] = 1
2576        *
2577        * Later, clipping will detect ucp[6] and ensure the primitive is
2578        * clipped against all fixed planes.
2579        */
2580       if (brw->has_negative_rhw_bug) {
2581          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2582          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2583          emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2584          vec4_instruction *inst;
2585          inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2586          inst->predicate = BRW_PREDICATE_NORMAL;
2587          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2588          inst->predicate = BRW_PREDICATE_NORMAL;
2589       }
2590
2591       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2592    } else if (intel->gen < 6) {
2593       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2594    } else {
2595       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2596       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2597          emit(MOV(brw_writemask(reg, WRITEMASK_W),
2598                   src_reg(output_reg[VARYING_SLOT_PSIZ])));
2599       }
2600    }
2601 }
2602
2603 void
2604 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2605 {
2606    if (intel->gen < 6) {
2607       /* Clip distance slots are set aside in gen5, but they are not used.  It
2608        * is not clear whether we actually need to set aside space for them,
2609        * but the performance cost is negligible.
2610        */
2611       return;
2612    }
2613
2614    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2615     *
2616     *     "If a linked set of shaders forming the vertex stage contains no
2617     *     static write to gl_ClipVertex or gl_ClipDistance, but the
2618     *     application has requested clipping against user clip planes through
2619     *     the API, then the coordinate written to gl_Position is used for
2620     *     comparison against the user clip planes."
2621     *
2622     * This function is only called if the shader didn't write to
2623     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
2624     * if the user wrote to it; otherwise we use gl_Position.
2625     */
2626    gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2627    if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2628       clip_vertex = VARYING_SLOT_POS;
2629    }
2630
2631    for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2632         ++i) {
2633       emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2634                src_reg(output_reg[clip_vertex]),
2635                src_reg(this->userplane[i + offset])));
2636    }
2637 }
2638
2639 void
2640 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2641 {
2642    assert (varying < VARYING_SLOT_MAX);
2643    reg.type = output_reg[varying].type;
2644    current_annotation = output_reg_annotation[varying];
2645    /* Copy the register, saturating if necessary */
2646    vec4_instruction *inst = emit(MOV(reg,
2647                                      src_reg(output_reg[varying])));
2648    if ((varying == VARYING_SLOT_COL0 ||
2649         varying == VARYING_SLOT_COL1 ||
2650         varying == VARYING_SLOT_BFC0 ||
2651         varying == VARYING_SLOT_BFC1) &&
2652        key->clamp_vertex_color) {
2653       inst->saturate = true;
2654    }
2655 }
2656
2657 void
2658 vec4_visitor::emit_urb_slot(int mrf, int varying)
2659 {
2660    struct brw_reg hw_reg = brw_message_reg(mrf);
2661    dst_reg reg = dst_reg(MRF, mrf);
2662    reg.type = BRW_REGISTER_TYPE_F;
2663
2664    switch (varying) {
2665    case VARYING_SLOT_PSIZ:
2666       /* PSIZ is always in slot 0, and is coupled with other flags. */
2667       current_annotation = "indices, point width, clip flags";
2668       emit_psiz_and_flags(hw_reg);
2669       break;
2670    case BRW_VARYING_SLOT_NDC:
2671       current_annotation = "NDC";
2672       emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2673       break;
2674    case BRW_VARYING_SLOT_POS_DUPLICATE:
2675    case VARYING_SLOT_POS:
2676       current_annotation = "gl_Position";
2677       emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2678       break;
2679    case VARYING_SLOT_CLIP_DIST0:
2680    case VARYING_SLOT_CLIP_DIST1:
2681       if (this->key->uses_clip_distance) {
2682          emit_generic_urb_slot(reg, varying);
2683       } else {
2684          current_annotation = "user clip distances";
2685          emit_clip_distances(hw_reg, (varying - VARYING_SLOT_CLIP_DIST0) * 4);
2686       }
2687       break;
2688    case VARYING_SLOT_EDGE:
2689       /* This is present when doing unfilled polygons.  We're supposed to copy
2690        * the edge flag from the user-provided vertex array
2691        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2692        * of that attribute (starts as 1.0f).  This is then used in clipping to
2693        * determine which edges should be drawn as wireframe.
2694        */
2695       current_annotation = "edge flag";
2696       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2697                                     glsl_type::float_type, WRITEMASK_XYZW))));
2698       break;
2699    case BRW_VARYING_SLOT_PAD:
2700       /* No need to write to this slot */
2701       break;
2702    default:
2703       emit_generic_urb_slot(reg, varying);
2704       break;
2705    }
2706 }
2707
2708 static int
2709 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2710 {
2711    struct intel_context *intel = &brw->intel;
2712
2713    if (intel->gen >= 6) {
2714       /* URB data written (does not include the message header reg) must
2715        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
2716        * section 5.4.3.2.2: URB_INTERLEAVED.
2717        *
2718        * URB entries are allocated on a multiple of 1024 bits, so an
2719        * extra 128 bits written here to make the end align to 256 is
2720        * no problem.
2721        */
2722       if ((mlen % 2) != 1)
2723          mlen++;
2724    }
2725
2726    return mlen;
2727 }
2728
2729 void
2730 vec4_vs_visitor::emit_urb_write_header(int mrf)
2731 {
2732    /* No need to do anything for VS; an implied write to this MRF will be
2733     * performed by VS_OPCODE_URB_WRITE.
2734     */
2735    (void) mrf;
2736 }
2737
2738 vec4_instruction *
2739 vec4_vs_visitor::emit_urb_write_opcode(bool complete)
2740 {
2741    /* For VS, the URB writes end the thread. */
2742    if (complete) {
2743       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2744          emit_shader_time_end();
2745    }
2746
2747    vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2748    inst->eot = complete;
2749
2750    return inst;
2751 }
2752
2753 /**
2754  * Generates the VUE payload plus the necessary URB write instructions to
2755  * output it.
2756  *
2757  * The VUE layout is documented in Volume 2a.
2758  */
2759 void
2760 vec4_visitor::emit_vertex()
2761 {
2762    /* MRF 0 is reserved for the debugger, so start with message header
2763     * in MRF 1.
2764     */
2765    int base_mrf = 1;
2766    int mrf = base_mrf;
2767    /* In the process of generating our URB write message contents, we
2768     * may need to unspill a register or load from an array.  Those
2769     * reads would use MRFs 14-15.
2770     */
2771    int max_usable_mrf = 13;
2772
2773    /* The following assertion verifies that max_usable_mrf causes an
2774     * even-numbered amount of URB write data, which will meet gen6's
2775     * requirements for length alignment.
2776     */
2777    assert ((max_usable_mrf - base_mrf) % 2 == 0);
2778
2779    /* First mrf is the g0-based message header containing URB handles and
2780     * such.
2781     */
2782    emit_urb_write_header(mrf++);
2783
2784    if (intel->gen < 6) {
2785       emit_ndc_computation();
2786    }
2787
2788    /* Set up the VUE data for the first URB write */
2789    int slot;
2790    for (slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
2791       emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2792
2793       /* If this was max_usable_mrf, we can't fit anything more into this URB
2794        * WRITE.
2795        */
2796       if (mrf > max_usable_mrf) {
2797          slot++;
2798          break;
2799       }
2800    }
2801
2802    bool complete = slot >= prog_data->vue_map.num_slots;
2803    current_annotation = "URB write";
2804    vec4_instruction *inst = emit_urb_write_opcode(complete);
2805    inst->base_mrf = base_mrf;
2806    inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2807
2808    /* Optional second URB write */
2809    if (!complete) {
2810       mrf = base_mrf + 1;
2811
2812       for (; slot < prog_data->vue_map.num_slots; ++slot) {
2813          assert(mrf < max_usable_mrf);
2814
2815          emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2816       }
2817
2818       current_annotation = "URB write";
2819       inst = emit_urb_write_opcode(true /* complete */);
2820       inst->base_mrf = base_mrf;
2821       inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2822       /* URB destination offset.  In the previous write, we got MRFs
2823        * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
2824        * URB row increments, and each of our MRFs is half of one of
2825        * those, since we're doing interleaved writes.
2826        */
2827       inst->offset = (max_usable_mrf - base_mrf) / 2;
2828    }
2829 }
2830
2831 void
2832 vec4_vs_visitor::emit_thread_end()
2833 {
2834    /* For VS, we always end the thread by emitting a single vertex.
2835     * emit_urb_write_opcode() will take care of setting the eot flag on the
2836     * SEND instruction.
2837     */
2838    emit_vertex();
2839 }
2840
2841 src_reg
2842 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2843                                  src_reg *reladdr, int reg_offset)
2844 {
2845    /* Because we store the values to scratch interleaved like our
2846     * vertex data, we need to scale the vec4 index by 2.
2847     */
2848    int message_header_scale = 2;
2849
2850    /* Pre-gen6, the message header uses byte offsets instead of vec4
2851     * (16-byte) offset units.
2852     */
2853    if (intel->gen < 6)
2854       message_header_scale *= 16;
2855
2856    if (reladdr) {
2857       src_reg index = src_reg(this, glsl_type::int_type);
2858
2859       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2860       emit_before(inst, MUL(dst_reg(index),
2861                             index, src_reg(message_header_scale)));
2862
2863       return index;
2864    } else {
2865       return src_reg(reg_offset * message_header_scale);
2866    }
2867 }
2868
2869 src_reg
2870 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2871                                        src_reg *reladdr, int reg_offset)
2872 {
2873    if (reladdr) {
2874       src_reg index = src_reg(this, glsl_type::int_type);
2875
2876       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2877
2878       /* Pre-gen6, the message header uses byte offsets instead of vec4
2879        * (16-byte) offset units.
2880        */
2881       if (intel->gen < 6) {
2882          emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2883       }
2884
2885       return index;
2886    } else {
2887       int message_header_scale = intel->gen < 6 ? 16 : 1;
2888       return src_reg(reg_offset * message_header_scale);
2889    }
2890 }
2891
2892 /**
2893  * Emits an instruction before @inst to load the value named by @orig_src
2894  * from scratch space at @base_offset to @temp.
2895  *
2896  * @base_offset is measured in 32-byte units (the size of a register).
2897  */
2898 void
2899 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2900                                 dst_reg temp, src_reg orig_src,
2901                                 int base_offset)
2902 {
2903    int reg_offset = base_offset + orig_src.reg_offset;
2904    src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2905
2906    emit_before(inst, SCRATCH_READ(temp, index));
2907 }
2908
2909 /**
2910  * Emits an instruction after @inst to store the value to be written
2911  * to @orig_dst to scratch space at @base_offset, from @temp.
2912  *
2913  * @base_offset is measured in 32-byte units (the size of a register).
2914  */
2915 void
2916 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2917 {
2918    int reg_offset = base_offset + inst->dst.reg_offset;
2919    src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2920
2921    /* Create a temporary register to store *inst's result in.
2922     *
2923     * We have to be careful in MOVing from our temporary result register in
2924     * the scratch write.  If we swizzle from channels of the temporary that
2925     * weren't initialized, it will confuse live interval analysis, which will
2926     * make spilling fail to make progress.
2927     */
2928    src_reg temp = src_reg(this, glsl_type::vec4_type);
2929    temp.type = inst->dst.type;
2930    int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2931    int swizzles[4];
2932    for (int i = 0; i < 4; i++)
2933       if (inst->dst.writemask & (1 << i))
2934          swizzles[i] = i;
2935       else
2936          swizzles[i] = first_writemask_chan;
2937    temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2938                                swizzles[2], swizzles[3]);
2939
2940    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2941                                        inst->dst.writemask));
2942    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2943    write->predicate = inst->predicate;
2944    write->ir = inst->ir;
2945    write->annotation = inst->annotation;
2946    inst->insert_after(write);
2947
2948    inst->dst.file = temp.file;
2949    inst->dst.reg = temp.reg;
2950    inst->dst.reg_offset = temp.reg_offset;
2951    inst->dst.reladdr = NULL;
2952 }
2953
2954 /**
2955  * We can't generally support array access in GRF space, because a
2956  * single instruction's destination can only span 2 contiguous
2957  * registers.  So, we send all GRF arrays that get variable index
2958  * access to scratch space.
2959  */
2960 void
2961 vec4_visitor::move_grf_array_access_to_scratch()
2962 {
2963    int scratch_loc[this->virtual_grf_count];
2964
2965    for (int i = 0; i < this->virtual_grf_count; i++) {
2966       scratch_loc[i] = -1;
2967    }
2968
2969    /* First, calculate the set of virtual GRFs that need to be punted
2970     * to scratch due to having any array access on them, and where in
2971     * scratch.
2972     */
2973    foreach_list(node, &this->instructions) {
2974       vec4_instruction *inst = (vec4_instruction *)node;
2975
2976       if (inst->dst.file == GRF && inst->dst.reladdr &&
2977           scratch_loc[inst->dst.reg] == -1) {
2978          scratch_loc[inst->dst.reg] = c->last_scratch;
2979          c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2980       }
2981
2982       for (int i = 0 ; i < 3; i++) {
2983          src_reg *src = &inst->src[i];
2984
2985          if (src->file == GRF && src->reladdr &&
2986              scratch_loc[src->reg] == -1) {
2987             scratch_loc[src->reg] = c->last_scratch;
2988             c->last_scratch += this->virtual_grf_sizes[src->reg];
2989          }
2990       }
2991    }
2992
2993    /* Now, for anything that will be accessed through scratch, rewrite
2994     * it to load/store.  Note that this is a _safe list walk, because
2995     * we may generate a new scratch_write instruction after the one
2996     * we're processing.
2997     */
2998    foreach_list_safe(node, &this->instructions) {
2999       vec4_instruction *inst = (vec4_instruction *)node;
3000
3001       /* Set up the annotation tracking for new generated instructions. */
3002       base_ir = inst->ir;
3003       current_annotation = inst->annotation;
3004
3005       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3006          emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3007       }
3008
3009       for (int i = 0 ; i < 3; i++) {
3010          if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3011             continue;
3012
3013          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3014
3015          emit_scratch_read(inst, temp, inst->src[i],
3016                            scratch_loc[inst->src[i].reg]);
3017
3018          inst->src[i].file = temp.file;
3019          inst->src[i].reg = temp.reg;
3020          inst->src[i].reg_offset = temp.reg_offset;
3021          inst->src[i].reladdr = NULL;
3022       }
3023    }
3024 }
3025
3026 /**
3027  * Emits an instruction before @inst to load the value named by @orig_src
3028  * from the pull constant buffer (surface) at @base_offset to @temp.
3029  */
3030 void
3031 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3032                                       dst_reg temp, src_reg orig_src,
3033                                       int base_offset)
3034 {
3035    int reg_offset = base_offset + orig_src.reg_offset;
3036    src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
3037    src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3038    vec4_instruction *load;
3039
3040    if (intel->gen >= 7) {
3041       dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3042       grf_offset.type = offset.type;
3043       emit_before(inst, MOV(grf_offset, offset));
3044
3045       load = new(mem_ctx) vec4_instruction(this,
3046                                            VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3047                                            temp, index, src_reg(grf_offset));
3048    } else {
3049       load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3050                                            temp, index, offset);
3051       load->base_mrf = 14;
3052       load->mlen = 1;
3053    }
3054    emit_before(inst, load);
3055 }
3056
3057 /**
3058  * Implements array access of uniforms by inserting a
3059  * PULL_CONSTANT_LOAD instruction.
3060  *
3061  * Unlike temporary GRF array access (where we don't support it due to
3062  * the difficulty of doing relative addressing on instruction
3063  * destinations), we could potentially do array access of uniforms
3064  * that were loaded in GRF space as push constants.  In real-world
3065  * usage we've seen, though, the arrays being used are always larger
3066  * than we could load as push constants, so just always move all
3067  * uniform array access out to a pull constant buffer.
3068  */
3069 void
3070 vec4_visitor::move_uniform_array_access_to_pull_constants()
3071 {
3072    int pull_constant_loc[this->uniforms];
3073
3074    for (int i = 0; i < this->uniforms; i++) {
3075       pull_constant_loc[i] = -1;
3076    }
3077
3078    /* Walk through and find array access of uniforms.  Put a copy of that
3079     * uniform in the pull constant buffer.
3080     *
3081     * Note that we don't move constant-indexed accesses to arrays.  No
3082     * testing has been done of the performance impact of this choice.
3083     */
3084    foreach_list_safe(node, &this->instructions) {
3085       vec4_instruction *inst = (vec4_instruction *)node;
3086
3087       for (int i = 0 ; i < 3; i++) {
3088          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3089             continue;
3090
3091          int uniform = inst->src[i].reg;
3092
3093          /* If this array isn't already present in the pull constant buffer,
3094           * add it.
3095           */
3096          if (pull_constant_loc[uniform] == -1) {
3097             const float **values = &prog_data->param[uniform * 4];
3098
3099             pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
3100
3101             for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3102                prog_data->pull_param[prog_data->nr_pull_params++]
3103                   = values[j];
3104             }
3105          }
3106
3107          /* Set up the annotation tracking for new generated instructions. */
3108          base_ir = inst->ir;
3109          current_annotation = inst->annotation;
3110
3111          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3112
3113          emit_pull_constant_load(inst, temp, inst->src[i],
3114                                  pull_constant_loc[uniform]);
3115
3116          inst->src[i].file = temp.file;
3117          inst->src[i].reg = temp.reg;
3118          inst->src[i].reg_offset = temp.reg_offset;
3119          inst->src[i].reladdr = NULL;
3120       }
3121    }
3122
3123    /* Now there are no accesses of the UNIFORM file with a reladdr, so
3124     * no need to track them as larger-than-vec4 objects.  This will be
3125     * relied on in cutting out unused uniform vectors from push
3126     * constants.
3127     */
3128    split_uniform_registers();
3129 }
3130
3131 void
3132 vec4_visitor::resolve_ud_negate(src_reg *reg)
3133 {
3134    if (reg->type != BRW_REGISTER_TYPE_UD ||
3135        !reg->negate)
3136       return;
3137
3138    src_reg temp = src_reg(this, glsl_type::uvec4_type);
3139    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3140    *reg = temp;
3141 }
3142
3143 vec4_visitor::vec4_visitor(struct brw_context *brw,
3144                            struct brw_vec4_compile *c,
3145                            struct gl_program *prog,
3146                            const struct brw_vec4_prog_key *key,
3147                            struct brw_vec4_prog_data *prog_data,
3148                            struct gl_shader_program *shader_prog,
3149                            struct brw_shader *shader,
3150                            void *mem_ctx,
3151                            bool debug_flag)
3152    : debug_flag(debug_flag)
3153 {
3154    this->brw = brw;
3155    this->intel = &brw->intel;
3156    this->ctx = &intel->ctx;
3157    this->shader_prog = shader_prog;
3158    this->shader = shader;
3159
3160    this->mem_ctx = mem_ctx;
3161    this->failed = false;
3162
3163    this->base_ir = NULL;
3164    this->current_annotation = NULL;
3165    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3166
3167    this->c = c;
3168    this->prog = prog;
3169    this->key = key;
3170    this->prog_data = prog_data;
3171
3172    this->variable_ht = hash_table_ctor(0,
3173                                        hash_table_pointer_hash,
3174                                        hash_table_pointer_compare);
3175
3176    this->virtual_grf_def = NULL;
3177    this->virtual_grf_use = NULL;
3178    this->virtual_grf_sizes = NULL;
3179    this->virtual_grf_count = 0;
3180    this->virtual_grf_reg_map = NULL;
3181    this->virtual_grf_reg_count = 0;
3182    this->virtual_grf_array_size = 0;
3183    this->live_intervals_valid = false;
3184
3185    this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3186
3187    this->uniforms = 0;
3188 }
3189
3190 vec4_visitor::~vec4_visitor()
3191 {
3192    hash_table_dtor(this->variable_ht);
3193 }
3194
3195
3196 vec4_vs_visitor::vec4_vs_visitor(struct brw_context *brw,
3197                                  struct brw_vs_compile *vs_compile,
3198                                  struct brw_vs_prog_data *vs_prog_data,
3199                                  struct gl_shader_program *prog,
3200                                  struct brw_shader *shader,
3201                                  void *mem_ctx)
3202    : vec4_visitor(brw, &vs_compile->base, &vs_compile->vp->program.Base,
3203                   &vs_compile->key.base, &vs_prog_data->base, prog, shader,
3204                   mem_ctx, INTEL_DEBUG & DEBUG_VS),
3205      vs_compile(vs_compile),
3206      vs_prog_data(vs_prog_data)
3207 {
3208 }
3209
3210
3211 void
3212 vec4_visitor::fail(const char *format, ...)
3213 {
3214    va_list va;
3215    char *msg;
3216
3217    if (failed)
3218       return;
3219
3220    failed = true;
3221
3222    va_start(va, format);
3223    msg = ralloc_vasprintf(mem_ctx, format, va);
3224    va_end(va);
3225    msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3226
3227    this->fail_msg = msg;
3228
3229    if (debug_flag) {
3230       fprintf(stderr, "%s",  msg);
3231    }
3232 }
3233
3234 } /* namespace brw */