i965/vec4: Generate URB writes using a loop.
src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp (mesa.git)
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->ir = v->base_ir;
42 this->annotation = v->current_annotation;
43 }
44
45 vec4_instruction *
46 vec4_visitor::emit(vec4_instruction *inst)
47 {
48 this->instructions.push_tail(inst);
49
50 return inst;
51 }
52
53 vec4_instruction *
54 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
55 {
56 new_inst->ir = inst->ir;
57 new_inst->annotation = inst->annotation;
58
59 inst->insert_before(new_inst);
60
61 return inst;
62 }
63
64 vec4_instruction *
65 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
66 src_reg src0, src_reg src1, src_reg src2)
67 {
68 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
69 src0, src1, src2));
70 }
71
72
73 vec4_instruction *
74 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
75 {
76 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
83 }
84
85 vec4_instruction *
86 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
87 {
88 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
89 }
90
91 vec4_instruction *
92 vec4_visitor::emit(enum opcode opcode)
93 {
94 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
95 }
96
97 #define ALU1(op) \
98 vec4_instruction * \
99 vec4_visitor::op(dst_reg dst, src_reg src0) \
100 { \
101 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
102 src0); \
103 }
104
105 #define ALU2(op) \
106 vec4_instruction * \
107 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
108 { \
109 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
110 src0, src1); \
111 }
112
113 #define ALU3(op) \
114 vec4_instruction * \
115 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
116 { \
117 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
118 src0, src1, src2); \
119 }
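/* For illustration, ALU2(ADD) expands to a small forwarding helper roughly
 * equivalent to:
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(dst_reg dst, src_reg src0, src_reg src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, dst,
 *                                            src0, src1);
 *    }
 *
 * Note that these helpers only construct the instruction; callers still
 * wrap them in emit(), e.g. emit(ADD(dst, src0, src1)).
 */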
120
121 ALU1(NOT)
122 ALU1(MOV)
123 ALU1(FRC)
124 ALU1(RNDD)
125 ALU1(RNDE)
126 ALU1(RNDZ)
127 ALU1(F32TO16)
128 ALU1(F16TO32)
129 ALU2(ADD)
130 ALU2(MUL)
131 ALU2(MACH)
132 ALU2(AND)
133 ALU2(OR)
134 ALU2(XOR)
135 ALU2(DP3)
136 ALU2(DP4)
137 ALU2(DPH)
138 ALU2(SHL)
139 ALU2(SHR)
140 ALU2(ASR)
141 ALU3(LRP)
142 ALU1(BFREV)
143 ALU3(BFE)
144 ALU2(BFI1)
145 ALU3(BFI2)
146 ALU1(FBH)
147 ALU1(FBL)
148 ALU1(CBIT)
149 ALU3(MAD)
150
151 /** Gen4 predicated IF. */
152 vec4_instruction *
153 vec4_visitor::IF(uint32_t predicate)
154 {
155 vec4_instruction *inst;
156
157 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
158 inst->predicate = predicate;
159
160 return inst;
161 }
162
163 /** Gen6+ IF with embedded comparison. */
164 vec4_instruction *
165 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
166 {
167 assert(brw->gen >= 6);
168
169 vec4_instruction *inst;
170
171 resolve_ud_negate(&src0);
172 resolve_ud_negate(&src1);
173
174 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
175 src0, src1);
176 inst->conditional_mod = condition;
177
178 return inst;
179 }
180
181 /**
182 * CMP: Sets the low bit of the destination channels with the result
183 * of the comparison, while the upper bits are undefined, and updates
184 * the flag register with the packed 16 bits of the result.
185 */
186 vec4_instruction *
187 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
188 {
189 vec4_instruction *inst;
190
191 /* original gen4 does type conversion to the destination type
192 * before the comparison, producing garbage results for floating
193 * point comparisons.
194 */
195 if (brw->gen == 4) {
196 dst.type = src0.type;
197 if (dst.file == HW_REG)
198 dst.fixed_hw_reg.type = dst.type;
199 }
200
201 resolve_ud_negate(&src0);
202 resolve_ud_negate(&src1);
203
204 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
205 inst->conditional_mod = condition;
206
207 return inst;
208 }
209
210 vec4_instruction *
211 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
212 {
213 vec4_instruction *inst;
214
215 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
216 dst, index);
217 inst->base_mrf = 14;
218 inst->mlen = 2;
219
220 return inst;
221 }
222
223 vec4_instruction *
224 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
225 {
226 vec4_instruction *inst;
227
228 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
229 dst, src, index);
230 inst->base_mrf = 13;
231 inst->mlen = 3;
232
233 return inst;
234 }
235
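/* Emit a dot product of 2, 3, or 4 elements by selecting DP2, DP3, or DP4. */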
236 void
237 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
238 {
239 static enum opcode dot_opcodes[] = {
240 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
241 };
242
243 emit(dot_opcodes[elements - 2], dst, src0, src1);
244 }
245
246 src_reg
247 vec4_visitor::fix_3src_operand(src_reg src)
248 {
249 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
250 * able to use a vertical stride of zero to replicate the vec4 uniform, like
251 *
252 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
253 *
254 * But you can't, since vertical stride is always four in three-source
255 * instructions. Instead, insert a MOV instruction to do the replication so
256 * that the three-source instruction can consume it.
257 */
258
259 /* The MOV is only needed if the source is a uniform or immediate. */
260 if (src.file != UNIFORM && src.file != IMM)
261 return src;
262
263 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
264 expanded.type = src.type;
265 emit(MOV(expanded, src));
266 return src_reg(expanded);
267 }
268
269 src_reg
270 vec4_visitor::fix_math_operand(src_reg src)
271 {
272 /* The gen6 math instruction ignores the source modifiers --
273 * swizzle, abs, negate, and at least some parts of the register
274 * region description.
275 *
276 * Rather than trying to enumerate all these cases, *always* expand the
277 * operand to a temp GRF for gen6.
278 *
279 * For gen7, keep the operand as-is, except if immediate, which gen7 still
280 * can't use.
281 */
282
283 if (brw->gen == 7 && src.file != IMM)
284 return src;
285
286 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
287 expanded.type = src.type;
288 emit(MOV(expanded, src));
289 return src_reg(expanded);
290 }
291
292 void
293 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
294 {
295 src = fix_math_operand(src);
296
297 if (dst.writemask != WRITEMASK_XYZW) {
298 /* The gen6 math instruction must be align1, so we can't do
299 * writemasks.
300 */
301 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
302
303 emit(opcode, temp_dst, src);
304
305 emit(MOV(dst, src_reg(temp_dst)));
306 } else {
307 emit(opcode, dst, src);
308 }
309 }
310
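/* On gen4/5 the math instruction is a send to the shared math unit, so
 * record the message register setup for the generator.
 */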
311 void
312 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
313 {
314 vec4_instruction *inst = emit(opcode, dst, src);
315 inst->base_mrf = 1;
316 inst->mlen = 1;
317 }
318
319 void
320 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
321 {
322 switch (opcode) {
323 case SHADER_OPCODE_RCP:
324 case SHADER_OPCODE_RSQ:
325 case SHADER_OPCODE_SQRT:
326 case SHADER_OPCODE_EXP2:
327 case SHADER_OPCODE_LOG2:
328 case SHADER_OPCODE_SIN:
329 case SHADER_OPCODE_COS:
330 break;
331 default:
332 assert(!"not reached: bad math opcode");
333 return;
334 }
335
336 if (brw->gen >= 6) {
337 return emit_math1_gen6(opcode, dst, src);
338 } else {
339 return emit_math1_gen4(opcode, dst, src);
340 }
341 }
342
343 void
344 vec4_visitor::emit_math2_gen6(enum opcode opcode,
345 dst_reg dst, src_reg src0, src_reg src1)
346 {
347 src0 = fix_math_operand(src0);
348 src1 = fix_math_operand(src1);
349
350 if (dst.writemask != WRITEMASK_XYZW) {
351 /* The gen6 math instruction must be align1, so we can't do
352 * writemasks.
353 */
354 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
355 temp_dst.type = dst.type;
356
357 emit(opcode, temp_dst, src0, src1);
358
359 emit(MOV(dst, src_reg(temp_dst)));
360 } else {
361 emit(opcode, dst, src0, src1);
362 }
363 }
364
365 void
366 vec4_visitor::emit_math2_gen4(enum opcode opcode,
367 dst_reg dst, src_reg src0, src_reg src1)
368 {
369 vec4_instruction *inst = emit(opcode, dst, src0, src1);
370 inst->base_mrf = 1;
371 inst->mlen = 2;
372 }
373
374 void
375 vec4_visitor::emit_math(enum opcode opcode,
376 dst_reg dst, src_reg src0, src_reg src1)
377 {
378 switch (opcode) {
379 case SHADER_OPCODE_POW:
380 case SHADER_OPCODE_INT_QUOTIENT:
381 case SHADER_OPCODE_INT_REMAINDER:
382 break;
383 default:
384 assert(!"not reached: unsupported binary math opcode");
385 return;
386 }
387
388 if (brw->gen >= 6) {
389 return emit_math2_gen6(opcode, dst, src0, src1);
390 } else {
391 return emit_math2_gen4(opcode, dst, src0, src1);
392 }
393 }
394
395 void
396 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
397 {
398 if (brw->gen < 7)
399 assert(!"ir_unop_pack_half_2x16 should be lowered");
400
401 assert(dst.type == BRW_REGISTER_TYPE_UD);
402 assert(src0.type == BRW_REGISTER_TYPE_F);
403
404 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
405 *
406 * Because this instruction does not have a 16-bit floating-point type,
407 * the destination data type must be Word (W).
408 *
409 * The destination must be DWord-aligned and specify a horizontal stride
410 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
411 * each destination channel and the upper word is not modified.
412 *
413 * The above restriction implies that the f32to16 instruction must use
414 * align1 mode, because only in align1 mode is it possible to specify
415 * horizontal stride. We choose here to defy the hardware docs and emit
416 * align16 instructions.
417 *
418 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
419 * instructions. I was partially successful in that the code passed all
420 * tests. However, the code was dubiously correct and fragile, and the
421 * tests were not harsh enough to probe that frailty. Not trusting the
422 * code, I chose instead to remain in align16 mode in defiance of the hw
423 * docs).
424 *
425 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
426 * simulator, emitting a f32to16 in align16 mode with UD as destination
427 * data type is safe. The behavior differs from that specified in the PRM
428 * in that the upper word of each destination channel is cleared to 0.
429 */
430
431 dst_reg tmp_dst(this, glsl_type::uvec2_type);
432 src_reg tmp_src(tmp_dst);
433
434 #if 0
435 /* Verify the undocumented behavior on which the following instructions
436 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
437 * then the result of the bit-or instruction below will be incorrect.
438 *
439 * You should inspect the disasm output in order to verify that the MOV is
440 * not optimized away.
441 */
442 emit(MOV(tmp_dst, src_reg(0x12345678u)));
443 #endif
444
445 /* Give tmp the form below, where "." means untouched.
446 *
447 * w z y x w z y x
448 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
449 *
450 * That the upper word of each write-channel be 0 is required for the
451 * following bit-shift and bit-or instructions to work. Note that this
452 * relies on the undocumented hardware behavior mentioned above.
453 */
454 tmp_dst.writemask = WRITEMASK_XY;
455 emit(F32TO16(tmp_dst, src0));
456
457 /* Give the write-channels of dst the form:
458 * 0xhhhh0000
459 */
460 tmp_src.swizzle = SWIZZLE_Y;
461 emit(SHL(dst, tmp_src, src_reg(16u)));
462
463 /* Finally, give the write-channels of dst the form of packHalf2x16's
464 * output:
465 * 0xhhhhllll
466 */
467 tmp_src.swizzle = SWIZZLE_X;
468 emit(OR(dst, src_reg(dst), tmp_src));
469 }
470
471 void
472 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
473 {
474 if (brw->gen < 7)
475 assert(!"ir_unop_unpack_half_2x16 should be lowered");
476
477 assert(dst.type == BRW_REGISTER_TYPE_F);
478 assert(src0.type == BRW_REGISTER_TYPE_UD);
479
480 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
481 *
482 * Because this instruction does not have a 16-bit floating-point type,
483 * the source data type must be Word (W). The destination type must be
484 * F (Float).
485 *
486 * To use W as the source data type, we must adjust horizontal strides,
487 * which is only possible in align1 mode. All my [chadv] attempts at
488 * emitting align1 instructions for unpackHalf2x16 failed to pass the
489 * Piglit tests, so I gave up.
490 *
491 * I've verified that, on gen7 hardware and the simulator, it is safe to
492 * emit f16to32 in align16 mode with UD as source data type.
493 */
494
495 dst_reg tmp_dst(this, glsl_type::uvec2_type);
496 src_reg tmp_src(tmp_dst);
497
498 tmp_dst.writemask = WRITEMASK_X;
499 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
500
501 tmp_dst.writemask = WRITEMASK_Y;
502 emit(SHR(tmp_dst, src0, src_reg(16u)));
503
504 dst.writemask = WRITEMASK_XY;
505 emit(F16TO32(dst, tmp_src));
506 }
507
508 void
509 vec4_visitor::visit_instructions(const exec_list *list)
510 {
511 foreach_list(node, list) {
512 ir_instruction *ir = (ir_instruction *)node;
513
514 base_ir = ir;
515 ir->accept(this);
516 }
517 }
518
519
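/* Return how many vec4 slots the given GLSL type occupies in this backend. */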
520 static int
521 type_size(const struct glsl_type *type)
522 {
523 unsigned int i;
524 int size;
525
526 switch (type->base_type) {
527 case GLSL_TYPE_UINT:
528 case GLSL_TYPE_INT:
529 case GLSL_TYPE_FLOAT:
530 case GLSL_TYPE_BOOL:
531 if (type->is_matrix()) {
532 return type->matrix_columns;
533 } else {
534 /* Regardless of the size of the vector, it gets a vec4. This is bad
535 * packing for things like floats, but otherwise arrays become a
536 * mess. Hopefully a later pass over the code can pack scalars
537 * down if appropriate.
538 */
539 return 1;
540 }
541 case GLSL_TYPE_ARRAY:
542 assert(type->length > 0);
543 return type_size(type->fields.array) * type->length;
544 case GLSL_TYPE_STRUCT:
545 size = 0;
546 for (i = 0; i < type->length; i++) {
547 size += type_size(type->fields.structure[i].type);
548 }
549 return size;
550 case GLSL_TYPE_SAMPLER:
551 /* Samplers take up one slot in UNIFORMS[], but they're baked in
552 * at link time.
553 */
554 return 1;
555 case GLSL_TYPE_VOID:
556 case GLSL_TYPE_ERROR:
557 case GLSL_TYPE_INTERFACE:
558 assert(0);
559 break;
560 }
561
562 return 0;
563 }
564
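/* Allocate a new virtual GRF of the given size (in vec4 slots), growing the
 * bookkeeping arrays if necessary, and return its index.
 */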
565 int
566 vec4_visitor::virtual_grf_alloc(int size)
567 {
568 if (virtual_grf_array_size <= virtual_grf_count) {
569 if (virtual_grf_array_size == 0)
570 virtual_grf_array_size = 16;
571 else
572 virtual_grf_array_size *= 2;
573 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
574 virtual_grf_array_size);
575 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
576 virtual_grf_array_size);
577 }
578 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
579 virtual_grf_reg_count += size;
580 virtual_grf_sizes[virtual_grf_count] = size;
581 return virtual_grf_count++;
582 }
583
584 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
585 {
586 init();
587
588 this->file = GRF;
589 this->reg = v->virtual_grf_alloc(type_size(type));
590
591 if (type->is_array() || type->is_record()) {
592 this->swizzle = BRW_SWIZZLE_NOOP;
593 } else {
594 this->swizzle = swizzle_for_size(type->vector_elements);
595 }
596
597 this->type = brw_type_for_base_type(type);
598 }
599
600 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
601 {
602 init();
603
604 this->file = GRF;
605 this->reg = v->virtual_grf_alloc(type_size(type));
606
607 if (type->is_array() || type->is_record()) {
608 this->writemask = WRITEMASK_XYZW;
609 } else {
610 this->writemask = (1 << type->vector_elements) - 1;
611 }
612
613 this->type = brw_type_for_base_type(type);
614 }
615
616 /* Our support for uniforms is piggy-backed on the struct
617 * gl_fragment_program, because that's where the values actually
618 * get stored, rather than in some global gl_shader_program uniform
619 * store.
620 */
621 void
622 vec4_visitor::setup_uniform_values(ir_variable *ir)
623 {
624 int namelen = strlen(ir->name);
625
626 /* The data for our (non-builtin) uniforms is stored in a series of
627 * gl_uniform_driver_storage structs for each subcomponent that
628 * glGetUniformLocation() could name. We know it's been set up in the same
629 * order we'd walk the type, so walk the list of storage and find anything
630 * with our name, or the prefix of a component that starts with our name.
631 */
632 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
633 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
634
635 if (strncmp(ir->name, storage->name, namelen) != 0 ||
636 (storage->name[namelen] != 0 &&
637 storage->name[namelen] != '.' &&
638 storage->name[namelen] != '[')) {
639 continue;
640 }
641
642 gl_constant_value *components = storage->storage;
643 unsigned vector_count = (MAX2(storage->array_elements, 1) *
644 storage->type->matrix_columns);
645
646 for (unsigned s = 0; s < vector_count; s++) {
647 uniform_vector_size[uniforms] = storage->type->vector_elements;
648
649 int i;
650 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
651 prog_data->param[uniforms * 4 + i] = &components->f;
652 components++;
653 }
654 for (; i < 4; i++) {
655 static float zero = 0;
656 prog_data->param[uniforms * 4 + i] = &zero;
657 }
658
659 uniforms++;
660 }
661 }
662 }
663
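/* Set up one vec4 uniform per user clip plane enabled in the compile key,
 * pointing the push constant params at the current clip plane values.
 */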
664 void
665 vec4_visitor::setup_uniform_clipplane_values()
666 {
667 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
668
669 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
670 this->uniform_vector_size[this->uniforms] = 4;
671 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
672 this->userplane[i].type = BRW_REGISTER_TYPE_F;
673 for (int j = 0; j < 4; ++j) {
674 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
675 }
676 ++this->uniforms;
677 }
678 }
679
680 /* Our support for builtin uniforms is even scarier than non-builtin.
681 * It sits on top of the PROG_STATE_VAR parameters that are
682 * automatically updated from GL context state.
683 */
684 void
685 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
686 {
687 const ir_state_slot *const slots = ir->state_slots;
688 assert(ir->state_slots != NULL);
689
690 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
691 /* This state reference has already been setup by ir_to_mesa,
692 * but we'll get the same index back here. We can reference
693 * ParameterValues directly, since unlike brw_fs.cpp, we never
694 * add new state references during compile.
695 */
696 int index = _mesa_add_state_reference(this->prog->Parameters,
697 (gl_state_index *)slots[i].tokens);
698 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
699
700 this->uniform_vector_size[this->uniforms] = 0;
701 /* Add each of the unique swizzled channels of the element.
702 * This will end up matching the size of the glsl_type of this field.
703 */
704 int last_swiz = -1;
705 for (unsigned int j = 0; j < 4; j++) {
706 int swiz = GET_SWZ(slots[i].swizzle, j);
707 last_swiz = swiz;
708
709 prog_data->param[this->uniforms * 4 + j] = &values[swiz];
710 if (swiz <= last_swiz)
711 this->uniform_vector_size[this->uniforms]++;
712 }
713 this->uniforms++;
714 }
715 }
716
717 dst_reg *
718 vec4_visitor::variable_storage(ir_variable *var)
719 {
720 return (dst_reg *)hash_table_find(this->variable_ht, var);
721 }
722
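/* Evaluate a boolean rvalue and update the flag register so that following
 * instructions can be predicated on it. *predicate is set to the predicate
 * mode the caller should use (BRW_PREDICATE_NORMAL, or one of the ALIGN16
 * ALL4H/ANY4H modes for vector comparisons).
 */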
723 void
724 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
725 {
726 ir_expression *expr = ir->as_expression();
727
728 *predicate = BRW_PREDICATE_NORMAL;
729
730 if (expr) {
731 src_reg op[2];
732 vec4_instruction *inst;
733
734 assert(expr->get_num_operands() <= 2);
735 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
736 expr->operands[i]->accept(this);
737 op[i] = this->result;
738
739 resolve_ud_negate(&op[i]);
740 }
741
742 switch (expr->operation) {
743 case ir_unop_logic_not:
744 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
745 inst->conditional_mod = BRW_CONDITIONAL_Z;
746 break;
747
748 case ir_binop_logic_xor:
749 inst = emit(XOR(dst_null_d(), op[0], op[1]));
750 inst->conditional_mod = BRW_CONDITIONAL_NZ;
751 break;
752
753 case ir_binop_logic_or:
754 inst = emit(OR(dst_null_d(), op[0], op[1]));
755 inst->conditional_mod = BRW_CONDITIONAL_NZ;
756 break;
757
758 case ir_binop_logic_and:
759 inst = emit(AND(dst_null_d(), op[0], op[1]));
760 inst->conditional_mod = BRW_CONDITIONAL_NZ;
761 break;
762
763 case ir_unop_f2b:
764 if (brw->gen >= 6) {
765 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
766 } else {
767 inst = emit(MOV(dst_null_f(), op[0]));
768 inst->conditional_mod = BRW_CONDITIONAL_NZ;
769 }
770 break;
771
772 case ir_unop_i2b:
773 if (brw->gen >= 6) {
774 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
775 } else {
776 inst = emit(MOV(dst_null_d(), op[0]));
777 inst->conditional_mod = BRW_CONDITIONAL_NZ;
778 }
779 break;
780
781 case ir_binop_all_equal:
782 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
783 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
784 break;
785
786 case ir_binop_any_nequal:
787 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
788 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
789 break;
790
791 case ir_unop_any:
792 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
793 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
794 break;
795
796 case ir_binop_greater:
797 case ir_binop_gequal:
798 case ir_binop_less:
799 case ir_binop_lequal:
800 case ir_binop_equal:
801 case ir_binop_nequal:
802 emit(CMP(dst_null_d(), op[0], op[1],
803 brw_conditional_for_comparison(expr->operation)));
804 break;
805
806 default:
807 assert(!"not reached");
808 break;
809 }
810 return;
811 }
812
813 ir->accept(this);
814
815 resolve_ud_negate(&this->result);
816
817 if (brw->gen >= 6) {
818 vec4_instruction *inst = emit(AND(dst_null_d(),
819 this->result, src_reg(1)));
820 inst->conditional_mod = BRW_CONDITIONAL_NZ;
821 } else {
822 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
823 inst->conditional_mod = BRW_CONDITIONAL_NZ;
824 }
825 }
826
827 /**
828 * Emit a gen6 IF statement with the comparison folded into the IF
829 * instruction.
830 */
831 void
832 vec4_visitor::emit_if_gen6(ir_if *ir)
833 {
834 ir_expression *expr = ir->condition->as_expression();
835
836 if (expr) {
837 src_reg op[2];
838 dst_reg temp;
839
840 assert(expr->get_num_operands() <= 2);
841 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
842 expr->operands[i]->accept(this);
843 op[i] = this->result;
844 }
845
846 switch (expr->operation) {
847 case ir_unop_logic_not:
848 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
849 return;
850
851 case ir_binop_logic_xor:
852 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
853 return;
854
855 case ir_binop_logic_or:
856 temp = dst_reg(this, glsl_type::bool_type);
857 emit(OR(temp, op[0], op[1]));
858 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
859 return;
860
861 case ir_binop_logic_and:
862 temp = dst_reg(this, glsl_type::bool_type);
863 emit(AND(temp, op[0], op[1]));
864 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
865 return;
866
867 case ir_unop_f2b:
868 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
869 return;
870
871 case ir_unop_i2b:
872 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
873 return;
874
875 case ir_binop_greater:
876 case ir_binop_gequal:
877 case ir_binop_less:
878 case ir_binop_lequal:
879 case ir_binop_equal:
880 case ir_binop_nequal:
881 emit(IF(op[0], op[1],
882 brw_conditional_for_comparison(expr->operation)));
883 return;
884
885 case ir_binop_all_equal:
886 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
887 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
888 return;
889
890 case ir_binop_any_nequal:
891 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
892 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
893 return;
894
895 case ir_unop_any:
896 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
897 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
898 return;
899
900 default:
901 assert(!"not reached");
902 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
903 return;
904 }
905 return;
906 }
907
908 ir->condition->accept(this);
909
910 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
911 }
912
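/* Return a copy of the given register with its writemask replaced. */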
913 dst_reg
914 with_writemask(dst_reg const & r, int mask)
915 {
916 dst_reg result = r;
917 result.writemask = mask;
918 return result;
919 }
920
921
922 void
923 vec4_visitor::visit(ir_variable *ir)
924 {
925 dst_reg *reg = NULL;
926
927 if (variable_storage(ir))
928 return;
929
930 switch (ir->mode) {
931 case ir_var_shader_in:
932 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
933 break;
934
935 case ir_var_shader_out:
936 reg = new(mem_ctx) dst_reg(this, ir->type);
937
938 for (int i = 0; i < type_size(ir->type); i++) {
939 output_reg[ir->location + i] = *reg;
940 output_reg[ir->location + i].reg_offset = i;
941 output_reg[ir->location + i].type =
942 brw_type_for_base_type(ir->type->get_scalar_type());
943 output_reg_annotation[ir->location + i] = ir->name;
944 }
945 break;
946
947 case ir_var_auto:
948 case ir_var_temporary:
949 reg = new(mem_ctx) dst_reg(this, ir->type);
950 break;
951
952 case ir_var_uniform:
953 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
954
955 /* Thanks to the lower_ubo_reference pass, we will see only
956 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
957 * variables, so no need for them to be in variable_ht.
958 */
959 if (ir->is_in_uniform_block())
960 return;
961
962 /* Track how big the whole uniform variable is, in case we need to put a
963 * copy of its data into pull constants for array access.
964 */
965 this->uniform_size[this->uniforms] = type_size(ir->type);
966
967 if (!strncmp(ir->name, "gl_", 3)) {
968 setup_builtin_uniform_values(ir);
969 } else {
970 setup_uniform_values(ir);
971 }
972 break;
973
974 case ir_var_system_value:
975 reg = make_reg_for_system_value(ir);
976 break;
977
978 default:
979 assert(!"not reached");
980 }
981
982 reg->type = brw_type_for_base_type(ir->type);
983 hash_table_insert(this->variable_ht, reg, ir);
984 }
985
986 void
987 vec4_visitor::visit(ir_loop *ir)
988 {
989 dst_reg counter;
990
991 /* We don't want debugging output to print the whole body of the
992 * loop as the annotation.
993 */
994 this->base_ir = NULL;
995
996 if (ir->counter != NULL) {
997 this->base_ir = ir->counter;
998 ir->counter->accept(this);
999 counter = *(variable_storage(ir->counter));
1000
1001 if (ir->from != NULL) {
1002 this->base_ir = ir->from;
1003 ir->from->accept(this);
1004
1005 emit(MOV(counter, this->result));
1006 }
1007 }
1008
1009 emit(BRW_OPCODE_DO);
1010
1011 if (ir->to) {
1012 this->base_ir = ir->to;
1013 ir->to->accept(this);
1014
1015 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1016 brw_conditional_for_comparison(ir->cmp)));
1017
1018 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1019 inst->predicate = BRW_PREDICATE_NORMAL;
1020 }
1021
1022 visit_instructions(&ir->body_instructions);
1023
1024
1025 if (ir->increment) {
1026 this->base_ir = ir->increment;
1027 ir->increment->accept(this);
1028 emit(ADD(counter, src_reg(counter), this->result));
1029 }
1030
1031 emit(BRW_OPCODE_WHILE);
1032 }
1033
1034 void
1035 vec4_visitor::visit(ir_loop_jump *ir)
1036 {
1037 switch (ir->mode) {
1038 case ir_loop_jump::jump_break:
1039 emit(BRW_OPCODE_BREAK);
1040 break;
1041 case ir_loop_jump::jump_continue:
1042 emit(BRW_OPCODE_CONTINUE);
1043 break;
1044 }
1045 }
1046
1047
1048 void
1049 vec4_visitor::visit(ir_function_signature *ir)
1050 {
1051 assert(0);
1052 (void)ir;
1053 }
1054
1055 void
1056 vec4_visitor::visit(ir_function *ir)
1057 {
1058 /* Ignore function bodies other than main() -- we shouldn't see calls to
1059 * them since they should all be inlined.
1060 */
1061 if (strcmp(ir->name, "main") == 0) {
1062 const ir_function_signature *sig;
1063 exec_list empty;
1064
1065 sig = ir->matching_signature(NULL, &empty);
1066
1067 assert(sig);
1068
1069 visit_instructions(&sig->body);
1070 }
1071 }
1072
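/* If this expression is just a saturate of some rvalue, emit a saturating
 * MOV of that value into a fresh temporary and return true; otherwise
 * return false so the normal expression path handles it.
 */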
1073 bool
1074 vec4_visitor::try_emit_sat(ir_expression *ir)
1075 {
1076 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1077 if (!sat_src)
1078 return false;
1079
1080 sat_src->accept(this);
1081 src_reg src = this->result;
1082
1083 this->result = src_reg(this, ir->type);
1084 vec4_instruction *inst;
1085 inst = emit(MOV(dst_reg(this->result), src));
1086 inst->saturate = true;
1087
1088 return true;
1089 }
1090
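/* Try to emit an add-of-a-multiply as a single MAD. mul_arg indicates which
 * operand of the add is expected to be the multiply. Returns false when the
 * hardware or types rule it out (pre-gen6, or non-float data).
 */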
1091 bool
1092 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1093 {
1094 /* 3-src instructions were introduced in gen6. */
1095 if (brw->gen < 6)
1096 return false;
1097
1098 /* MAD can only handle floating-point data. */
1099 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1100 return false;
1101
1102 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1103 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1104
1105 if (!mul || mul->operation != ir_binop_mul)
1106 return false;
1107
1108 nonmul->accept(this);
1109 src_reg src0 = fix_3src_operand(this->result);
1110
1111 mul->operands[0]->accept(this);
1112 src_reg src1 = fix_3src_operand(this->result);
1113
1114 mul->operands[1]->accept(this);
1115 src_reg src2 = fix_3src_operand(this->result);
1116
1117 this->result = src_reg(this, ir->type);
1118 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1119
1120 return true;
1121 }
1122
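/* Emit a comparison that leaves a 0/1 boolean in dst: a CMP followed by an
 * AND with 1 to squash the all-ones comparison result down to bit 0.
 */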
1123 void
1124 vec4_visitor::emit_bool_comparison(unsigned int op,
1125 dst_reg dst, src_reg src0, src_reg src1)
1126 {
1127 /* original gen4 does destination conversion before comparison. */
1128 if (brw->gen < 5)
1129 dst.type = src0.type;
1130
1131 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1132
1133 dst.type = BRW_REGISTER_TYPE_D;
1134 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1135 }
1136
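/* Emit a min or max: a single SEL with a conditional modifier on gen6+, or
 * a CMP followed by a predicated SEL on older hardware.
 */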
1137 void
1138 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1139 src_reg src0, src_reg src1)
1140 {
1141 vec4_instruction *inst;
1142
1143 if (brw->gen >= 6) {
1144 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1145 inst->conditional_mod = conditionalmod;
1146 } else {
1147 emit(CMP(dst, src0, src1, conditionalmod));
1148
1149 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1150 inst->predicate = BRW_PREDICATE_NORMAL;
1151 }
1152 }
1153
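/* Return true if the rvalue is an integer constant that fits in 16 bits,
 * which lets an integer multiply skip the MUL/MACH sequence.
 */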
1154 static bool
1155 is_16bit_constant(ir_rvalue *rvalue)
1156 {
1157 ir_constant *constant = rvalue->as_constant();
1158 if (!constant)
1159 return false;
1160
1161 if (constant->type != glsl_type::int_type &&
1162 constant->type != glsl_type::uint_type)
1163 return false;
1164
1165 return constant->value.u[0] < (1 << 16);
1166 }
1167
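/* Visit an expression: evaluate the operands into registers, then emit the
 * vec4 instruction sequence for the operation into a new temporary that
 * becomes this->result.
 */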
1168 void
1169 vec4_visitor::visit(ir_expression *ir)
1170 {
1171 unsigned int operand;
1172 src_reg op[Elements(ir->operands)];
1173 src_reg result_src;
1174 dst_reg result_dst;
1175 vec4_instruction *inst;
1176
1177 if (try_emit_sat(ir))
1178 return;
1179
1180 if (ir->operation == ir_binop_add) {
1181 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1182 return;
1183 }
1184
1185 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1186 this->result.file = BAD_FILE;
1187 ir->operands[operand]->accept(this);
1188 if (this->result.file == BAD_FILE) {
1189 printf("Failed to get tree for expression operand:\n");
1190 ir->operands[operand]->print();
1191 exit(1);
1192 }
1193 op[operand] = this->result;
1194
1195 /* Matrix expression operands should have been broken down to vector
1196 * operations already.
1197 */
1198 assert(!ir->operands[operand]->type->is_matrix());
1199 }
1200
1201 int vector_elements = ir->operands[0]->type->vector_elements;
1202 if (ir->operands[1]) {
1203 vector_elements = MAX2(vector_elements,
1204 ir->operands[1]->type->vector_elements);
1205 }
1206
1207 this->result.file = BAD_FILE;
1208
1209 /* Storage for our result. Ideally for an assignment we'd be using
1210 * the actual storage for the result here, instead.
1211 */
1212 result_src = src_reg(this, ir->type);
1213 /* convenience for the emit functions below. */
1214 result_dst = dst_reg(result_src);
1215 /* If nothing special happens, this is the result. */
1216 this->result = result_src;
1217 /* Limit writes to the channels that will be used by result_src later.
1218 * This does limit this temp's use as a temporary for multi-instruction
1219 * sequences.
1220 */
1221 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1222
1223 switch (ir->operation) {
1224 case ir_unop_logic_not:
1225 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1226 * ones complement of the whole register, not just bit 0.
1227 */
1228 emit(XOR(result_dst, op[0], src_reg(1)));
1229 break;
1230 case ir_unop_neg:
1231 op[0].negate = !op[0].negate;
1232 emit(MOV(result_dst, op[0]));
1233 break;
1234 case ir_unop_abs:
1235 op[0].abs = true;
1236 op[0].negate = false;
1237 emit(MOV(result_dst, op[0]));
1238 break;
1239
1240 case ir_unop_sign:
1241 emit(MOV(result_dst, src_reg(0.0f)));
1242
1243 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1244 inst = emit(MOV(result_dst, src_reg(1.0f)));
1245 inst->predicate = BRW_PREDICATE_NORMAL;
1246
1247 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1248 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1249 inst->predicate = BRW_PREDICATE_NORMAL;
1250
1251 break;
1252
1253 case ir_unop_rcp:
1254 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1255 break;
1256
1257 case ir_unop_exp2:
1258 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1259 break;
1260 case ir_unop_log2:
1261 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1262 break;
1263 case ir_unop_exp:
1264 case ir_unop_log:
1265 assert(!"not reached: should be handled by ir_explog_to_explog2");
1266 break;
1267 case ir_unop_sin:
1268 case ir_unop_sin_reduced:
1269 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1270 break;
1271 case ir_unop_cos:
1272 case ir_unop_cos_reduced:
1273 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1274 break;
1275
1276 case ir_unop_dFdx:
1277 case ir_unop_dFdy:
1278 assert(!"derivatives not valid in vertex shader");
1279 break;
1280
1281 case ir_unop_bitfield_reverse:
1282 emit(BFREV(result_dst, op[0]));
1283 break;
1284 case ir_unop_bit_count:
1285 emit(CBIT(result_dst, op[0]));
1286 break;
1287 case ir_unop_find_msb: {
1288 src_reg temp = src_reg(this, glsl_type::uint_type);
1289
1290 inst = emit(FBH(dst_reg(temp), op[0]));
1291 inst->dst.writemask = WRITEMASK_XYZW;
1292
1293 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1294 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1295 * subtract the result from 31 to convert the MSB count into an LSB count.
1296 */
1297
1298 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1299 temp.swizzle = BRW_SWIZZLE_NOOP;
1300 emit(MOV(result_dst, temp));
1301
1302 src_reg src_tmp = src_reg(result_dst);
1303 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1304
1305 src_tmp.negate = true;
1306 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1307 inst->predicate = BRW_PREDICATE_NORMAL;
1308 break;
1309 }
1310 case ir_unop_find_lsb:
1311 emit(FBL(result_dst, op[0]));
1312 break;
1313
1314 case ir_unop_noise:
1315 assert(!"not reached: should be handled by lower_noise");
1316 break;
1317
1318 case ir_binop_add:
1319 emit(ADD(result_dst, op[0], op[1]));
1320 break;
1321 case ir_binop_sub:
1322 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1323 break;
1324
1325 case ir_binop_mul:
1326 if (ir->type->is_integer()) {
1327 /* For integer multiplication, the MUL uses the low 16 bits of one of
1328 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1329 * adds in the contribution of the upper 16 bits of that
1330 * operand. If we can determine that one of the args is in the low
1331 * 16 bits, though, we can just emit a single MUL.
1332 */
1333 if (is_16bit_constant(ir->operands[0])) {
1334 if (brw->gen < 7)
1335 emit(MUL(result_dst, op[0], op[1]));
1336 else
1337 emit(MUL(result_dst, op[1], op[0]));
1338 } else if (is_16bit_constant(ir->operands[1])) {
1339 if (brw->gen < 7)
1340 emit(MUL(result_dst, op[1], op[0]));
1341 else
1342 emit(MUL(result_dst, op[0], op[1]));
1343 } else {
1344 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1345
1346 emit(MUL(acc, op[0], op[1]));
1347 emit(MACH(dst_null_d(), op[0], op[1]));
1348 emit(MOV(result_dst, src_reg(acc)));
1349 }
1350 } else {
1351 emit(MUL(result_dst, op[0], op[1]));
1352 }
1353 break;
1354 case ir_binop_div:
1355 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1356 assert(ir->type->is_integer());
1357 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1358 break;
1359 case ir_binop_mod:
1360 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1361 assert(ir->type->is_integer());
1362 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1363 break;
1364
1365 case ir_binop_less:
1366 case ir_binop_greater:
1367 case ir_binop_lequal:
1368 case ir_binop_gequal:
1369 case ir_binop_equal:
1370 case ir_binop_nequal: {
1371 emit(CMP(result_dst, op[0], op[1],
1372 brw_conditional_for_comparison(ir->operation)));
1373 emit(AND(result_dst, result_src, src_reg(0x1)));
1374 break;
1375 }
1376
1377 case ir_binop_all_equal:
1378 /* "==" operator producing a scalar boolean. */
1379 if (ir->operands[0]->type->is_vector() ||
1380 ir->operands[1]->type->is_vector()) {
1381 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1382 emit(MOV(result_dst, src_reg(0)));
1383 inst = emit(MOV(result_dst, src_reg(1)));
1384 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1385 } else {
1386 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1387 emit(AND(result_dst, result_src, src_reg(0x1)));
1388 }
1389 break;
1390 case ir_binop_any_nequal:
1391 /* "!=" operator producing a scalar boolean. */
1392 if (ir->operands[0]->type->is_vector() ||
1393 ir->operands[1]->type->is_vector()) {
1394 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1395
1396 emit(MOV(result_dst, src_reg(0)));
1397 inst = emit(MOV(result_dst, src_reg(1)));
1398 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1399 } else {
1400 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1401 emit(AND(result_dst, result_src, src_reg(0x1)));
1402 }
1403 break;
1404
1405 case ir_unop_any:
1406 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1407 emit(MOV(result_dst, src_reg(0)));
1408
1409 inst = emit(MOV(result_dst, src_reg(1)));
1410 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1411 break;
1412
1413 case ir_binop_logic_xor:
1414 emit(XOR(result_dst, op[0], op[1]));
1415 break;
1416
1417 case ir_binop_logic_or:
1418 emit(OR(result_dst, op[0], op[1]));
1419 break;
1420
1421 case ir_binop_logic_and:
1422 emit(AND(result_dst, op[0], op[1]));
1423 break;
1424
1425 case ir_binop_dot:
1426 assert(ir->operands[0]->type->is_vector());
1427 assert(ir->operands[0]->type == ir->operands[1]->type);
1428 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1429 break;
1430
1431 case ir_unop_sqrt:
1432 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1433 break;
1434 case ir_unop_rsq:
1435 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1436 break;
1437
1438 case ir_unop_bitcast_i2f:
1439 case ir_unop_bitcast_u2f:
1440 this->result = op[0];
1441 this->result.type = BRW_REGISTER_TYPE_F;
1442 break;
1443
1444 case ir_unop_bitcast_f2i:
1445 this->result = op[0];
1446 this->result.type = BRW_REGISTER_TYPE_D;
1447 break;
1448
1449 case ir_unop_bitcast_f2u:
1450 this->result = op[0];
1451 this->result.type = BRW_REGISTER_TYPE_UD;
1452 break;
1453
1454 case ir_unop_i2f:
1455 case ir_unop_i2u:
1456 case ir_unop_u2i:
1457 case ir_unop_u2f:
1458 case ir_unop_b2f:
1459 case ir_unop_b2i:
1460 case ir_unop_f2i:
1461 case ir_unop_f2u:
1462 emit(MOV(result_dst, op[0]));
1463 break;
1464 case ir_unop_f2b:
1465 case ir_unop_i2b: {
1466 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1467 emit(AND(result_dst, result_src, src_reg(1)));
1468 break;
1469 }
1470
1471 case ir_unop_trunc:
1472 emit(RNDZ(result_dst, op[0]));
1473 break;
1474 case ir_unop_ceil:
1475 op[0].negate = !op[0].negate;
1476 inst = emit(RNDD(result_dst, op[0]));
1477 this->result.negate = true;
1478 break;
1479 case ir_unop_floor:
1480 inst = emit(RNDD(result_dst, op[0]));
1481 break;
1482 case ir_unop_fract:
1483 inst = emit(FRC(result_dst, op[0]));
1484 break;
1485 case ir_unop_round_even:
1486 emit(RNDE(result_dst, op[0]));
1487 break;
1488
1489 case ir_binop_min:
1490 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1491 break;
1492 case ir_binop_max:
1493 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1494 break;
1495
1496 case ir_binop_pow:
1497 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1498 break;
1499
1500 case ir_unop_bit_not:
1501 inst = emit(NOT(result_dst, op[0]));
1502 break;
1503 case ir_binop_bit_and:
1504 inst = emit(AND(result_dst, op[0], op[1]));
1505 break;
1506 case ir_binop_bit_xor:
1507 inst = emit(XOR(result_dst, op[0], op[1]));
1508 break;
1509 case ir_binop_bit_or:
1510 inst = emit(OR(result_dst, op[0], op[1]));
1511 break;
1512
1513 case ir_binop_lshift:
1514 inst = emit(SHL(result_dst, op[0], op[1]));
1515 break;
1516
1517 case ir_binop_rshift:
1518 if (ir->type->base_type == GLSL_TYPE_INT)
1519 inst = emit(ASR(result_dst, op[0], op[1]));
1520 else
1521 inst = emit(SHR(result_dst, op[0], op[1]));
1522 break;
1523
1524 case ir_binop_bfm:
1525 emit(BFI1(result_dst, op[0], op[1]));
1526 break;
1527
1528 case ir_binop_ubo_load: {
1529 ir_constant *uniform_block = ir->operands[0]->as_constant();
1530 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1531 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1532 src_reg offset = op[1];
1533
1534 /* Now, load the vector from that offset. */
1535 assert(ir->type->is_vector() || ir->type->is_scalar());
1536
1537 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1538 packed_consts.type = result.type;
1539 src_reg surf_index =
1540 src_reg(SURF_INDEX_VEC4_UBO(uniform_block->value.u[0]));
1541 if (const_offset_ir) {
1542 offset = src_reg(const_offset / 16);
1543 } else {
1544 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1545 }
1546
1547 vec4_instruction *pull =
1548 emit(new(mem_ctx) vec4_instruction(this,
1549 VS_OPCODE_PULL_CONSTANT_LOAD,
1550 dst_reg(packed_consts),
1551 surf_index,
1552 offset));
1553 pull->base_mrf = 14;
1554 pull->mlen = 1;
1555
1556 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1557 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1558 const_offset % 16 / 4,
1559 const_offset % 16 / 4,
1560 const_offset % 16 / 4);
1561
1562 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1563 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1564 emit(CMP(result_dst, packed_consts, src_reg(0u),
1565 BRW_CONDITIONAL_NZ));
1566 emit(AND(result_dst, result, src_reg(0x1)));
1567 } else {
1568 emit(MOV(result_dst, packed_consts));
1569 }
1570 break;
1571 }
1572
1573 case ir_binop_vector_extract:
1574 assert(!"should have been lowered by vec_index_to_cond_assign");
1575 break;
1576
1577 case ir_triop_fma:
1578 op[0] = fix_3src_operand(op[0]);
1579 op[1] = fix_3src_operand(op[1]);
1580 op[2] = fix_3src_operand(op[2]);
1581 /* Note that the instruction's argument order is reversed from GLSL
1582 * and the IR.
1583 */
1584 emit(MAD(result_dst, op[2], op[1], op[0]));
1585 break;
1586
1587 case ir_triop_lrp:
1588 op[0] = fix_3src_operand(op[0]);
1589 op[1] = fix_3src_operand(op[1]);
1590 op[2] = fix_3src_operand(op[2]);
1591 /* Note that the instruction's argument order is reversed from GLSL
1592 * and the IR.
1593 */
1594 emit(LRP(result_dst, op[2], op[1], op[0]));
1595 break;
1596
1597 case ir_triop_csel:
1598 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1599 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1600 inst->predicate = BRW_PREDICATE_NORMAL;
1601 break;
1602
1603 case ir_triop_bfi:
1604 op[0] = fix_3src_operand(op[0]);
1605 op[1] = fix_3src_operand(op[1]);
1606 op[2] = fix_3src_operand(op[2]);
1607 emit(BFI2(result_dst, op[0], op[1], op[2]));
1608 break;
1609
1610 case ir_triop_bitfield_extract:
1611 op[0] = fix_3src_operand(op[0]);
1612 op[1] = fix_3src_operand(op[1]);
1613 op[2] = fix_3src_operand(op[2]);
1614 /* Note that the instruction's argument order is reversed from GLSL
1615 * and the IR.
1616 */
1617 emit(BFE(result_dst, op[2], op[1], op[0]));
1618 break;
1619
1620 case ir_triop_vector_insert:
1621 assert(!"should have been lowered by lower_vector_insert");
1622 break;
1623
1624 case ir_quadop_bitfield_insert:
1625 assert(!"not reached: should be handled by "
1626 "bitfield_insert_to_bfm_bfi\n");
1627 break;
1628
1629 case ir_quadop_vector:
1630 assert(!"not reached: should be handled by lower_quadop_vector");
1631 break;
1632
1633 case ir_unop_pack_half_2x16:
1634 emit_pack_half_2x16(result_dst, op[0]);
1635 break;
1636 case ir_unop_unpack_half_2x16:
1637 emit_unpack_half_2x16(result_dst, op[0]);
1638 break;
1639 case ir_unop_pack_snorm_2x16:
1640 case ir_unop_pack_snorm_4x8:
1641 case ir_unop_pack_unorm_2x16:
1642 case ir_unop_pack_unorm_4x8:
1643 case ir_unop_unpack_snorm_2x16:
1644 case ir_unop_unpack_snorm_4x8:
1645 case ir_unop_unpack_unorm_2x16:
1646 case ir_unop_unpack_unorm_4x8:
1647 assert(!"not reached: should be handled by lower_packing_builtins");
1648 break;
1649 case ir_unop_unpack_half_2x16_split_x:
1650 case ir_unop_unpack_half_2x16_split_y:
1651 case ir_binop_pack_half_2x16_split:
1652 assert(!"not reached: should not occur in vertex shader");
1653 break;
1654 }
1655 }
1656
1657
1658 void
1659 vec4_visitor::visit(ir_swizzle *ir)
1660 {
1661 src_reg src;
1662 int i = 0;
1663 int swizzle[4];
1664
1665 /* Note that this is only swizzles in expressions, not those on the left
1666 * hand side of an assignment, which do write masking. See ir_assignment
1667 * for that.
1668 */
1669
1670 ir->val->accept(this);
1671 src = this->result;
1672 assert(src.file != BAD_FILE);
1673
1674 for (i = 0; i < ir->type->vector_elements; i++) {
1675 switch (i) {
1676 case 0:
1677 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1678 break;
1679 case 1:
1680 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1681 break;
1682 case 2:
1683 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1684 break;
1685 case 3:
1686 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1687 break;
1688 }
1689 }
1690 for (; i < 4; i++) {
1691 /* Replicate the last channel out. */
1692 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1693 }
1694
1695 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1696
1697 this->result = src;
1698 }
1699
1700 void
1701 vec4_visitor::visit(ir_dereference_variable *ir)
1702 {
1703 const struct glsl_type *type = ir->type;
1704 dst_reg *reg = variable_storage(ir->var);
1705
1706 if (!reg) {
1707 fail("Failed to find variable storage for %s\n", ir->var->name);
1708 this->result = src_reg(brw_null_reg());
1709 return;
1710 }
1711
1712 this->result = src_reg(*reg);
1713
1714 /* System values get their swizzle from the dst_reg writemask */
1715 if (ir->var->mode == ir_var_system_value)
1716 return;
1717
1718 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1719 this->result.swizzle = swizzle_for_size(type->vector_elements);
1720 }
1721
1722
1723 int
1724 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1725 {
1726 /* Under normal circumstances array elements are stored consecutively, so
1727 * the stride is equal to the size of the array element.
1728 */
1729 return type_size(ir->type);
1730 }
1731
1732
1733 void
1734 vec4_visitor::visit(ir_dereference_array *ir)
1735 {
1736 ir_constant *constant_index;
1737 src_reg src;
1738 int array_stride = compute_array_stride(ir);
1739
1740 constant_index = ir->array_index->constant_expression_value();
1741
1742 ir->array->accept(this);
1743 src = this->result;
1744
1745 if (constant_index) {
1746 src.reg_offset += constant_index->value.i[0] * array_stride;
1747 } else {
1748 /* Variable index array dereference. It eats the "vec4" of the
1749 * base of the array and an index that offsets the Mesa register
1750 * index.
1751 */
1752 ir->array_index->accept(this);
1753
1754 src_reg index_reg;
1755
1756 if (array_stride == 1) {
1757 index_reg = this->result;
1758 } else {
1759 index_reg = src_reg(this, glsl_type::int_type);
1760
1761 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1762 }
1763
1764 if (src.reladdr) {
1765 src_reg temp = src_reg(this, glsl_type::int_type);
1766
1767 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1768
1769 index_reg = temp;
1770 }
1771
1772 src.reladdr = ralloc(mem_ctx, src_reg);
1773 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1774 }
1775
1776 /* If the type is smaller than a vec4, replicate the last channel out. */
1777 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1778 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1779 else
1780 src.swizzle = BRW_SWIZZLE_NOOP;
1781 src.type = brw_type_for_base_type(ir->type);
1782
1783 this->result = src;
1784 }
1785
1786 void
1787 vec4_visitor::visit(ir_dereference_record *ir)
1788 {
1789 unsigned int i;
1790 const glsl_type *struct_type = ir->record->type;
1791 int offset = 0;
1792
1793 ir->record->accept(this);
1794
1795 for (i = 0; i < struct_type->length; i++) {
1796 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1797 break;
1798 offset += type_size(struct_type->fields.structure[i].type);
1799 }
1800
1801 /* If the type is smaller than a vec4, replicate the last channel out. */
1802 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1803 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1804 else
1805 this->result.swizzle = BRW_SWIZZLE_NOOP;
1806 this->result.type = brw_type_for_base_type(ir->type);
1807
1808 this->result.reg_offset += offset;
1809 }
1810
1811 /**
1812 * We want to be careful in assignment setup to hit the actual storage
1813 * instead of potentially using a temporary like we might with the
1814 * ir_dereference handler.
1815 */
1816 static dst_reg
1817 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1818 {
1819 /* The LHS must be a dereference. If the LHS is a variable indexed array
1820 * access of a vector, it must be separated into a series conditional moves
1821 * before reaching this point (see ir_vec_index_to_cond_assign).
1822 */
1823 assert(ir->as_dereference());
1824 ir_dereference_array *deref_array = ir->as_dereference_array();
1825 if (deref_array) {
1826 assert(!deref_array->array->type->is_vector());
1827 }
1828
1829 /* Use the rvalue deref handler for the most part. We'll ignore
1830 * swizzles in it and write swizzles using writemask, though.
1831 */
1832 ir->accept(v);
1833 return dst_reg(v->result);
1834 }
1835
1836 void
1837 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1838 const struct glsl_type *type, uint32_t predicate)
1839 {
1840 if (type->base_type == GLSL_TYPE_STRUCT) {
1841 for (unsigned int i = 0; i < type->length; i++) {
1842 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1843 }
1844 return;
1845 }
1846
1847 if (type->is_array()) {
1848 for (unsigned int i = 0; i < type->length; i++) {
1849 emit_block_move(dst, src, type->fields.array, predicate);
1850 }
1851 return;
1852 }
1853
1854 if (type->is_matrix()) {
1855 const struct glsl_type *vec_type;
1856
1857 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1858 type->vector_elements, 1);
1859
1860 for (int i = 0; i < type->matrix_columns; i++) {
1861 emit_block_move(dst, src, vec_type, predicate);
1862 }
1863 return;
1864 }
1865
1866 assert(type->is_scalar() || type->is_vector());
1867
1868 dst->type = brw_type_for_base_type(type);
1869 src->type = dst->type;
1870
1871 dst->writemask = (1 << type->vector_elements) - 1;
1872
1873 src->swizzle = swizzle_for_size(type->vector_elements);
1874
1875 vec4_instruction *inst = emit(MOV(*dst, *src));
1876 inst->predicate = predicate;
1877
1878 dst->reg_offset++;
1879 src->reg_offset++;
1880 }
1881
1882
1883 /* If the RHS processing resulted in an instruction generating a
1884 * temporary value, and it would be easy to rewrite the instruction to
1885 * generate its result right into the LHS instead, do so. This ends
1886 * up reliably removing instructions where it can be tricky to do so
1887 * later without real UD chain information.
1888 */
1889 bool
1890 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1891 dst_reg dst,
1892 src_reg src,
1893 vec4_instruction *pre_rhs_inst,
1894 vec4_instruction *last_rhs_inst)
1895 {
1896 /* This could be supported, but it would take more smarts. */
1897 if (ir->condition)
1898 return false;
1899
1900 if (pre_rhs_inst == last_rhs_inst)
1901 return false; /* No instructions generated to work with. */
1902
1903 /* Make sure the last instruction generated our source reg. */
1904 if (src.file != GRF ||
1905 src.file != last_rhs_inst->dst.file ||
1906 src.reg != last_rhs_inst->dst.reg ||
1907 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1908 src.reladdr ||
1909 src.abs ||
1910 src.negate ||
1911 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1912 return false;
1913
1914 /* Check that the last instruction fully initialized the channels
1915 * we want to use, in the order we want to use them. We could
1916 * potentially reswizzle the operands of many instructions so that
1917 * we could handle out of order channels, but don't yet.
1918 */
1919
1920 for (unsigned i = 0; i < 4; i++) {
1921 if (dst.writemask & (1 << i)) {
1922 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1923 return false;
1924
1925 if (BRW_GET_SWZ(src.swizzle, i) != i)
1926 return false;
1927 }
1928 }
1929
1930 /* Success! Rewrite the instruction. */
1931 last_rhs_inst->dst.file = dst.file;
1932 last_rhs_inst->dst.reg = dst.reg;
1933 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1934 last_rhs_inst->dst.reladdr = dst.reladdr;
1935 last_rhs_inst->dst.writemask &= dst.writemask;
1936
1937 return true;
1938 }
1939
1940 void
1941 vec4_visitor::visit(ir_assignment *ir)
1942 {
1943 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1944 uint32_t predicate = BRW_PREDICATE_NONE;
1945
1946 if (!ir->lhs->type->is_scalar() &&
1947 !ir->lhs->type->is_vector()) {
1948 ir->rhs->accept(this);
1949 src_reg src = this->result;
1950
1951 if (ir->condition) {
1952 emit_bool_to_cond_code(ir->condition, &predicate);
1953 }
1954
1955 /* emit_block_move doesn't account for swizzles in the source register.
1956 * This should be ok, since the source register is a structure or an
1957 * array, and those can't be swizzled. But double-check to be sure.
1958 */
1959 assert(src.swizzle ==
1960 (ir->rhs->type->is_matrix()
1961 ? swizzle_for_size(ir->rhs->type->vector_elements)
1962 : BRW_SWIZZLE_NOOP));
1963
1964 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1965 return;
1966 }
1967
1968 /* Now we're down to just a scalar/vector with writemasks. */
1969 int i;
1970
1971 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1972 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1973
1974 ir->rhs->accept(this);
1975
1976 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1977
1978 src_reg src = this->result;
1979
1980 int swizzles[4];
1981 int first_enabled_chan = 0;
1982 int src_chan = 0;
1983
1984 assert(ir->lhs->type->is_vector() ||
1985 ir->lhs->type->is_scalar());
1986 dst.writemask = ir->write_mask;
1987
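/* Find the RHS swizzle component feeding the first enabled destination
 * channel; it is used below to pad the swizzle slots of channels that
 * aren't being written.
 */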
1988 for (int i = 0; i < 4; i++) {
1989 if (dst.writemask & (1 << i)) {
1990 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1991 break;
1992 }
1993 }
1994
1995 /* Swizzle a small RHS vector into the channels being written.
1996 *
1997 * GLSL IR treats write_mask as dictating how many channels are
1998 * present on the RHS, while in our instructions we need those
1999 * channels to appear in the slots of the vec4 they're written to.
2000 */
2001 for (int i = 0; i < 4; i++) {
2002 if (dst.writemask & (1 << i))
2003 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2004 else
2005 swizzles[i] = first_enabled_chan;
2006 }
2007 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2008 swizzles[2], swizzles[3]);
2009
2010 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2011 return;
2012 }
2013
2014 if (ir->condition) {
2015 emit_bool_to_cond_code(ir->condition, &predicate);
2016 }
2017
2018 for (i = 0; i < type_size(ir->lhs->type); i++) {
2019 vec4_instruction *inst = emit(MOV(dst, src));
2020 inst->predicate = predicate;
2021
2022 dst.reg_offset++;
2023 src.reg_offset++;
2024 }
2025 }
2026
2027 void
2028 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2029 {
2030 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2031 foreach_list(node, &ir->components) {
2032 ir_constant *field_value = (ir_constant *)node;
2033
2034 emit_constant_values(dst, field_value);
2035 }
2036 return;
2037 }
2038
2039 if (ir->type->is_array()) {
2040 for (unsigned int i = 0; i < ir->type->length; i++) {
2041 emit_constant_values(dst, ir->array_elements[i]);
2042 }
2043 return;
2044 }
2045
2046 if (ir->type->is_matrix()) {
2047 for (int i = 0; i < ir->type->matrix_columns; i++) {
2048 float *vec = &ir->value.f[i * ir->type->vector_elements];
2049
2050 for (int j = 0; j < ir->type->vector_elements; j++) {
2051 dst->writemask = 1 << j;
2052 dst->type = BRW_REGISTER_TYPE_F;
2053
2054 emit(MOV(*dst, src_reg(vec[j])));
2055 }
2056 dst->reg_offset++;
2057 }
2058 return;
2059 }
2060
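/* Emit at most one MOV per distinct scalar value: channels whose
 * constants compare equal are folded into a shared writemask below.
 */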
2061 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2062
2063 for (int i = 0; i < ir->type->vector_elements; i++) {
2064 if (!(remaining_writemask & (1 << i)))
2065 continue;
2066
2067 dst->writemask = 1 << i;
2068 dst->type = brw_type_for_base_type(ir->type);
2069
2070 /* Find other components that match the one we're about to
2071 * write. Emits fewer instructions for things like vec4(0.5,
2072 * 1.5, 1.5, 1.5).
2073 */
2074 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2075 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2076 if (ir->value.b[i] == ir->value.b[j])
2077 dst->writemask |= (1 << j);
2078 } else {
2079 /* u, i, and f storage all line up, so no need for a
2080 * switch case for comparing each type.
2081 */
2082 if (ir->value.u[i] == ir->value.u[j])
2083 dst->writemask |= (1 << j);
2084 }
2085 }
2086
2087 switch (ir->type->base_type) {
2088 case GLSL_TYPE_FLOAT:
2089 emit(MOV(*dst, src_reg(ir->value.f[i])));
2090 break;
2091 case GLSL_TYPE_INT:
2092 emit(MOV(*dst, src_reg(ir->value.i[i])));
2093 break;
2094 case GLSL_TYPE_UINT:
2095 emit(MOV(*dst, src_reg(ir->value.u[i])));
2096 break;
2097 case GLSL_TYPE_BOOL:
2098 emit(MOV(*dst, src_reg(ir->value.b[i])));
2099 break;
2100 default:
2101 assert(!"Non-float/uint/int/bool constant");
2102 break;
2103 }
2104
2105 remaining_writemask &= ~dst->writemask;
2106 }
2107 dst->reg_offset++;
2108 }
2109
2110 void
2111 vec4_visitor::visit(ir_constant *ir)
2112 {
2113 dst_reg dst = dst_reg(this, ir->type);
2114 this->result = src_reg(dst);
2115
2116 emit_constant_values(&dst, ir);
2117 }
2118
2119 void
2120 vec4_visitor::visit(ir_call *ir)
2121 {
2122 assert(!"not reached");
2123 }
2124
2125 void
2126 vec4_visitor::visit(ir_texture *ir)
2127 {
2128 int sampler =
2129 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2130
2131 /* Should be lowered by do_lower_texture_projection */
2132 assert(!ir->projector);
2133
2134 /* Generate code to compute all the subexpression trees. This has to be
2135 * done before loading any values into MRFs for the sampler message since
2136 * generating these values may involve SEND messages that need the MRFs.
2137 */
2138 src_reg coordinate;
2139 if (ir->coordinate) {
2140 ir->coordinate->accept(this);
2141 coordinate = this->result;
2142 }
2143
2144 src_reg shadow_comparitor;
2145 if (ir->shadow_comparitor) {
2146 ir->shadow_comparitor->accept(this);
2147 shadow_comparitor = this->result;
2148 }
2149
2150 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2151 src_reg lod, dPdx, dPdy, sample_index;
2152 switch (ir->op) {
2153 case ir_tex:
2154 lod = src_reg(0.0f);
2155 lod_type = glsl_type::float_type;
2156 break;
2157 case ir_txf:
2158 case ir_txl:
2159 case ir_txs:
2160 ir->lod_info.lod->accept(this);
2161 lod = this->result;
2162 lod_type = ir->lod_info.lod->type;
2163 break;
2164 case ir_txf_ms:
2165 ir->lod_info.sample_index->accept(this);
2166 sample_index = this->result;
2167 sample_index_type = ir->lod_info.sample_index->type;
2168 break;
2169 case ir_txd:
2170 ir->lod_info.grad.dPdx->accept(this);
2171 dPdx = this->result;
2172
2173 ir->lod_info.grad.dPdy->accept(this);
2174 dPdy = this->result;
2175
2176 lod_type = ir->lod_info.grad.dPdx->type;
2177 break;
2178 case ir_txb:
2179 case ir_lod:
2180 break;
2181 }
2182
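/* Select the sampler message opcode. ir_tex maps to TXL with the
 * explicit LOD of 0 set up above, since vertex shaders have no
 * implicit derivatives to drive an ordinary sample message.
 */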
2183 vec4_instruction *inst = NULL;
2184 switch (ir->op) {
2185 case ir_tex:
2186 case ir_txl:
2187 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2188 break;
2189 case ir_txd:
2190 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2191 break;
2192 case ir_txf:
2193 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2194 break;
2195 case ir_txf_ms:
2196 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2197 break;
2198 case ir_txs:
2199 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2200 break;
2201 case ir_txb:
2202 assert(!"TXB is not valid for vertex shaders.");
2203 break;
2204 case ir_lod:
2205 assert(!"LOD is not valid for vertex shaders.");
2206 break;
2207 }
2208
2209 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2210
2211 /* Texel offsets go in the message header; Gen4 also requires headers. */
2212 inst->header_present = use_texture_offset || brw->gen < 5;
2213 inst->base_mrf = 2;
2214 inst->mlen = inst->header_present + 1; /* always at least one */
2215 inst->sampler = sampler;
2216 inst->dst = dst_reg(this, ir->type);
2217 inst->dst.writemask = WRITEMASK_XYZW;
2218 inst->shadow_compare = ir->shadow_comparitor != NULL;
2219
2220 if (use_texture_offset)
2221 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2222
2223 /* MRF for the first parameter */
2224 int param_base = inst->base_mrf + inst->header_present;
2225
2226 if (ir->op == ir_txs) {
2227 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2228 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2229 } else {
2230 /* Load the coordinate */
2231 /* FINISHME: gl_clamp_mask and saturate */
2232 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2233 int zero_mask = 0xf & ~coord_mask;
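/* Coordinate channels the type doesn't supply are zero-filled below. */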
2234
2235 if (ir->offset && ir->op == ir_txf) {
2236 /* It appears that the ld instruction used for txf does its
2237 * address bounds check before adding in the offset. To work
2238 * around this, just add the integer offset to the integer
2239 * texel coordinate, and don't put the offset in the header.
2240 */
2241 ir_constant *offset = ir->offset->as_constant();
2242 assert(offset);
2243
2244 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2245 src_reg src = coordinate;
2246 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2247 BRW_GET_SWZ(src.swizzle, j),
2248 BRW_GET_SWZ(src.swizzle, j),
2249 BRW_GET_SWZ(src.swizzle, j));
2250 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2251 src, offset->value.i[j]));
2252 }
2253 } else {
2254 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2255 coordinate));
2256 }
2257 if (zero_mask != 0) {
2258 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2259 src_reg(0)));
2260 }
2261 /* Load the shadow comparitor */
2262 if (ir->shadow_comparitor && ir->op != ir_txd) {
2263 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2264 WRITEMASK_X),
2265 shadow_comparitor));
2266 inst->mlen++;
2267 }
2268
2269 /* Load the LOD info */
2270 if (ir->op == ir_tex || ir->op == ir_txl) {
2271 int mrf, writemask;
2272 if (brw->gen >= 5) {
2273 mrf = param_base + 1;
2274 if (ir->shadow_comparitor) {
2275 writemask = WRITEMASK_Y;
2276 /* mlen already incremented */
2277 } else {
2278 writemask = WRITEMASK_X;
2279 inst->mlen++;
2280 }
2281 } else /* brw->gen == 4 */ {
2282 mrf = param_base;
2283 writemask = WRITEMASK_W;
2284 }
2285 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2286 } else if (ir->op == ir_txf) {
2287 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2288 } else if (ir->op == ir_txf_ms) {
2289 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2290 sample_index));
2291 inst->mlen++;
2292
2293 /* On Gen7, there is an additional MCS parameter here after the sample
2294 * index, but we don't bother to emit it since it's always zero. If we
2295 * start supporting texturing from CMS surfaces, this will have to
2296 * change.
2297 */
2298 } else if (ir->op == ir_txd) {
2299 const glsl_type *type = lod_type;
2300
2301 if (brw->gen >= 5) {
2302 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2303 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2304 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2305 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2306 inst->mlen++;
2307
2308 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2309 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2310 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2311 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2312 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2313 inst->mlen++;
2314
2315 if (ir->shadow_comparitor) {
2316 emit(MOV(dst_reg(MRF, param_base + 2,
2317 ir->shadow_comparitor->type, WRITEMASK_Z),
2318 shadow_comparitor));
2319 }
2320 }
2321 } else /* brw->gen == 4 */ {
2322 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2323 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2324 inst->mlen += 2;
2325 }
2326 }
2327 }
2328
2329 emit(inst);
2330
2331 /* Fix up the number of layers (Z component) for cube arrays: the
2332 * hardware returns faces * layers, but the spec requires just layers.
2333 */
2334 if (ir->op == ir_txs) {
2335 glsl_type const *type = ir->sampler->type;
2336 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2337 type->sampler_array) {
2338 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2339 with_writemask(inst->dst, WRITEMASK_Z),
2340 src_reg(inst->dst), src_reg(6));
2341 }
2342 }
2343
2344 swizzle_result(ir, src_reg(inst->dst), sampler);
2345 }
2346
2347 void
2348 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2349 {
2350 int s = key->tex.swizzles[sampler];
2351
2352 this->result = src_reg(this, ir->type);
2353 dst_reg swizzled_result(this->result);
2354
2355 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2356 || s == SWIZZLE_NOOP) {
2357 emit(MOV(swizzled_result, orig_val));
2358 return;
2359 }
2360
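/* Split the GL texture swizzle into channels copied from the sampler
 * result (copy_mask), forced to 0.0 (zero_mask), and forced to 1.0
 * (one_mask); each group gets its own MOV below.
 */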
2361 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2362 int swizzle[4] = {0};
2363
2364 for (int i = 0; i < 4; i++) {
2365 switch (GET_SWZ(s, i)) {
2366 case SWIZZLE_ZERO:
2367 zero_mask |= (1 << i);
2368 break;
2369 case SWIZZLE_ONE:
2370 one_mask |= (1 << i);
2371 break;
2372 default:
2373 copy_mask |= (1 << i);
2374 swizzle[i] = GET_SWZ(s, i);
2375 break;
2376 }
2377 }
2378
2379 if (copy_mask) {
2380 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2381 swizzled_result.writemask = copy_mask;
2382 emit(MOV(swizzled_result, orig_val));
2383 }
2384
2385 if (zero_mask) {
2386 swizzled_result.writemask = zero_mask;
2387 emit(MOV(swizzled_result, src_reg(0.0f)));
2388 }
2389
2390 if (one_mask) {
2391 swizzled_result.writemask = one_mask;
2392 emit(MOV(swizzled_result, src_reg(1.0f)));
2393 }
2394 }
2395
2396 void
2397 vec4_visitor::visit(ir_return *ir)
2398 {
2399 assert(!"not reached");
2400 }
2401
2402 void
2403 vec4_visitor::visit(ir_discard *ir)
2404 {
2405 assert(!"not reached");
2406 }
2407
2408 void
2409 vec4_visitor::visit(ir_if *ir)
2410 {
2411 /* Don't point the annotation at the if statement, because then the
2412 * whole statement, including the then and else blocks, gets printed.
2413 */
2414 this->base_ir = ir->condition;
2415
2416 if (brw->gen == 6) {
2417 emit_if_gen6(ir);
2418 } else {
2419 uint32_t predicate;
2420 emit_bool_to_cond_code(ir->condition, &predicate);
2421 emit(IF(predicate));
2422 }
2423
2424 visit_instructions(&ir->then_instructions);
2425
2426 if (!ir->else_instructions.is_empty()) {
2427 this->base_ir = ir->condition;
2428 emit(BRW_OPCODE_ELSE);
2429
2430 visit_instructions(&ir->else_instructions);
2431 }
2432
2433 this->base_ir = ir->condition;
2434 emit(BRW_OPCODE_ENDIF);
2435 }
2436
2437 void
2438 vec4_visitor::visit(ir_emit_vertex *)
2439 {
2440 assert(!"not reached");
2441 }
2442
2443 void
2444 vec4_visitor::visit(ir_end_primitive *)
2445 {
2446 assert(!"not reached");
2447 }
2448
2449 void
2450 vec4_visitor::emit_ndc_computation()
2451 {
2452 /* Get the position */
2453 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2454
2455 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2456 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2457 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2458
2459 current_annotation = "NDC";
2460 dst_reg ndc_w = ndc;
2461 ndc_w.writemask = WRITEMASK_W;
2462 src_reg pos_w = pos;
2463 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2464 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2465
2466 dst_reg ndc_xyz = ndc;
2467 ndc_xyz.writemask = WRITEMASK_XYZ;
2468
2469 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2470 }
2471
2472 void
2473 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2474 {
2475 if (brw->gen < 6 &&
2476 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2477 key->userclip_active || brw->has_negative_rhw_bug)) {
2478 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2479 dst_reg header1_w = header1;
2480 header1_w.writemask = WRITEMASK_W;
2481
2482 emit(MOV(header1, 0u));
2483
2484 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2485 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2486
2487 current_annotation = "Point size";
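/* Scale the point size by 2^11 and mask it to the 11-bit field at
 * bits 8..18, where the fixed-point point width lives in this header
 * word.
 */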
2488 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2489 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2490 }
2491
2492 if (key->userclip_active) {
2493 current_annotation = "Clipping flags";
2494 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2495 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2496
2497 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2498 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2499 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2500
2501 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2502 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2503 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2504 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2505 }
2506
2507 /* i965 clipping workaround:
2508 * 1) Test for a negative RHW.
2509 * 2) If it is negative:
2510 * set ndc = (0,0,0,0)
2511 * set ucp[6] = 1
2512 *
2513 * Later, clipping will detect ucp[6] and ensure the primitive is
2514 * clipped against all fixed planes.
2515 */
2516 if (brw->has_negative_rhw_bug) {
2517 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2518 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2519 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2520 vec4_instruction *inst;
2521 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2522 inst->predicate = BRW_PREDICATE_NORMAL;
2523 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2524 inst->predicate = BRW_PREDICATE_NORMAL;
2525 }
2526
2527 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2528 } else if (brw->gen < 6) {
2529 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2530 } else {
2531 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2532 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2533 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2534 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2535 }
2536 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2537 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2538 src_reg(output_reg[VARYING_SLOT_LAYER])));
2539 }
2540 }
2541 }
2542
2543 void
2544 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2545 {
2546 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2547 *
2548 * "If a linked set of shaders forming the vertex stage contains no
2549 * static write to gl_ClipVertex or gl_ClipDistance, but the
2550 * application has requested clipping against user clip planes through
2551 * the API, then the coordinate written to gl_Position is used for
2552 * comparison against the user clip planes."
2553 *
2554 * This function is only called if the shader didn't write to
2555 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2556 * if the user wrote to it; otherwise we use gl_Position.
2557 */
2558 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2559 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2560 clip_vertex = VARYING_SLOT_POS;
2561 }
2562
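/* Each enabled user clip plane gets one DP4 of the clip vertex against
 * the plane equation, written to a single channel of the output slot.
 */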
2563 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2564 ++i) {
2565 reg.writemask = 1 << i;
2566 emit(DP4(reg,
2567 src_reg(output_reg[clip_vertex]),
2568 src_reg(this->userplane[i + offset])));
2569 }
2570 }
2571
2572 void
2573 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2574 {
2575 assert (varying < VARYING_SLOT_MAX);
2576 reg.type = output_reg[varying].type;
2577 current_annotation = output_reg_annotation[varying];
2578 /* Copy the register, saturating if necessary */
2579 vec4_instruction *inst = emit(MOV(reg,
2580 src_reg(output_reg[varying])));
2581 if ((varying == VARYING_SLOT_COL0 ||
2582 varying == VARYING_SLOT_COL1 ||
2583 varying == VARYING_SLOT_BFC0 ||
2584 varying == VARYING_SLOT_BFC1) &&
2585 key->clamp_vertex_color) {
2586 inst->saturate = true;
2587 }
2588 }
2589
2590 void
2591 vec4_visitor::emit_urb_slot(int mrf, int varying)
2592 {
2593 struct brw_reg hw_reg = brw_message_reg(mrf);
2594 dst_reg reg = dst_reg(MRF, mrf);
2595 reg.type = BRW_REGISTER_TYPE_F;
2596
2597 switch (varying) {
2598 case VARYING_SLOT_PSIZ:
2599 /* PSIZ is always in slot 0, and is coupled with other flags. */
2600 current_annotation = "indices, point width, clip flags";
2601 emit_psiz_and_flags(hw_reg);
2602 break;
2603 case BRW_VARYING_SLOT_NDC:
2604 current_annotation = "NDC";
2605 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2606 break;
2607 case VARYING_SLOT_POS:
2608 current_annotation = "gl_Position";
2609 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2610 break;
2611 case VARYING_SLOT_EDGE:
2612 /* This is present when doing unfilled polygons. We're supposed to copy
2613 * the edge flag from the user-provided vertex array
2614 * (glEdgeFlagPointer); otherwise we copy the current value of that
2615 * attribute (which starts as 1.0f). This is then used in clipping to
2616 * determine which edges should be drawn as wireframe.
2617 */
2618 current_annotation = "edge flag";
2619 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2620 glsl_type::float_type, WRITEMASK_XYZW))));
2621 break;
2622 case BRW_VARYING_SLOT_PAD:
2623 /* No need to write to this slot */
2624 break;
2625 default:
2626 emit_generic_urb_slot(reg, varying);
2627 break;
2628 }
2629 }
2630
2631 static int
2632 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2633 {
2634 if (brw->gen >= 6) {
2635 /* URB data written (does not include the message header reg) must
2636 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2637 * section 5.4.3.2.2: URB_INTERLEAVED.
2638 *
2639 * URB entries are allocated on a multiple of 1024 bits, so an
2640 * extra 128 bits written here to make the end align to 256 is
2641 * no problem.
2642 */
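/* mlen counts the message header register as well, so making mlen odd
 * keeps the data portion an even number of registers.
 */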
2643 if ((mlen % 2) != 1)
2644 mlen++;
2645 }
2646
2647 return mlen;
2648 }
2649
2650
2651 /**
2652 * Generates the VUE payload plus the necessary URB write instructions to
2653 * output it.
2654 *
2655 * The VUE layout is documented in Volume 2a.
2656 */
2657 void
2658 vec4_visitor::emit_vertex()
2659 {
2660 /* MRF 0 is reserved for the debugger, so start with message header
2661 * in MRF 1.
2662 */
2663 int base_mrf = 1;
2664 int mrf = base_mrf;
2665 /* In the process of generating our URB write message contents, we
2666 * may need to unspill a register or load from an array. Those
2667 * reads would use MRFs 14-15.
2668 */
2669 int max_usable_mrf = 13;
2670
2671 /* The following assertion verifies that max_usable_mrf yields an
2672 * even number of URB write data registers, which meets gen6's
2673 * requirement for length alignment.
2674 */
2675 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2676
2677 /* First mrf is the g0-based message header containing URB handles and
2678 * such.
2679 */
2680 emit_urb_write_header(mrf++);
2681
2682 if (brw->gen < 6) {
2683 emit_ndc_computation();
2684 }
2685
2686 /* Lower legacy ff and ClipVertex clipping to clip distances */
2687 if (key->userclip_active && !key->uses_clip_distance) {
2688 current_annotation = "user clip distances";
2689
2690 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2691 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2692
2693 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2694 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2695 }
2696
2697 /* We may need to split this up into several URB writes, so do them in a
2698 * loop.
2699 */
2700 int slot = 0;
2701 bool complete = false;
2702 do {
2703 /* URB offset is in URB row increments, and each of our MRFs is half of
2704 * one of those, since we're doing interleaved writes.
2705 */
2706 int offset = slot / 2;
2707
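/* Payload data for this write starts in the MRF just after the
 * message header.
 */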
2708 mrf = base_mrf + 1;
2709 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2710 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2711
2712 /* If this was max_usable_mrf, we can't fit anything more into this
2713 * URB WRITE.
2714 */
2715 if (mrf > max_usable_mrf) {
2716 slot++;
2717 break;
2718 }
2719 }
2720
2721 complete = slot >= prog_data->vue_map.num_slots;
2722 current_annotation = "URB write";
2723 vec4_instruction *inst = emit_urb_write_opcode(complete);
2724 inst->base_mrf = base_mrf;
2725 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2726 inst->offset += offset;
2727 } while(!complete);
2728 }
2729
2730
2731 src_reg
2732 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2733 src_reg *reladdr, int reg_offset)
2734 {
2735 /* Because we store the values to scratch interleaved like our
2736 * vertex data, we need to scale the vec4 index by 2.
2737 */
2738 int message_header_scale = 2;
2739
2740 /* Pre-gen6, the message header uses byte offsets instead of vec4
2741 * (16-byte) offset units.
2742 */
2743 if (brw->gen < 6)
2744 message_header_scale *= 16;
2745
2746 if (reladdr) {
2747 src_reg index = src_reg(this, glsl_type::int_type);
2748
2749 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2750 emit_before(inst, MUL(dst_reg(index),
2751 index, src_reg(message_header_scale)));
2752
2753 return index;
2754 } else {
2755 return src_reg(reg_offset * message_header_scale);
2756 }
2757 }
2758
2759 src_reg
2760 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2761 src_reg *reladdr, int reg_offset)
2762 {
2763 if (reladdr) {
2764 src_reg index = src_reg(this, glsl_type::int_type);
2765
2766 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2767
2768 /* Pre-gen6, the message header uses byte offsets instead of vec4
2769 * (16-byte) offset units.
2770 */
2771 if (brw->gen < 6) {
2772 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2773 }
2774
2775 return index;
2776 } else {
2777 int message_header_scale = brw->gen < 6 ? 16 : 1;
2778 return src_reg(reg_offset * message_header_scale);
2779 }
2780 }
2781
2782 /**
2783 * Emits an instruction before @inst to load the value named by @orig_src
2784 * from scratch space at @base_offset to @temp.
2785 *
2786 * @base_offset is measured in 32-byte units (the size of a register).
2787 */
2788 void
2789 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2790 dst_reg temp, src_reg orig_src,
2791 int base_offset)
2792 {
2793 int reg_offset = base_offset + orig_src.reg_offset;
2794 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2795
2796 emit_before(inst, SCRATCH_READ(temp, index));
2797 }
2798
2799 /**
2800 * Emits an instruction after @inst to store the value to be written
2801 * to @orig_dst to scratch space at @base_offset, from @temp.
2802 *
2803 * @base_offset is measured in 32-byte units (the size of a register).
2804 */
2805 void
2806 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2807 {
2808 int reg_offset = base_offset + inst->dst.reg_offset;
2809 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2810
2811 /* Create a temporary register to store *inst's result in.
2812 *
2813 * We have to be careful in MOVing from our temporary result register in
2814 * the scratch write. If we swizzle from channels of the temporary that
2815 * weren't initialized, it will confuse live interval analysis, which will
2816 * make spilling fail to make progress.
2817 */
2818 src_reg temp = src_reg(this, glsl_type::vec4_type);
2819 temp.type = inst->dst.type;
2820 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2821 int swizzles[4];
2822 for (int i = 0; i < 4; i++)
2823 if (inst->dst.writemask & (1 << i))
2824 swizzles[i] = i;
2825 else
2826 swizzles[i] = first_writemask_chan;
2827 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2828 swizzles[2], swizzles[3]);
2829
2830 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2831 inst->dst.writemask));
2832 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2833 write->predicate = inst->predicate;
2834 write->ir = inst->ir;
2835 write->annotation = inst->annotation;
2836 inst->insert_after(write);
2837
2838 inst->dst.file = temp.file;
2839 inst->dst.reg = temp.reg;
2840 inst->dst.reg_offset = temp.reg_offset;
2841 inst->dst.reladdr = NULL;
2842 }
2843
2844 /**
2845 * We can't generally support array access in GRF space, because a
2846 * single instruction's destination can only span 2 contiguous
2847 * registers. So, we send all GRF arrays that are accessed with a
2848 * variable index to scratch space.
2849 */
2850 void
2851 vec4_visitor::move_grf_array_access_to_scratch()
2852 {
2853 int scratch_loc[this->virtual_grf_count];
2854
2855 for (int i = 0; i < this->virtual_grf_count; i++) {
2856 scratch_loc[i] = -1;
2857 }
2858
2859 /* First, calculate the set of virtual GRFs that need to be punted
2860 * to scratch due to having any array access on them, and where in
2861 * scratch.
2862 */
2863 foreach_list(node, &this->instructions) {
2864 vec4_instruction *inst = (vec4_instruction *)node;
2865
2866 if (inst->dst.file == GRF && inst->dst.reladdr &&
2867 scratch_loc[inst->dst.reg] == -1) {
2868 scratch_loc[inst->dst.reg] = c->last_scratch;
2869 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2870 }
2871
2872 for (int i = 0 ; i < 3; i++) {
2873 src_reg *src = &inst->src[i];
2874
2875 if (src->file == GRF && src->reladdr &&
2876 scratch_loc[src->reg] == -1) {
2877 scratch_loc[src->reg] = c->last_scratch;
2878 c->last_scratch += this->virtual_grf_sizes[src->reg];
2879 }
2880 }
2881 }
2882
2883 /* Now, for anything that will be accessed through scratch, rewrite
2884 * it to load/store. Note that this is a _safe list walk, because
2885 * we may generate a new scratch_write instruction after the one
2886 * we're processing.
2887 */
2888 foreach_list_safe(node, &this->instructions) {
2889 vec4_instruction *inst = (vec4_instruction *)node;
2890
2891 /* Set up the annotation tracking for new generated instructions. */
2892 base_ir = inst->ir;
2893 current_annotation = inst->annotation;
2894
2895 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2896 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2897 }
2898
2899 for (int i = 0 ; i < 3; i++) {
2900 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2901 continue;
2902
2903 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2904
2905 emit_scratch_read(inst, temp, inst->src[i],
2906 scratch_loc[inst->src[i].reg]);
2907
2908 inst->src[i].file = temp.file;
2909 inst->src[i].reg = temp.reg;
2910 inst->src[i].reg_offset = temp.reg_offset;
2911 inst->src[i].reladdr = NULL;
2912 }
2913 }
2914 }
2915
2916 /**
2917 * Emits an instruction before @inst to load the value named by @orig_src
2918 * from the pull constant buffer (surface) at @base_offset to @temp.
2919 */
2920 void
2921 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2922 dst_reg temp, src_reg orig_src,
2923 int base_offset)
2924 {
2925 int reg_offset = base_offset + orig_src.reg_offset;
2926 src_reg index = src_reg((unsigned)SURF_INDEX_VEC4_CONST_BUFFER);
2927 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2928 vec4_instruction *load;
2929
2930 if (brw->gen >= 7) {
2931 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
2932 grf_offset.type = offset.type;
2933 emit_before(inst, MOV(grf_offset, offset));
2934
2935 load = new(mem_ctx) vec4_instruction(this,
2936 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
2937 temp, index, src_reg(grf_offset));
2938 } else {
2939 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2940 temp, index, offset);
2941 load->base_mrf = 14;
2942 load->mlen = 1;
2943 }
2944 emit_before(inst, load);
2945 }
2946
2947 /**
2948 * Implements array access of uniforms by inserting a
2949 * PULL_CONSTANT_LOAD instruction.
2950 *
2951 * Unlike temporary GRF array access, which we don't support because
2952 * relative addressing of instruction destinations is difficult, we
2953 * could potentially do array access of uniforms that were loaded in
2954 * GRF space as push constants. In the real-world usage we've seen,
2955 * though, the arrays involved are always larger than we could load as
2956 * push constants, so we just always move all uniform array access out
2957 * to a pull constant buffer.
2958 */
2959 void
2960 vec4_visitor::move_uniform_array_access_to_pull_constants()
2961 {
2962 int pull_constant_loc[this->uniforms];
2963
2964 for (int i = 0; i < this->uniforms; i++) {
2965 pull_constant_loc[i] = -1;
2966 }
2967
2968 /* Walk through and find array access of uniforms. Put a copy of that
2969 * uniform in the pull constant buffer.
2970 *
2971 * Note that we don't move constant-indexed accesses to arrays. No
2972 * testing has been done of the performance impact of this choice.
2973 */
2974 foreach_list_safe(node, &this->instructions) {
2975 vec4_instruction *inst = (vec4_instruction *)node;
2976
2977 for (int i = 0 ; i < 3; i++) {
2978 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2979 continue;
2980
2981 int uniform = inst->src[i].reg;
2982
2983 /* If this array isn't already present in the pull constant buffer,
2984 * add it.
2985 */
2986 if (pull_constant_loc[uniform] == -1) {
2987 const float **values = &prog_data->param[uniform * 4];
2988
2989 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2990
2991 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2992 prog_data->pull_param[prog_data->nr_pull_params++]
2993 = values[j];
2994 }
2995 }
2996
2997 /* Set up the annotation tracking for new generated instructions. */
2998 base_ir = inst->ir;
2999 current_annotation = inst->annotation;
3000
3001 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3002
3003 emit_pull_constant_load(inst, temp, inst->src[i],
3004 pull_constant_loc[uniform]);
3005
3006 inst->src[i].file = temp.file;
3007 inst->src[i].reg = temp.reg;
3008 inst->src[i].reg_offset = temp.reg_offset;
3009 inst->src[i].reladdr = NULL;
3010 }
3011 }
3012
3013 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3014 * no need to track them as larger-than-vec4 objects. This will be
3015 * relied on in cutting out unused uniform vectors from push
3016 * constants.
3017 */
3018 split_uniform_registers();
3019 }
3020
3021 void
3022 vec4_visitor::resolve_ud_negate(src_reg *reg)
3023 {
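/* Apply the negate through an explicit MOV into a temporary and point
 * the caller's source at that temporary instead.
 */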
3024 if (reg->type != BRW_REGISTER_TYPE_UD ||
3025 !reg->negate)
3026 return;
3027
3028 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3029 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3030 *reg = temp;
3031 }
3032
3033 vec4_visitor::vec4_visitor(struct brw_context *brw,
3034 struct brw_vec4_compile *c,
3035 struct gl_program *prog,
3036 const struct brw_vec4_prog_key *key,
3037 struct brw_vec4_prog_data *prog_data,
3038 struct gl_shader_program *shader_prog,
3039 struct brw_shader *shader,
3040 void *mem_ctx,
3041 bool debug_flag)
3042 : debug_flag(debug_flag)
3043 {
3044 this->brw = brw;
3045 this->ctx = &brw->ctx;
3046 this->shader_prog = shader_prog;
3047 this->shader = shader;
3048
3049 this->mem_ctx = mem_ctx;
3050 this->failed = false;
3051
3052 this->base_ir = NULL;
3053 this->current_annotation = NULL;
3054 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3055
3056 this->c = c;
3057 this->prog = prog;
3058 this->key = key;
3059 this->prog_data = prog_data;
3060
3061 this->variable_ht = hash_table_ctor(0,
3062 hash_table_pointer_hash,
3063 hash_table_pointer_compare);
3064
3065 this->virtual_grf_start = NULL;
3066 this->virtual_grf_end = NULL;
3067 this->virtual_grf_sizes = NULL;
3068 this->virtual_grf_count = 0;
3069 this->virtual_grf_reg_map = NULL;
3070 this->virtual_grf_reg_count = 0;
3071 this->virtual_grf_array_size = 0;
3072 this->live_intervals_valid = false;
3073
3074 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3075
3076 this->uniforms = 0;
3077 }
3078
3079 vec4_visitor::~vec4_visitor()
3080 {
3081 hash_table_dtor(this->variable_ht);
3082 }
3083
3084
3085 void
3086 vec4_visitor::fail(const char *format, ...)
3087 {
3088 va_list va;
3089 char *msg;
3090
3091 if (failed)
3092 return;
3093
3094 failed = true;
3095
3096 va_start(va, format);
3097 msg = ralloc_vasprintf(mem_ctx, format, va);
3098 va_end(va);
3099 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3100
3101 this->fail_msg = msg;
3102
3103 if (debug_flag) {
3104 fprintf(stderr, "%s", msg);
3105 }
3106 }
3107
3108 } /* namespace brw */