i965/vec4: Add the ability to suppress register spilling.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->sampler = 0;
47 this->texture_offset = 0;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = v->base_ir;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_present = false;
53 this->mlen = 0;
54 this->base_mrf = 0;
55 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(dst_reg dst, src_reg src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
123 { \
124 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
125 src0, src1); \
126 }
127
128 #define ALU3(op) \
129 vec4_instruction * \
130 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
131 { \
132 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
133 src0, src1, src2); \
134 }
135
136 ALU1(NOT)
137 ALU1(MOV)
138 ALU1(FRC)
139 ALU1(RNDD)
140 ALU1(RNDE)
141 ALU1(RNDZ)
142 ALU1(F32TO16)
143 ALU1(F16TO32)
144 ALU2(ADD)
145 ALU2(MUL)
146 ALU2(MACH)
147 ALU2(AND)
148 ALU2(OR)
149 ALU2(XOR)
150 ALU2(DP3)
151 ALU2(DP4)
152 ALU2(DPH)
153 ALU2(SHL)
154 ALU2(SHR)
155 ALU2(ASR)
156 ALU3(LRP)
157 ALU1(BFREV)
158 ALU3(BFE)
159 ALU2(BFI1)
160 ALU3(BFI2)
161 ALU1(FBH)
162 ALU1(FBL)
163 ALU1(CBIT)
164 ALU3(MAD)
165 ALU2(ADDC)
166 ALU2(SUBB)
167
168 /** Gen4 predicated IF. */
169 vec4_instruction *
170 vec4_visitor::IF(uint32_t predicate)
171 {
172 vec4_instruction *inst;
173
174 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
175 inst->predicate = predicate;
176
177 return inst;
178 }
179
180 /** Gen6+ IF with embedded comparison. */
181 vec4_instruction *
182 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
183 {
184 assert(brw->gen >= 6);
185
186 vec4_instruction *inst;
187
188 resolve_ud_negate(&src0);
189 resolve_ud_negate(&src1);
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
192 src0, src1);
193 inst->conditional_mod = condition;
194
195 return inst;
196 }
197
198 /**
199 * CMP: Sets the low bit of the destination channels with the result
200 * of the comparison, while the upper bits are undefined, and updates
201 * the flag register with the packed 16 bits of the result.
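 *
 * For example (a sketch of how callers later in this file use it, not new
 * behavior): a 0/1 boolean is usually produced by following the CMP with an
 * AND against 1, as in
 *
 *    emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
 *    emit(AND(result_dst, result_src, src_reg(0x1)));
 *
 * because only the low bit of each destination channel is defined.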
202 */
203 vec4_instruction *
204 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
205 {
206 vec4_instruction *inst;
207
208 /* original gen4 does type conversion to the destination type
 209 * before comparison, producing garbage results for floating
210 * point comparisons.
211 */
212 if (brw->gen == 4) {
213 dst.type = src0.type;
214 if (dst.file == HW_REG)
215 dst.fixed_hw_reg.type = dst.type;
216 }
217
218 resolve_ud_negate(&src0);
219 resolve_ud_negate(&src1);
220
221 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
222 inst->conditional_mod = condition;
223
224 return inst;
225 }
226
227 vec4_instruction *
228 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
229 {
230 vec4_instruction *inst;
231
232 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
233 dst, index);
234 inst->base_mrf = 14;
235 inst->mlen = 2;
236
237 return inst;
238 }
239
240 vec4_instruction *
241 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
242 {
243 vec4_instruction *inst;
244
245 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
246 dst, src, index);
247 inst->base_mrf = 13;
248 inst->mlen = 3;
249
250 return inst;
251 }
252
253 void
254 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
255 {
256 static enum opcode dot_opcodes[] = {
257 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
258 };
259
260 emit(dot_opcodes[elements - 2], dst, src0, src1);
261 }
262
263 src_reg
264 vec4_visitor::fix_3src_operand(src_reg src)
265 {
266 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
267 * able to use vertical stride of zero to replicate the vec4 uniform, like
268 *
269 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
270 *
271 * But you can't, since vertical stride is always four in three-source
272 * instructions. Instead, insert a MOV instruction to do the replication so
273 * that the three-source instruction can consume it.
274 */
275
276 /* The MOV is only needed if the source is a uniform or immediate. */
277 if (src.file != UNIFORM && src.file != IMM)
278 return src;
279
280 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
281 expanded.type = src.type;
282 emit(MOV(expanded, src));
283 return src_reg(expanded);
284 }
285
286 src_reg
287 vec4_visitor::fix_math_operand(src_reg src)
288 {
289 /* The gen6 math instruction ignores the source modifiers --
290 * swizzle, abs, negate, and at least some parts of the register
291 * region description.
292 *
293 * Rather than trying to enumerate all these cases, *always* expand the
294 * operand to a temp GRF for gen6.
295 *
296 * For gen7, keep the operand as-is, except if immediate, which gen7 still
297 * can't use.
298 */
299
300 if (brw->gen == 7 && src.file != IMM)
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(MOV(expanded, src));
306 return src_reg(expanded);
307 }
308
309 void
310 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
311 {
312 src = fix_math_operand(src);
313
314 if (dst.writemask != WRITEMASK_XYZW) {
315 /* The gen6 math instruction must be align1, so we can't do
316 * writemasks.
317 */
318 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
319
320 emit(opcode, temp_dst, src);
321
322 emit(MOV(dst, src_reg(temp_dst)));
323 } else {
324 emit(opcode, dst, src);
325 }
326 }
327
328 void
329 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
330 {
331 vec4_instruction *inst = emit(opcode, dst, src);
332 inst->base_mrf = 1;
333 inst->mlen = 1;
334 }
335
336 void
337 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
338 {
339 switch (opcode) {
340 case SHADER_OPCODE_RCP:
341 case SHADER_OPCODE_RSQ:
342 case SHADER_OPCODE_SQRT:
343 case SHADER_OPCODE_EXP2:
344 case SHADER_OPCODE_LOG2:
345 case SHADER_OPCODE_SIN:
346 case SHADER_OPCODE_COS:
347 break;
348 default:
349 assert(!"not reached: bad math opcode");
350 return;
351 }
352
353 if (brw->gen >= 6) {
354 return emit_math1_gen6(opcode, dst, src);
355 } else {
356 return emit_math1_gen4(opcode, dst, src);
357 }
358 }
359
360 void
361 vec4_visitor::emit_math2_gen6(enum opcode opcode,
362 dst_reg dst, src_reg src0, src_reg src1)
363 {
364 src0 = fix_math_operand(src0);
365 src1 = fix_math_operand(src1);
366
367 if (dst.writemask != WRITEMASK_XYZW) {
368 /* The gen6 math instruction must be align1, so we can't do
369 * writemasks.
370 */
371 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
372 temp_dst.type = dst.type;
373
374 emit(opcode, temp_dst, src0, src1);
375
376 emit(MOV(dst, src_reg(temp_dst)));
377 } else {
378 emit(opcode, dst, src0, src1);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen4(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 vec4_instruction *inst = emit(opcode, dst, src0, src1);
387 inst->base_mrf = 1;
388 inst->mlen = 2;
389 }
390
391 void
392 vec4_visitor::emit_math(enum opcode opcode,
393 dst_reg dst, src_reg src0, src_reg src1)
394 {
395 switch (opcode) {
396 case SHADER_OPCODE_POW:
397 case SHADER_OPCODE_INT_QUOTIENT:
398 case SHADER_OPCODE_INT_REMAINDER:
399 break;
400 default:
401 assert(!"not reached: unsupported binary math opcode");
402 return;
403 }
404
405 if (brw->gen >= 6) {
406 return emit_math2_gen6(opcode, dst, src0, src1);
407 } else {
408 return emit_math2_gen4(opcode, dst, src0, src1);
409 }
410 }
411
412 void
413 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
414 {
415 if (brw->gen < 7)
416 assert(!"ir_unop_pack_half_2x16 should be lowered");
417
418 assert(dst.type == BRW_REGISTER_TYPE_UD);
419 assert(src0.type == BRW_REGISTER_TYPE_F);
420
421 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
422 *
423 * Because this instruction does not have a 16-bit floating-point type,
424 * the destination data type must be Word (W).
425 *
426 * The destination must be DWord-aligned and specify a horizontal stride
427 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
428 * each destination channel and the upper word is not modified.
429 *
430 * The above restriction implies that the f32to16 instruction must use
431 * align1 mode, because only in align1 mode is it possible to specify
432 * horizontal stride. We choose here to defy the hardware docs and emit
433 * align16 instructions.
434 *
435 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
436 * instructions. I was partially successful in that the code passed all
437 * tests. However, the code was dubiously correct and fragile, and the
438 * tests were not harsh enough to probe that frailty. Not trusting the
439 * code, I chose instead to remain in align16 mode in defiance of the hw
440 * docs).
441 *
442 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
443 * simulator, emitting a f32to16 in align16 mode with UD as destination
444 * data type is safe. The behavior differs from that specified in the PRM
445 * in that the upper word of each destination channel is cleared to 0.
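 * A worked example of the packing (illustrative only; these are ordinary
 * IEEE half-float encodings, not values taken from the PRM):
 * packHalf2x16(vec2(1.0, -2.0)) stores half(1.0) = 0x3C00 in the low word
 * and half(-2.0) = 0xC000 in the high word of each channel, i.e.
 * 0xC0003C00. The F32TO16/SHL/OR sequence below assembles exactly that
 * layout.
 *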
446 */
447
448 dst_reg tmp_dst(this, glsl_type::uvec2_type);
449 src_reg tmp_src(tmp_dst);
450
451 #if 0
452 /* Verify the undocumented behavior on which the following instructions
453 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
454 * then the result of the bit-or instruction below will be incorrect.
455 *
456 * You should inspect the disasm output in order to verify that the MOV is
457 * not optimized away.
458 */
459 emit(MOV(tmp_dst, src_reg(0x12345678u)));
460 #endif
461
462 /* Give tmp the form below, where "." means untouched.
463 *
464 * w z y x w z y x
465 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
466 *
467 * That the upper word of each write-channel be 0 is required for the
468 * following bit-shift and bit-or instructions to work. Note that this
469 * relies on the undocumented hardware behavior mentioned above.
470 */
471 tmp_dst.writemask = WRITEMASK_XY;
472 emit(F32TO16(tmp_dst, src0));
473
474 /* Give the write-channels of dst the form:
475 * 0xhhhh0000
476 */
 477 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
478 emit(SHL(dst, tmp_src, src_reg(16u)));
479
480 /* Finally, give the write-channels of dst the form of packHalf2x16's
481 * output:
482 * 0xhhhhllll
483 */
 484 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
485 emit(OR(dst, src_reg(dst), tmp_src));
486 }
487
488 void
489 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
490 {
491 if (brw->gen < 7)
492 assert(!"ir_unop_unpack_half_2x16 should be lowered");
493
494 assert(dst.type == BRW_REGISTER_TYPE_F);
495 assert(src0.type == BRW_REGISTER_TYPE_UD);
496
497 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
498 *
499 * Because this instruction does not have a 16-bit floating-point type,
500 * the source data type must be Word (W). The destination type must be
501 * F (Float).
502 *
503 * To use W as the source data type, we must adjust horizontal strides,
504 * which is only possible in align1 mode. All my [chadv] attempts at
505 * emitting align1 instructions for unpackHalf2x16 failed to pass the
506 * Piglit tests, so I gave up.
507 *
508 * I've verified that, on gen7 hardware and the simulator, it is safe to
509 * emit f16to32 in align16 mode with UD as source data type.
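 *
 * Continuing the example from emit_pack_half_2x16 (illustrative only):
 * unpackHalf2x16(0xC0003C00) should give vec2(1.0, -2.0). The AND below
 * extracts the low word (0x3C00) into the X channel of tmp, the SHR puts
 * the high word (0xC000) into Y, and F16TO32 converts both to float.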
510 */
511
512 dst_reg tmp_dst(this, glsl_type::uvec2_type);
513 src_reg tmp_src(tmp_dst);
514
515 tmp_dst.writemask = WRITEMASK_X;
516 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
517
518 tmp_dst.writemask = WRITEMASK_Y;
519 emit(SHR(tmp_dst, src0, src_reg(16u)));
520
521 dst.writemask = WRITEMASK_XY;
522 emit(F16TO32(dst, tmp_src));
523 }
524
525 void
526 vec4_visitor::visit_instructions(const exec_list *list)
527 {
528 foreach_list(node, list) {
529 ir_instruction *ir = (ir_instruction *)node;
530
531 base_ir = ir;
532 ir->accept(this);
533 }
534 }
535
536
537 static int
538 type_size(const struct glsl_type *type)
539 {
540 unsigned int i;
541 int size;
542
543 switch (type->base_type) {
544 case GLSL_TYPE_UINT:
545 case GLSL_TYPE_INT:
546 case GLSL_TYPE_FLOAT:
547 case GLSL_TYPE_BOOL:
548 if (type->is_matrix()) {
549 return type->matrix_columns;
550 } else {
551 /* Regardless of size of vector, it gets a vec4. This is bad
552 * packing for things like floats, but otherwise arrays become a
553 * mess. Hopefully a later pass over the code can pack scalars
554 * down if appropriate.
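 *
 * Illustrative slot counts implied by this function as a whole (not an
 * exhaustive table): float and vec3 each take one slot, mat4 takes 4,
 * float[10] takes 10, and struct { vec3 a; float b; } takes 2.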
555 */
556 return 1;
557 }
558 case GLSL_TYPE_ARRAY:
559 assert(type->length > 0);
560 return type_size(type->fields.array) * type->length;
561 case GLSL_TYPE_STRUCT:
562 size = 0;
563 for (i = 0; i < type->length; i++) {
564 size += type_size(type->fields.structure[i].type);
565 }
566 return size;
567 case GLSL_TYPE_SAMPLER:
568 /* Samplers take up one slot in UNIFORMS[], but they're baked in
569 * at link time.
570 */
571 return 1;
572 case GLSL_TYPE_VOID:
573 case GLSL_TYPE_ERROR:
574 case GLSL_TYPE_INTERFACE:
575 assert(0);
576 break;
577 }
578
579 return 0;
580 }
581
582 int
583 vec4_visitor::virtual_grf_alloc(int size)
584 {
585 if (virtual_grf_array_size <= virtual_grf_count) {
586 if (virtual_grf_array_size == 0)
587 virtual_grf_array_size = 16;
588 else
589 virtual_grf_array_size *= 2;
590 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
591 virtual_grf_array_size);
592 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
593 virtual_grf_array_size);
594 }
595 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
596 virtual_grf_reg_count += size;
597 virtual_grf_sizes[virtual_grf_count] = size;
598 return virtual_grf_count++;
599 }
600
601 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
602 {
603 init();
604
605 this->file = GRF;
606 this->reg = v->virtual_grf_alloc(type_size(type));
607
608 if (type->is_array() || type->is_record()) {
609 this->swizzle = BRW_SWIZZLE_NOOP;
610 } else {
611 this->swizzle = swizzle_for_size(type->vector_elements);
612 }
613
614 this->type = brw_type_for_base_type(type);
615 }
616
617 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
618 {
619 init();
620
621 this->file = GRF;
622 this->reg = v->virtual_grf_alloc(type_size(type));
623
624 if (type->is_array() || type->is_record()) {
625 this->writemask = WRITEMASK_XYZW;
626 } else {
627 this->writemask = (1 << type->vector_elements) - 1;
628 }
629
630 this->type = brw_type_for_base_type(type);
631 }
632
633 /* Our support for uniforms is piggy-backed on the struct
634 * gl_fragment_program, because that's where the values actually
635 * get stored, rather than in some global gl_shader_program uniform
636 * store.
637 */
638 void
639 vec4_visitor::setup_uniform_values(ir_variable *ir)
640 {
641 int namelen = strlen(ir->name);
642
643 /* The data for our (non-builtin) uniforms is stored in a series of
644 * gl_uniform_driver_storage structs for each subcomponent that
645 * glGetUniformLocation() could name. We know it's been set up in the same
646 * order we'd walk the type, so walk the list of storage and find anything
647 * with our name, or the prefix of a component that starts with our name.
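 *
 * For example (hypothetical uniform names, purely to illustrate the prefix
 * test below): for ir->name "lights", storage entries named "lights",
 * "lights[2]" or "lights.position" all match, because the character right
 * after the prefix is '\0', '[' or '.'; an unrelated "lightscale" does not.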
648 */
649 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
650 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
651
652 if (strncmp(ir->name, storage->name, namelen) != 0 ||
653 (storage->name[namelen] != 0 &&
654 storage->name[namelen] != '.' &&
655 storage->name[namelen] != '[')) {
656 continue;
657 }
658
659 gl_constant_value *components = storage->storage;
660 unsigned vector_count = (MAX2(storage->array_elements, 1) *
661 storage->type->matrix_columns);
662
663 for (unsigned s = 0; s < vector_count; s++) {
664 uniform_vector_size[uniforms] = storage->type->vector_elements;
665
666 int i;
667 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
668 prog_data->param[uniforms * 4 + i] = &components->f;
669 components++;
670 }
671 for (; i < 4; i++) {
672 static float zero = 0;
673 prog_data->param[uniforms * 4 + i] = &zero;
674 }
675
676 uniforms++;
677 }
678 }
679 }
680
681 void
682 vec4_visitor::setup_uniform_clipplane_values()
683 {
684 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
685
686 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
687 this->uniform_vector_size[this->uniforms] = 4;
688 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
689 this->userplane[i].type = BRW_REGISTER_TYPE_F;
690 for (int j = 0; j < 4; ++j) {
691 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
692 }
693 ++this->uniforms;
694 }
695 }
696
697 /* Our support for builtin uniforms is even scarier than non-builtin.
698 * It sits on top of the PROG_STATE_VAR parameters that are
699 * automatically updated from GL context state.
700 */
701 void
702 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
703 {
704 const ir_state_slot *const slots = ir->state_slots;
705 assert(ir->state_slots != NULL);
706
707 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 708 /* This state reference has already been set up by ir_to_mesa,
709 * but we'll get the same index back here. We can reference
710 * ParameterValues directly, since unlike brw_fs.cpp, we never
711 * add new state references during compile.
712 */
713 int index = _mesa_add_state_reference(this->prog->Parameters,
714 (gl_state_index *)slots[i].tokens);
715 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
716
717 this->uniform_vector_size[this->uniforms] = 0;
718 /* Add each of the unique swizzled channels of the element.
719 * This will end up matching the size of the glsl_type of this field.
720 */
721 int last_swiz = -1;
722 for (unsigned int j = 0; j < 4; j++) {
723 int swiz = GET_SWZ(slots[i].swizzle, j);
 724 if (swiz == last_swiz)
 725 break;
 726 last_swiz = swiz;
 727 prog_data->param[this->uniforms * 4 + j] = &values[swiz];
 728 this->uniform_vector_size[this->uniforms]++;
729 }
730 this->uniforms++;
731 }
732 }
733
734 dst_reg *
735 vec4_visitor::variable_storage(ir_variable *var)
736 {
737 return (dst_reg *)hash_table_find(this->variable_ht, var);
738 }
739
740 void
741 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
742 {
743 ir_expression *expr = ir->as_expression();
744
745 *predicate = BRW_PREDICATE_NORMAL;
746
747 if (expr) {
748 src_reg op[2];
749 vec4_instruction *inst;
750
751 assert(expr->get_num_operands() <= 2);
752 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
753 expr->operands[i]->accept(this);
754 op[i] = this->result;
755
756 resolve_ud_negate(&op[i]);
757 }
758
759 switch (expr->operation) {
760 case ir_unop_logic_not:
761 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
762 inst->conditional_mod = BRW_CONDITIONAL_Z;
763 break;
764
765 case ir_binop_logic_xor:
766 inst = emit(XOR(dst_null_d(), op[0], op[1]));
767 inst->conditional_mod = BRW_CONDITIONAL_NZ;
768 break;
769
770 case ir_binop_logic_or:
771 inst = emit(OR(dst_null_d(), op[0], op[1]));
772 inst->conditional_mod = BRW_CONDITIONAL_NZ;
773 break;
774
775 case ir_binop_logic_and:
776 inst = emit(AND(dst_null_d(), op[0], op[1]));
777 inst->conditional_mod = BRW_CONDITIONAL_NZ;
778 break;
779
780 case ir_unop_f2b:
781 if (brw->gen >= 6) {
782 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
783 } else {
784 inst = emit(MOV(dst_null_f(), op[0]));
785 inst->conditional_mod = BRW_CONDITIONAL_NZ;
786 }
787 break;
788
789 case ir_unop_i2b:
790 if (brw->gen >= 6) {
791 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
792 } else {
793 inst = emit(MOV(dst_null_d(), op[0]));
794 inst->conditional_mod = BRW_CONDITIONAL_NZ;
795 }
796 break;
797
798 case ir_binop_all_equal:
799 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
800 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
801 break;
802
803 case ir_binop_any_nequal:
804 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
805 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
806 break;
807
808 case ir_unop_any:
809 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
810 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
811 break;
812
813 case ir_binop_greater:
814 case ir_binop_gequal:
815 case ir_binop_less:
816 case ir_binop_lequal:
817 case ir_binop_equal:
818 case ir_binop_nequal:
819 emit(CMP(dst_null_d(), op[0], op[1],
820 brw_conditional_for_comparison(expr->operation)));
821 break;
822
823 default:
824 assert(!"not reached");
825 break;
826 }
827 return;
828 }
829
830 ir->accept(this);
831
832 resolve_ud_negate(&this->result);
833
834 if (brw->gen >= 6) {
835 vec4_instruction *inst = emit(AND(dst_null_d(),
836 this->result, src_reg(1)));
837 inst->conditional_mod = BRW_CONDITIONAL_NZ;
838 } else {
839 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
840 inst->conditional_mod = BRW_CONDITIONAL_NZ;
841 }
842 }
843
844 /**
845 * Emit a gen6 IF statement with the comparison folded into the IF
846 * instruction.
847 */
848 void
849 vec4_visitor::emit_if_gen6(ir_if *ir)
850 {
851 ir_expression *expr = ir->condition->as_expression();
852
853 if (expr) {
854 src_reg op[2];
855 dst_reg temp;
856
857 assert(expr->get_num_operands() <= 2);
858 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
859 expr->operands[i]->accept(this);
860 op[i] = this->result;
861 }
862
863 switch (expr->operation) {
864 case ir_unop_logic_not:
865 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
866 return;
867
868 case ir_binop_logic_xor:
869 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
870 return;
871
872 case ir_binop_logic_or:
873 temp = dst_reg(this, glsl_type::bool_type);
874 emit(OR(temp, op[0], op[1]));
875 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
876 return;
877
878 case ir_binop_logic_and:
879 temp = dst_reg(this, glsl_type::bool_type);
880 emit(AND(temp, op[0], op[1]));
881 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
882 return;
883
884 case ir_unop_f2b:
885 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
886 return;
887
888 case ir_unop_i2b:
889 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
890 return;
891
892 case ir_binop_greater:
893 case ir_binop_gequal:
894 case ir_binop_less:
895 case ir_binop_lequal:
896 case ir_binop_equal:
897 case ir_binop_nequal:
898 emit(IF(op[0], op[1],
899 brw_conditional_for_comparison(expr->operation)));
900 return;
901
902 case ir_binop_all_equal:
903 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
904 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
905 return;
906
907 case ir_binop_any_nequal:
908 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
909 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
910 return;
911
912 case ir_unop_any:
913 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
914 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
915 return;
916
917 default:
918 assert(!"not reached");
919 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
920 return;
921 }
922 return;
923 }
924
925 ir->condition->accept(this);
926
927 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
928 }
929
930 dst_reg
931 with_writemask(dst_reg const & r, int mask)
932 {
933 dst_reg result = r;
934 result.writemask = mask;
935 return result;
936 }
937
938
939 void
940 vec4_visitor::visit(ir_variable *ir)
941 {
942 dst_reg *reg = NULL;
943
944 if (variable_storage(ir))
945 return;
946
947 switch (ir->mode) {
948 case ir_var_shader_in:
949 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
950 break;
951
952 case ir_var_shader_out:
953 reg = new(mem_ctx) dst_reg(this, ir->type);
954
955 for (int i = 0; i < type_size(ir->type); i++) {
956 output_reg[ir->location + i] = *reg;
957 output_reg[ir->location + i].reg_offset = i;
958 output_reg[ir->location + i].type =
959 brw_type_for_base_type(ir->type->get_scalar_type());
960 output_reg_annotation[ir->location + i] = ir->name;
961 }
962 break;
963
964 case ir_var_auto:
965 case ir_var_temporary:
966 reg = new(mem_ctx) dst_reg(this, ir->type);
967 break;
968
969 case ir_var_uniform:
970 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
971
972 /* Thanks to the lower_ubo_reference pass, we will see only
973 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
974 * variables, so no need for them to be in variable_ht.
975 */
976 if (ir->is_in_uniform_block())
977 return;
978
979 /* Track how big the whole uniform variable is, in case we need to put a
980 * copy of its data into pull constants for array access.
981 */
982 this->uniform_size[this->uniforms] = type_size(ir->type);
983
984 if (!strncmp(ir->name, "gl_", 3)) {
985 setup_builtin_uniform_values(ir);
986 } else {
987 setup_uniform_values(ir);
988 }
989 break;
990
991 case ir_var_system_value:
992 reg = make_reg_for_system_value(ir);
993 break;
994
995 default:
996 assert(!"not reached");
997 }
998
999 reg->type = brw_type_for_base_type(ir->type);
1000 hash_table_insert(this->variable_ht, reg, ir);
1001 }
1002
1003 void
1004 vec4_visitor::visit(ir_loop *ir)
1005 {
1006 dst_reg counter;
1007
1008 /* We don't want debugging output to print the whole body of the
1009 * loop as the annotation.
1010 */
1011 this->base_ir = NULL;
1012
1013 if (ir->counter != NULL) {
1014 this->base_ir = ir->counter;
1015 ir->counter->accept(this);
1016 counter = *(variable_storage(ir->counter));
1017
1018 if (ir->from != NULL) {
1019 this->base_ir = ir->from;
1020 ir->from->accept(this);
1021
1022 emit(MOV(counter, this->result));
1023 }
1024 }
1025
1026 emit(BRW_OPCODE_DO);
1027
1028 if (ir->to) {
1029 this->base_ir = ir->to;
1030 ir->to->accept(this);
1031
1032 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1033 brw_conditional_for_comparison(ir->cmp)));
1034
1035 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1036 inst->predicate = BRW_PREDICATE_NORMAL;
1037 }
1038
1039 visit_instructions(&ir->body_instructions);
1040
1041
1042 if (ir->increment) {
1043 this->base_ir = ir->increment;
1044 ir->increment->accept(this);
1045 emit(ADD(counter, src_reg(counter), this->result));
1046 }
1047
1048 emit(BRW_OPCODE_WHILE);
1049 }
1050
1051 void
1052 vec4_visitor::visit(ir_loop_jump *ir)
1053 {
1054 switch (ir->mode) {
1055 case ir_loop_jump::jump_break:
1056 emit(BRW_OPCODE_BREAK);
1057 break;
1058 case ir_loop_jump::jump_continue:
1059 emit(BRW_OPCODE_CONTINUE);
1060 break;
1061 }
1062 }
1063
1064
1065 void
1066 vec4_visitor::visit(ir_function_signature *ir)
1067 {
1068 assert(0);
1069 (void)ir;
1070 }
1071
1072 void
1073 vec4_visitor::visit(ir_function *ir)
1074 {
1075 /* Ignore function bodies other than main() -- we shouldn't see calls to
1076 * them since they should all be inlined.
1077 */
1078 if (strcmp(ir->name, "main") == 0) {
1079 const ir_function_signature *sig;
1080 exec_list empty;
1081
1082 sig = ir->matching_signature(NULL, &empty);
1083
1084 assert(sig);
1085
1086 visit_instructions(&sig->body);
1087 }
1088 }
1089
1090 bool
1091 vec4_visitor::try_emit_sat(ir_expression *ir)
1092 {
1093 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1094 if (!sat_src)
1095 return false;
1096
1097 sat_src->accept(this);
1098 src_reg src = this->result;
1099
1100 this->result = src_reg(this, ir->type);
1101 vec4_instruction *inst;
1102 inst = emit(MOV(dst_reg(this->result), src));
1103 inst->saturate = true;
1104
1105 return true;
1106 }
1107
1108 bool
1109 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1110 {
1111 /* 3-src instructions were introduced in gen6. */
1112 if (brw->gen < 6)
1113 return false;
1114
1115 /* MAD can only handle floating-point data. */
1116 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1117 return false;
1118
1119 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1120 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1121
1122 if (!mul || mul->operation != ir_binop_mul)
1123 return false;
1124
1125 nonmul->accept(this);
1126 src_reg src0 = fix_3src_operand(this->result);
1127
1128 mul->operands[0]->accept(this);
1129 src_reg src1 = fix_3src_operand(this->result);
1130
1131 mul->operands[1]->accept(this);
1132 src_reg src2 = fix_3src_operand(this->result);
1133
1134 this->result = src_reg(this, ir->type);
1135 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1136
1137 return true;
1138 }
1139
1140 void
1141 vec4_visitor::emit_bool_comparison(unsigned int op,
1142 dst_reg dst, src_reg src0, src_reg src1)
1143 {
1144 /* original gen4 does destination conversion before comparison. */
1145 if (brw->gen < 5)
1146 dst.type = src0.type;
1147
1148 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1149
1150 dst.type = BRW_REGISTER_TYPE_D;
1151 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1152 }
1153
1154 void
1155 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1156 src_reg src0, src_reg src1)
1157 {
1158 vec4_instruction *inst;
1159
1160 if (brw->gen >= 6) {
1161 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1162 inst->conditional_mod = conditionalmod;
1163 } else {
1164 emit(CMP(dst, src0, src1, conditionalmod));
1165
1166 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1167 inst->predicate = BRW_PREDICATE_NORMAL;
1168 }
1169 }
1170
1171 static bool
1172 is_16bit_constant(ir_rvalue *rvalue)
1173 {
1174 ir_constant *constant = rvalue->as_constant();
1175 if (!constant)
1176 return false;
1177
1178 if (constant->type != glsl_type::int_type &&
1179 constant->type != glsl_type::uint_type)
1180 return false;
1181
1182 return constant->value.u[0] < (1 << 16);
1183 }
1184
1185 void
1186 vec4_visitor::visit(ir_expression *ir)
1187 {
1188 unsigned int operand;
1189 src_reg op[Elements(ir->operands)];
1190 src_reg result_src;
1191 dst_reg result_dst;
1192 vec4_instruction *inst;
1193
1194 if (try_emit_sat(ir))
1195 return;
1196
1197 if (ir->operation == ir_binop_add) {
1198 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1199 return;
1200 }
1201
1202 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1203 this->result.file = BAD_FILE;
1204 ir->operands[operand]->accept(this);
1205 if (this->result.file == BAD_FILE) {
1206 printf("Failed to get tree for expression operand:\n");
1207 ir->operands[operand]->print();
1208 exit(1);
1209 }
1210 op[operand] = this->result;
1211
1212 /* Matrix expression operands should have been broken down to vector
1213 * operations already.
1214 */
1215 assert(!ir->operands[operand]->type->is_matrix());
1216 }
1217
1218 int vector_elements = ir->operands[0]->type->vector_elements;
1219 if (ir->operands[1]) {
1220 vector_elements = MAX2(vector_elements,
1221 ir->operands[1]->type->vector_elements);
1222 }
1223
1224 this->result.file = BAD_FILE;
1225
1226 /* Storage for our result. Ideally for an assignment we'd be using
1227 * the actual storage for the result here, instead.
1228 */
1229 result_src = src_reg(this, ir->type);
1230 /* convenience for the emit functions below. */
1231 result_dst = dst_reg(result_src);
1232 /* If nothing special happens, this is the result. */
1233 this->result = result_src;
1234 /* Limit writes to the channels that will be used by result_src later.
1235 * This does limit this temp's use as a temporary for multi-instruction
1236 * sequences.
1237 */
1238 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1239
1240 switch (ir->operation) {
1241 case ir_unop_logic_not:
1242 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1243 * ones complement of the whole register, not just bit 0.
1244 */
1245 emit(XOR(result_dst, op[0], src_reg(1)));
1246 break;
1247 case ir_unop_neg:
1248 op[0].negate = !op[0].negate;
1249 emit(MOV(result_dst, op[0]));
1250 break;
1251 case ir_unop_abs:
1252 op[0].abs = true;
1253 op[0].negate = false;
1254 emit(MOV(result_dst, op[0]));
1255 break;
1256
1257 case ir_unop_sign:
1258 emit(MOV(result_dst, src_reg(0.0f)));
1259
1260 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1261 inst = emit(MOV(result_dst, src_reg(1.0f)));
1262 inst->predicate = BRW_PREDICATE_NORMAL;
1263
1264 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1265 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1266 inst->predicate = BRW_PREDICATE_NORMAL;
1267
1268 break;
1269
1270 case ir_unop_rcp:
1271 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1272 break;
1273
1274 case ir_unop_exp2:
1275 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1276 break;
1277 case ir_unop_log2:
1278 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1279 break;
1280 case ir_unop_exp:
1281 case ir_unop_log:
1282 assert(!"not reached: should be handled by ir_explog_to_explog2");
1283 break;
1284 case ir_unop_sin:
1285 case ir_unop_sin_reduced:
1286 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1287 break;
1288 case ir_unop_cos:
1289 case ir_unop_cos_reduced:
1290 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1291 break;
1292
1293 case ir_unop_dFdx:
1294 case ir_unop_dFdy:
1295 assert(!"derivatives not valid in vertex shader");
1296 break;
1297
1298 case ir_unop_bitfield_reverse:
1299 emit(BFREV(result_dst, op[0]));
1300 break;
1301 case ir_unop_bit_count:
1302 emit(CBIT(result_dst, op[0]));
1303 break;
1304 case ir_unop_find_msb: {
1305 src_reg temp = src_reg(this, glsl_type::uint_type);
1306
1307 inst = emit(FBH(dst_reg(temp), op[0]));
1308 inst->dst.writemask = WRITEMASK_XYZW;
1309
1310 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1311 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1312 * subtract the result from 31 to convert the MSB count into an LSB count.
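 *
 * Worked example (illustrative): for an input of 1, FBH returns 31 and the
 * predicated ADD below computes 31 - 31 = 0, which is findMSB(1). For an
 * input of 0, FBH returns 0xFFFFFFFF, the CMP fails, the ADD is skipped,
 * and the result stays -1, as findMSB() requires.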
1313 */
1314
1315 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1316 temp.swizzle = BRW_SWIZZLE_NOOP;
1317 emit(MOV(result_dst, temp));
1318
1319 src_reg src_tmp = src_reg(result_dst);
1320 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1321
1322 src_tmp.negate = true;
1323 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1324 inst->predicate = BRW_PREDICATE_NORMAL;
1325 break;
1326 }
1327 case ir_unop_find_lsb:
1328 emit(FBL(result_dst, op[0]));
1329 break;
1330
1331 case ir_unop_noise:
1332 assert(!"not reached: should be handled by lower_noise");
1333 break;
1334
1335 case ir_binop_add:
1336 emit(ADD(result_dst, op[0], op[1]));
1337 break;
1338 case ir_binop_sub:
1339 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1340 break;
1341
1342 case ir_binop_mul:
1343 if (ir->type->is_integer()) {
1344 /* For integer multiplication, the MUL uses the low 16 bits of one of
1345 * the operands (src0 through SNB, src1 on IVB and later). The MACH
 1346 * accumulates the contribution of the upper 16 bits of that
1347 * operand. If we can determine that one of the args is in the low
1348 * 16 bits, though, we can just emit a single MUL.
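 *
 * For instance (hypothetical shader expressions, purely illustrative):
 * "x * 7" takes the single-MUL path because the constant fits in 16 bits,
 * while "x * y" for two arbitrary ints falls through to the
 * MUL + MACH + MOV-from-accumulator sequence to produce the low 32 bits of
 * the full product.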
1349 */
1350 if (is_16bit_constant(ir->operands[0])) {
1351 if (brw->gen < 7)
1352 emit(MUL(result_dst, op[0], op[1]));
1353 else
1354 emit(MUL(result_dst, op[1], op[0]));
1355 } else if (is_16bit_constant(ir->operands[1])) {
1356 if (brw->gen < 7)
1357 emit(MUL(result_dst, op[1], op[0]));
1358 else
1359 emit(MUL(result_dst, op[0], op[1]));
1360 } else {
1361 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1362
1363 emit(MUL(acc, op[0], op[1]));
1364 emit(MACH(dst_null_d(), op[0], op[1]));
1365 emit(MOV(result_dst, src_reg(acc)));
1366 }
1367 } else {
1368 emit(MUL(result_dst, op[0], op[1]));
1369 }
1370 break;
1371 case ir_binop_imul_high: {
1372 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1373
1374 emit(MUL(acc, op[0], op[1]));
1375 emit(MACH(result_dst, op[0], op[1]));
1376 break;
1377 }
1378 case ir_binop_div:
1379 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1380 assert(ir->type->is_integer());
1381 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1382 break;
1383 case ir_binop_carry: {
1384 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1385
1386 emit(ADDC(dst_null_ud(), op[0], op[1]));
1387 emit(MOV(result_dst, src_reg(acc)));
1388 break;
1389 }
1390 case ir_binop_borrow: {
1391 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1392
1393 emit(SUBB(dst_null_ud(), op[0], op[1]));
1394 emit(MOV(result_dst, src_reg(acc)));
1395 break;
1396 }
1397 case ir_binop_mod:
1398 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1399 assert(ir->type->is_integer());
1400 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1401 break;
1402
1403 case ir_binop_less:
1404 case ir_binop_greater:
1405 case ir_binop_lequal:
1406 case ir_binop_gequal:
1407 case ir_binop_equal:
1408 case ir_binop_nequal: {
1409 emit(CMP(result_dst, op[0], op[1],
1410 brw_conditional_for_comparison(ir->operation)));
1411 emit(AND(result_dst, result_src, src_reg(0x1)));
1412 break;
1413 }
1414
1415 case ir_binop_all_equal:
1416 /* "==" operator producing a scalar boolean. */
1417 if (ir->operands[0]->type->is_vector() ||
1418 ir->operands[1]->type->is_vector()) {
1419 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1420 emit(MOV(result_dst, src_reg(0)));
1421 inst = emit(MOV(result_dst, src_reg(1)));
1422 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1423 } else {
1424 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1425 emit(AND(result_dst, result_src, src_reg(0x1)));
1426 }
1427 break;
1428 case ir_binop_any_nequal:
1429 /* "!=" operator producing a scalar boolean. */
1430 if (ir->operands[0]->type->is_vector() ||
1431 ir->operands[1]->type->is_vector()) {
1432 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1433
1434 emit(MOV(result_dst, src_reg(0)));
1435 inst = emit(MOV(result_dst, src_reg(1)));
1436 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1437 } else {
1438 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1439 emit(AND(result_dst, result_src, src_reg(0x1)));
1440 }
1441 break;
1442
1443 case ir_unop_any:
1444 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1445 emit(MOV(result_dst, src_reg(0)));
1446
1447 inst = emit(MOV(result_dst, src_reg(1)));
1448 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1449 break;
1450
1451 case ir_binop_logic_xor:
1452 emit(XOR(result_dst, op[0], op[1]));
1453 break;
1454
1455 case ir_binop_logic_or:
1456 emit(OR(result_dst, op[0], op[1]));
1457 break;
1458
1459 case ir_binop_logic_and:
1460 emit(AND(result_dst, op[0], op[1]));
1461 break;
1462
1463 case ir_binop_dot:
1464 assert(ir->operands[0]->type->is_vector());
1465 assert(ir->operands[0]->type == ir->operands[1]->type);
1466 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1467 break;
1468
1469 case ir_unop_sqrt:
1470 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1471 break;
1472 case ir_unop_rsq:
1473 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1474 break;
1475
1476 case ir_unop_bitcast_i2f:
1477 case ir_unop_bitcast_u2f:
1478 this->result = op[0];
1479 this->result.type = BRW_REGISTER_TYPE_F;
1480 break;
1481
1482 case ir_unop_bitcast_f2i:
1483 this->result = op[0];
1484 this->result.type = BRW_REGISTER_TYPE_D;
1485 break;
1486
1487 case ir_unop_bitcast_f2u:
1488 this->result = op[0];
1489 this->result.type = BRW_REGISTER_TYPE_UD;
1490 break;
1491
1492 case ir_unop_i2f:
1493 case ir_unop_i2u:
1494 case ir_unop_u2i:
1495 case ir_unop_u2f:
1496 case ir_unop_b2f:
1497 case ir_unop_b2i:
1498 case ir_unop_f2i:
1499 case ir_unop_f2u:
1500 emit(MOV(result_dst, op[0]));
1501 break;
1502 case ir_unop_f2b:
1503 case ir_unop_i2b: {
1504 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1505 emit(AND(result_dst, result_src, src_reg(1)));
1506 break;
1507 }
1508
1509 case ir_unop_trunc:
1510 emit(RNDZ(result_dst, op[0]));
1511 break;
1512 case ir_unop_ceil:
1513 op[0].negate = !op[0].negate;
1514 inst = emit(RNDD(result_dst, op[0]));
1515 this->result.negate = true;
1516 break;
1517 case ir_unop_floor:
1518 inst = emit(RNDD(result_dst, op[0]));
1519 break;
1520 case ir_unop_fract:
1521 inst = emit(FRC(result_dst, op[0]));
1522 break;
1523 case ir_unop_round_even:
1524 emit(RNDE(result_dst, op[0]));
1525 break;
1526
1527 case ir_binop_min:
1528 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1529 break;
1530 case ir_binop_max:
1531 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1532 break;
1533
1534 case ir_binop_pow:
1535 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1536 break;
1537
1538 case ir_unop_bit_not:
1539 inst = emit(NOT(result_dst, op[0]));
1540 break;
1541 case ir_binop_bit_and:
1542 inst = emit(AND(result_dst, op[0], op[1]));
1543 break;
1544 case ir_binop_bit_xor:
1545 inst = emit(XOR(result_dst, op[0], op[1]));
1546 break;
1547 case ir_binop_bit_or:
1548 inst = emit(OR(result_dst, op[0], op[1]));
1549 break;
1550
1551 case ir_binop_lshift:
1552 inst = emit(SHL(result_dst, op[0], op[1]));
1553 break;
1554
1555 case ir_binop_rshift:
1556 if (ir->type->base_type == GLSL_TYPE_INT)
1557 inst = emit(ASR(result_dst, op[0], op[1]));
1558 else
1559 inst = emit(SHR(result_dst, op[0], op[1]));
1560 break;
1561
1562 case ir_binop_bfm:
1563 emit(BFI1(result_dst, op[0], op[1]));
1564 break;
1565
1566 case ir_binop_ubo_load: {
1567 ir_constant *uniform_block = ir->operands[0]->as_constant();
1568 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1569 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1570 src_reg offset = op[1];
1571
1572 /* Now, load the vector from that offset. */
1573 assert(ir->type->is_vector() || ir->type->is_scalar());
1574
1575 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1576 packed_consts.type = result.type;
1577 src_reg surf_index =
1578 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1579 if (const_offset_ir) {
1580 offset = src_reg(const_offset / 16);
1581 } else {
1582 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1583 }
1584
1585 vec4_instruction *pull =
1586 emit(new(mem_ctx) vec4_instruction(this,
1587 VS_OPCODE_PULL_CONSTANT_LOAD,
1588 dst_reg(packed_consts),
1589 surf_index,
1590 offset));
1591 pull->base_mrf = 14;
1592 pull->mlen = 1;
1593
1594 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1595 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1596 const_offset % 16 / 4,
1597 const_offset % 16 / 4,
1598 const_offset % 16 / 4);
1599
1600 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1601 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1602 emit(CMP(result_dst, packed_consts, src_reg(0u),
1603 BRW_CONDITIONAL_NZ));
1604 emit(AND(result_dst, result, src_reg(0x1)));
1605 } else {
1606 emit(MOV(result_dst, packed_consts));
1607 }
1608 break;
1609 }
1610
1611 case ir_binop_vector_extract:
1612 assert(!"should have been lowered by vec_index_to_cond_assign");
1613 break;
1614
1615 case ir_triop_fma:
1616 op[0] = fix_3src_operand(op[0]);
1617 op[1] = fix_3src_operand(op[1]);
1618 op[2] = fix_3src_operand(op[2]);
1619 /* Note that the instruction's argument order is reversed from GLSL
1620 * and the IR.
1621 */
1622 emit(MAD(result_dst, op[2], op[1], op[0]));
1623 break;
1624
1625 case ir_triop_lrp:
1626 op[0] = fix_3src_operand(op[0]);
1627 op[1] = fix_3src_operand(op[1]);
1628 op[2] = fix_3src_operand(op[2]);
1629 /* Note that the instruction's argument order is reversed from GLSL
1630 * and the IR.
1631 */
1632 emit(LRP(result_dst, op[2], op[1], op[0]));
1633 break;
1634
1635 case ir_triop_csel:
1636 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1637 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1638 inst->predicate = BRW_PREDICATE_NORMAL;
1639 break;
1640
1641 case ir_triop_bfi:
1642 op[0] = fix_3src_operand(op[0]);
1643 op[1] = fix_3src_operand(op[1]);
1644 op[2] = fix_3src_operand(op[2]);
1645 emit(BFI2(result_dst, op[0], op[1], op[2]));
1646 break;
1647
1648 case ir_triop_bitfield_extract:
1649 op[0] = fix_3src_operand(op[0]);
1650 op[1] = fix_3src_operand(op[1]);
1651 op[2] = fix_3src_operand(op[2]);
1652 /* Note that the instruction's argument order is reversed from GLSL
1653 * and the IR.
1654 */
1655 emit(BFE(result_dst, op[2], op[1], op[0]));
1656 break;
1657
1658 case ir_triop_vector_insert:
1659 assert(!"should have been lowered by lower_vector_insert");
1660 break;
1661
1662 case ir_quadop_bitfield_insert:
1663 assert(!"not reached: should be handled by "
1664 "bitfield_insert_to_bfm_bfi\n");
1665 break;
1666
1667 case ir_quadop_vector:
1668 assert(!"not reached: should be handled by lower_quadop_vector");
1669 break;
1670
1671 case ir_unop_pack_half_2x16:
1672 emit_pack_half_2x16(result_dst, op[0]);
1673 break;
1674 case ir_unop_unpack_half_2x16:
1675 emit_unpack_half_2x16(result_dst, op[0]);
1676 break;
1677 case ir_unop_pack_snorm_2x16:
1678 case ir_unop_pack_snorm_4x8:
1679 case ir_unop_pack_unorm_2x16:
1680 case ir_unop_pack_unorm_4x8:
1681 case ir_unop_unpack_snorm_2x16:
1682 case ir_unop_unpack_snorm_4x8:
1683 case ir_unop_unpack_unorm_2x16:
1684 case ir_unop_unpack_unorm_4x8:
1685 assert(!"not reached: should be handled by lower_packing_builtins");
1686 break;
1687 case ir_unop_unpack_half_2x16_split_x:
1688 case ir_unop_unpack_half_2x16_split_y:
1689 case ir_binop_pack_half_2x16_split:
1690 assert(!"not reached: should not occur in vertex shader");
1691 break;
1692 case ir_binop_ldexp:
1693 assert(!"not reached: should be handled by ldexp_to_arith()");
1694 break;
1695 }
1696 }
1697
1698
1699 void
1700 vec4_visitor::visit(ir_swizzle *ir)
1701 {
1702 src_reg src;
1703 int i = 0;
1704 int swizzle[4];
1705
1706 /* Note that this is only swizzles in expressions, not those on the left
1707 * hand side of an assignment, which do write masking. See ir_assignment
1708 * for that.
1709 */
1710
1711 ir->val->accept(this);
1712 src = this->result;
1713 assert(src.file != BAD_FILE);
1714
1715 for (i = 0; i < ir->type->vector_elements; i++) {
1716 switch (i) {
1717 case 0:
1718 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1719 break;
1720 case 1:
1721 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1722 break;
1723 case 2:
1724 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1725 break;
1726 case 3:
1727 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1728 break;
1729 }
1730 }
1731 for (; i < 4; i++) {
1732 /* Replicate the last channel out. */
1733 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1734 }
1735
1736 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1737
1738 this->result = src;
1739 }
1740
1741 void
1742 vec4_visitor::visit(ir_dereference_variable *ir)
1743 {
1744 const struct glsl_type *type = ir->type;
1745 dst_reg *reg = variable_storage(ir->var);
1746
1747 if (!reg) {
1748 fail("Failed to find variable storage for %s\n", ir->var->name);
1749 this->result = src_reg(brw_null_reg());
1750 return;
1751 }
1752
1753 this->result = src_reg(*reg);
1754
1755 /* System values get their swizzle from the dst_reg writemask */
1756 if (ir->var->mode == ir_var_system_value)
1757 return;
1758
1759 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1760 this->result.swizzle = swizzle_for_size(type->vector_elements);
1761 }
1762
1763
1764 int
1765 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1766 {
1767 /* Under normal circumstances array elements are stored consecutively, so
1768 * the stride is equal to the size of the array element.
1769 */
1770 return type_size(ir->type);
1771 }
1772
1773
1774 void
1775 vec4_visitor::visit(ir_dereference_array *ir)
1776 {
1777 ir_constant *constant_index;
1778 src_reg src;
1779 int array_stride = compute_array_stride(ir);
1780
1781 constant_index = ir->array_index->constant_expression_value();
1782
1783 ir->array->accept(this);
1784 src = this->result;
1785
1786 if (constant_index) {
1787 src.reg_offset += constant_index->value.i[0] * array_stride;
1788 } else {
1789 /* Variable index array dereference. It eats the "vec4" of the
1790 * base of the array and an index that offsets the Mesa register
1791 * index.
1792 */
1793 ir->array_index->accept(this);
1794
1795 src_reg index_reg;
1796
1797 if (array_stride == 1) {
1798 index_reg = this->result;
1799 } else {
1800 index_reg = src_reg(this, glsl_type::int_type);
1801
1802 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1803 }
1804
1805 if (src.reladdr) {
1806 src_reg temp = src_reg(this, glsl_type::int_type);
1807
1808 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1809
1810 index_reg = temp;
1811 }
1812
1813 src.reladdr = ralloc(mem_ctx, src_reg);
1814 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1815 }
1816
1817 /* If the type is smaller than a vec4, replicate the last channel out. */
1818 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1819 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1820 else
1821 src.swizzle = BRW_SWIZZLE_NOOP;
1822 src.type = brw_type_for_base_type(ir->type);
1823
1824 this->result = src;
1825 }
1826
1827 void
1828 vec4_visitor::visit(ir_dereference_record *ir)
1829 {
1830 unsigned int i;
1831 const glsl_type *struct_type = ir->record->type;
1832 int offset = 0;
1833
1834 ir->record->accept(this);
1835
1836 for (i = 0; i < struct_type->length; i++) {
1837 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1838 break;
1839 offset += type_size(struct_type->fields.structure[i].type);
1840 }
1841
1842 /* If the type is smaller than a vec4, replicate the last channel out. */
1843 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1844 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1845 else
1846 this->result.swizzle = BRW_SWIZZLE_NOOP;
1847 this->result.type = brw_type_for_base_type(ir->type);
1848
1849 this->result.reg_offset += offset;
1850 }
1851
1852 /**
1853 * We want to be careful in assignment setup to hit the actual storage
1854 * instead of potentially using a temporary like we might with the
1855 * ir_dereference handler.
1856 */
1857 static dst_reg
1858 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1859 {
1860 /* The LHS must be a dereference. If the LHS is a variable indexed array
 1861 * access of a vector, it must be separated into a series of conditional moves
1862 * before reaching this point (see ir_vec_index_to_cond_assign).
1863 */
1864 assert(ir->as_dereference());
1865 ir_dereference_array *deref_array = ir->as_dereference_array();
1866 if (deref_array) {
1867 assert(!deref_array->array->type->is_vector());
1868 }
1869
1870 /* Use the rvalue deref handler for the most part. We'll ignore
1871 * swizzles in it and write swizzles using writemask, though.
1872 */
1873 ir->accept(v);
1874 return dst_reg(v->result);
1875 }
1876
1877 void
1878 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1879 const struct glsl_type *type, uint32_t predicate)
1880 {
1881 if (type->base_type == GLSL_TYPE_STRUCT) {
1882 for (unsigned int i = 0; i < type->length; i++) {
1883 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1884 }
1885 return;
1886 }
1887
1888 if (type->is_array()) {
1889 for (unsigned int i = 0; i < type->length; i++) {
1890 emit_block_move(dst, src, type->fields.array, predicate);
1891 }
1892 return;
1893 }
1894
1895 if (type->is_matrix()) {
1896 const struct glsl_type *vec_type;
1897
1898 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1899 type->vector_elements, 1);
1900
1901 for (int i = 0; i < type->matrix_columns; i++) {
1902 emit_block_move(dst, src, vec_type, predicate);
1903 }
1904 return;
1905 }
1906
1907 assert(type->is_scalar() || type->is_vector());
1908
1909 dst->type = brw_type_for_base_type(type);
1910 src->type = dst->type;
1911
1912 dst->writemask = (1 << type->vector_elements) - 1;
1913
1914 src->swizzle = swizzle_for_size(type->vector_elements);
1915
1916 vec4_instruction *inst = emit(MOV(*dst, *src));
1917 inst->predicate = predicate;
1918
1919 dst->reg_offset++;
1920 src->reg_offset++;
1921 }
1922
1923
1924 /* If the RHS processing resulted in an instruction generating a
1925 * temporary value, and it would be easy to rewrite the instruction to
1926 * generate its result right into the LHS instead, do so. This ends
1927 * up reliably removing instructions where it can be tricky to do so
1928 * later without real UD chain information.
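 *
 * A small example (hypothetical GLSL, for illustration): for "a = b + c"
 * the RHS visit emits "ADD tmp, b, c" and the assignment would normally
 * append "MOV a, tmp"; when the checks below pass, the ADD's destination
 * is rewritten to a and the MOV is never emitted.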
1929 */
1930 bool
1931 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1932 dst_reg dst,
1933 src_reg src,
1934 vec4_instruction *pre_rhs_inst,
1935 vec4_instruction *last_rhs_inst)
1936 {
1937 /* This could be supported, but it would take more smarts. */
1938 if (ir->condition)
1939 return false;
1940
1941 if (pre_rhs_inst == last_rhs_inst)
1942 return false; /* No instructions generated to work with. */
1943
1944 /* Make sure the last instruction generated our source reg. */
1945 if (src.file != GRF ||
1946 src.file != last_rhs_inst->dst.file ||
1947 src.reg != last_rhs_inst->dst.reg ||
1948 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1949 src.reladdr ||
1950 src.abs ||
1951 src.negate ||
1952 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1953 return false;
1954
 1955 /* Check that the last instruction fully initialized the channels
1956 * we want to use, in the order we want to use them. We could
1957 * potentially reswizzle the operands of many instructions so that
1958 * we could handle out of order channels, but don't yet.
1959 */
1960
1961 for (unsigned i = 0; i < 4; i++) {
1962 if (dst.writemask & (1 << i)) {
1963 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1964 return false;
1965
1966 if (BRW_GET_SWZ(src.swizzle, i) != i)
1967 return false;
1968 }
1969 }
1970
1971 /* Success! Rewrite the instruction. */
1972 last_rhs_inst->dst.file = dst.file;
1973 last_rhs_inst->dst.reg = dst.reg;
1974 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1975 last_rhs_inst->dst.reladdr = dst.reladdr;
1976 last_rhs_inst->dst.writemask &= dst.writemask;
1977
1978 return true;
1979 }
1980
1981 void
1982 vec4_visitor::visit(ir_assignment *ir)
1983 {
1984 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1985 uint32_t predicate = BRW_PREDICATE_NONE;
1986
1987 if (!ir->lhs->type->is_scalar() &&
1988 !ir->lhs->type->is_vector()) {
1989 ir->rhs->accept(this);
1990 src_reg src = this->result;
1991
1992 if (ir->condition) {
1993 emit_bool_to_cond_code(ir->condition, &predicate);
1994 }
1995
1996 /* emit_block_move doesn't account for swizzles in the source register.
1997 * This should be ok, since the source register is a structure or an
1998 * array, and those can't be swizzled. But double-check to be sure.
1999 */
2000 assert(src.swizzle ==
2001 (ir->rhs->type->is_matrix()
2002 ? swizzle_for_size(ir->rhs->type->vector_elements)
2003 : BRW_SWIZZLE_NOOP));
2004
2005 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2006 return;
2007 }
2008
2009 /* Now we're down to just a scalar/vector with writemasks. */
2010 int i;
2011
2012 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2013 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2014
2015 ir->rhs->accept(this);
2016
2017 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2018
2019 src_reg src = this->result;
2020
2021 int swizzles[4];
2022 int first_enabled_chan = 0;
2023 int src_chan = 0;
2024
2025 assert(ir->lhs->type->is_vector() ||
2026 ir->lhs->type->is_scalar());
2027 dst.writemask = ir->write_mask;
2028
2029 for (int i = 0; i < 4; i++) {
2030 if (dst.writemask & (1 << i)) {
2031 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2032 break;
2033 }
2034 }
2035
2036 /* Swizzle a small RHS vector into the channels being written.
2037 *
2038 * glsl ir treats write_mask as dictating how many channels are
2039 * present on the RHS while in our instructions we need to make
2040 * those channels appear in the slots of the vec4 they're written to.
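 * For example, a vec2 RHS (swizzle .xyyy) assigned to the .yz channels of
 * the LHS ends up with source swizzle .yxyy: the written channels read the
 * first two RHS components, and the unwritten channels repeat a component
 * known to be valid.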
2041 */
2042 for (int i = 0; i < 4; i++) {
2043 if (dst.writemask & (1 << i))
2044 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2045 else
2046 swizzles[i] = first_enabled_chan;
2047 }
2048 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2049 swizzles[2], swizzles[3]);
2050
2051 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2052 return;
2053 }
2054
2055 if (ir->condition) {
2056 emit_bool_to_cond_code(ir->condition, &predicate);
2057 }
2058
2059 for (i = 0; i < type_size(ir->lhs->type); i++) {
2060 vec4_instruction *inst = emit(MOV(dst, src));
2061 inst->predicate = predicate;
2062
2063 dst.reg_offset++;
2064 src.reg_offset++;
2065 }
2066 }
2067
2068 void
2069 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2070 {
2071 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2072 foreach_list(node, &ir->components) {
2073 ir_constant *field_value = (ir_constant *)node;
2074
2075 emit_constant_values(dst, field_value);
2076 }
2077 return;
2078 }
2079
2080 if (ir->type->is_array()) {
2081 for (unsigned int i = 0; i < ir->type->length; i++) {
2082 emit_constant_values(dst, ir->array_elements[i]);
2083 }
2084 return;
2085 }
2086
2087 if (ir->type->is_matrix()) {
2088 for (int i = 0; i < ir->type->matrix_columns; i++) {
2089 float *vec = &ir->value.f[i * ir->type->vector_elements];
2090
2091 for (int j = 0; j < ir->type->vector_elements; j++) {
2092 dst->writemask = 1 << j;
2093 dst->type = BRW_REGISTER_TYPE_F;
2094
2095 emit(MOV(*dst, src_reg(vec[j])));
2096 }
2097 dst->reg_offset++;
2098 }
2099 return;
2100 }
2101
2102 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2103
2104 for (int i = 0; i < ir->type->vector_elements; i++) {
2105 if (!(remaining_writemask & (1 << i)))
2106 continue;
2107
2108 dst->writemask = 1 << i;
2109 dst->type = brw_type_for_base_type(ir->type);
2110
2111 /* Find other components that match the one we're about to
2112 * write. Emits fewer instructions for things like vec4(0.5,
2113 * 1.5, 1.5, 1.5).
2114 */
2115 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2116 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2117 if (ir->value.b[i] == ir->value.b[j])
2118 dst->writemask |= (1 << j);
2119 } else {
2120 /* u, i, and f storage all line up, so no need for a
2121 * switch case for comparing each type.
2122 */
2123 if (ir->value.u[i] == ir->value.u[j])
2124 dst->writemask |= (1 << j);
2125 }
2126 }
2127
2128 switch (ir->type->base_type) {
2129 case GLSL_TYPE_FLOAT:
2130 emit(MOV(*dst, src_reg(ir->value.f[i])));
2131 break;
2132 case GLSL_TYPE_INT:
2133 emit(MOV(*dst, src_reg(ir->value.i[i])));
2134 break;
2135 case GLSL_TYPE_UINT:
2136 emit(MOV(*dst, src_reg(ir->value.u[i])));
2137 break;
2138 case GLSL_TYPE_BOOL:
2139 emit(MOV(*dst, src_reg(ir->value.b[i])));
2140 break;
2141 default:
2142 assert(!"Non-float/uint/int/bool constant");
2143 break;
2144 }
2145
2146 remaining_writemask &= ~dst->writemask;
2147 }
2148 dst->reg_offset++;
2149 }
2150
2151 void
2152 vec4_visitor::visit(ir_constant *ir)
2153 {
2154 dst_reg dst = dst_reg(this, ir->type);
2155 this->result = src_reg(dst);
2156
2157 emit_constant_values(&dst, ir);
2158 }
2159
2160 void
2161 vec4_visitor::visit(ir_call *ir)
2162 {
2163 assert(!"not reached");
2164 }
2165
2166 void
2167 vec4_visitor::visit(ir_texture *ir)
2168 {
2169 int sampler =
2170 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2171
2172 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2173 * emitting anything other than setting up the constant result.
2174 */
2175 if (ir->op == ir_tg4) {
2176 ir_constant *chan = ir->lod_info.component->as_constant();
2177 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2178 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2179 dst_reg result(this, ir->type);
2180 this->result = src_reg(result);
2181 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2182 return;
2183 }
2184 }
2185
2186 /* Should be lowered by do_lower_texture_projection */
2187 assert(!ir->projector);
2188
2189 /* Generate code to compute all the subexpression trees. This has to be
2190 * done before loading any values into MRFs for the sampler message since
2191 * generating these values may involve SEND messages that need the MRFs.
2192 */
2193 src_reg coordinate;
2194 if (ir->coordinate) {
2195 ir->coordinate->accept(this);
2196 coordinate = this->result;
2197 }
2198
2199 src_reg shadow_comparitor;
2200 if (ir->shadow_comparitor) {
2201 ir->shadow_comparitor->accept(this);
2202 shadow_comparitor = this->result;
2203 }
2204
2205 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2206 src_reg lod, dPdx, dPdy, sample_index;
2207 switch (ir->op) {
2208 case ir_tex:
2209 lod = src_reg(0.0f);
2210 lod_type = glsl_type::float_type;
2211 break;
2212 case ir_txf:
2213 case ir_txl:
2214 case ir_txs:
2215 ir->lod_info.lod->accept(this);
2216 lod = this->result;
2217 lod_type = ir->lod_info.lod->type;
2218 break;
2219 case ir_query_levels:
2220 lod = src_reg(0);
2221 lod_type = glsl_type::int_type;
2222 break;
2223 case ir_txf_ms:
2224 ir->lod_info.sample_index->accept(this);
2225 sample_index = this->result;
2226 sample_index_type = ir->lod_info.sample_index->type;
2227 break;
2228 case ir_txd:
2229 ir->lod_info.grad.dPdx->accept(this);
2230 dPdx = this->result;
2231
2232 ir->lod_info.grad.dPdy->accept(this);
2233 dPdy = this->result;
2234
2235 lod_type = ir->lod_info.grad.dPdx->type;
2236 break;
2237 case ir_txb:
2238 case ir_lod:
2239 case ir_tg4:
2240 break;
2241 }
2242
2243 vec4_instruction *inst = NULL;
2244 switch (ir->op) {
2245 case ir_tex:
2246 case ir_txl:
2247 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2248 break;
2249 case ir_txd:
2250 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2251 break;
2252 case ir_txf:
2253 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2254 break;
2255 case ir_txf_ms:
2256 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2257 break;
2258 case ir_txs:
2259 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2260 break;
2261 case ir_tg4:
2262 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2263 break;
2264 case ir_query_levels:
2265 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2266 break;
2267 case ir_txb:
2268 assert(!"TXB is not valid for vertex shaders.");
2269 break;
2270 case ir_lod:
2271 assert(!"LOD is not valid for vertex shaders.");
2272 break;
2273 default:
2274 assert(!"Unrecognized tex op");
2275 }
2276
2277 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2278
2279 /* Texel offsets and the tg4 channel select go in the message header; Gen4 always needs a header. */
2280 inst->header_present = use_texture_offset || brw->gen < 5 || ir->op == ir_tg4;
2281 inst->base_mrf = 2;
2282 inst->mlen = inst->header_present + 1; /* always at least one */
2283 inst->sampler = sampler;
2284 inst->dst = dst_reg(this, ir->type);
2285 inst->dst.writemask = WRITEMASK_XYZW;
2286 inst->shadow_compare = ir->shadow_comparitor != NULL;
2287
2288 if (use_texture_offset)
2289 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2290
2291 /* Stuff the channel select bits in the top of the texture offset */
2292 if (ir->op == ir_tg4)
2293 inst->texture_offset |= gather_channel(ir, sampler)<<16;
2294
2295 /* MRF for the first parameter */
2296 int param_base = inst->base_mrf + inst->header_present;
2297
2298 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2299 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2300 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2301 } else {
2302 /* Load the coordinate */
2303 /* FINISHME: gl_clamp_mask and saturate */
2304 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2305 int zero_mask = 0xf & ~coord_mask;
2306
2307 if (ir->offset && ir->op == ir_txf) {
2308 /* It appears that the ld instruction used for txf does its
2309 * address bounds check before adding in the offset. To work
2310 * around this, just add the integer offset to the integer
2311 * texel coordinate, and don't put the offset in the header.
2312 */
2313 ir_constant *offset = ir->offset->as_constant();
2314 assert(offset);
2315
2316 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2317 src_reg src = coordinate;
2318 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2319 BRW_GET_SWZ(src.swizzle, j),
2320 BRW_GET_SWZ(src.swizzle, j),
2321 BRW_GET_SWZ(src.swizzle, j));
2322 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2323 src, offset->value.i[j]));
2324 }
2325 } else {
2326 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2327 coordinate));
2328 }
2329 if (zero_mask != 0) {
2330 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2331 src_reg(0)));
2332 }
2333 /* Load the shadow comparitor */
2334 if (ir->shadow_comparitor && ir->op != ir_txd) {
2335 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2336 WRITEMASK_X),
2337 shadow_comparitor));
2338 inst->mlen++;
2339 }
2340
2341 /* Load the LOD info */
2342 if (ir->op == ir_tex || ir->op == ir_txl) {
2343 int mrf, writemask;
2344 if (brw->gen >= 5) {
2345 mrf = param_base + 1;
2346 if (ir->shadow_comparitor) {
2347 writemask = WRITEMASK_Y;
2348 /* mlen already incremented */
2349 } else {
2350 writemask = WRITEMASK_X;
2351 inst->mlen++;
2352 }
2353 } else /* brw->gen == 4 */ {
2354 mrf = param_base;
2355 writemask = WRITEMASK_W;
2356 }
2357 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2358 } else if (ir->op == ir_txf) {
2359 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2360 } else if (ir->op == ir_txf_ms) {
2361 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2362 sample_index));
2363 inst->mlen++;
2364
2365 /* on Gen7, there is an additional MCS parameter here after SI,
2366 * but we don't bother to emit it since it's always zero. If
2367 * we start supporting texturing from CMS surfaces, this will have
2368 * to change.
2369 */
2370 } else if (ir->op == ir_txd) {
2371 const glsl_type *type = lod_type;
2372
2373 if (brw->gen >= 5) {
2374 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2375 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2376 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2377 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2378 inst->mlen++;
2379
2380 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2381 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2382 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2383 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2384 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2385 inst->mlen++;
2386
2387 if (ir->shadow_comparitor) {
2388 emit(MOV(dst_reg(MRF, param_base + 2,
2389 ir->shadow_comparitor->type, WRITEMASK_Z),
2390 shadow_comparitor));
2391 }
2392 }
2393 } else /* brw->gen == 4 */ {
2394 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2395 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2396 inst->mlen += 2;
2397 }
2398 }
2399 }
2400
2401 emit(inst);
2402
2403 /* Fix up the layer count (.z) for cube arrays: hardware returns faces * layers;
2404 * spec requires layers.
2405 */
2406 if (ir->op == ir_txs) {
2407 glsl_type const *type = ir->sampler->type;
2408 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2409 type->sampler_array) {
2410 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2411 with_writemask(inst->dst, WRITEMASK_Z),
2412 src_reg(inst->dst), src_reg(6));
2413 }
2414 }
2415
2416 swizzle_result(ir, src_reg(inst->dst), sampler);
2417 }
2418
2419 /**
2420 * Set up the gather channel based on the swizzle, for gather4.
2421 */
2422 uint32_t
2423 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2424 {
2425 ir_constant *chan = ir->lod_info.component->as_constant();
2426 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2427 switch (swiz) {
2428 case SWIZZLE_X: return 0;
2429 case SWIZZLE_Y:
2430 /* gather4 sampler is broken for green channel on RG32F --
2431 * we must ask for blue instead.
2432 */
2433 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2434 return 2;
2435 return 1;
2436 case SWIZZLE_Z: return 2;
2437 case SWIZZLE_W: return 3;
2438 default:
2439 assert(!"Not reached"); /* zero, one swizzles handled already */
2440 return 0;
2441 }
2442 }
2443
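/* Applies the GL texture swizzle for this sampler (key->tex.swizzles) to the
 * raw sampler result: directly mapped channels are copied with a remapped
 * source swizzle, while SWIZZLE_ZERO and SWIZZLE_ONE channels are filled
 * with immediate 0.0f / 1.0f MOVs.  txs, tg4, scalar float results and
 * no-op swizzles are copied through without applying the swizzle, and
 * ir_query_levels just takes the level count from the .w channel.
 */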
2444 void
2445 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2446 {
2447 int s = key->tex.swizzles[sampler];
2448
2449 this->result = src_reg(this, ir->type);
2450 dst_reg swizzled_result(this->result);
2451
2452 if (ir->op == ir_query_levels) {
2453 /* # levels is in .w */
2454 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2455 emit(MOV(swizzled_result, orig_val));
2456 return;
2457 }
2458
2459 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2460 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2461 emit(MOV(swizzled_result, orig_val));
2462 return;
2463 }
2464
2465
2466 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2467 int swizzle[4] = {0};
2468
2469 for (int i = 0; i < 4; i++) {
2470 switch (GET_SWZ(s, i)) {
2471 case SWIZZLE_ZERO:
2472 zero_mask |= (1 << i);
2473 break;
2474 case SWIZZLE_ONE:
2475 one_mask |= (1 << i);
2476 break;
2477 default:
2478 copy_mask |= (1 << i);
2479 swizzle[i] = GET_SWZ(s, i);
2480 break;
2481 }
2482 }
2483
2484 if (copy_mask) {
2485 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2486 swizzled_result.writemask = copy_mask;
2487 emit(MOV(swizzled_result, orig_val));
2488 }
2489
2490 if (zero_mask) {
2491 swizzled_result.writemask = zero_mask;
2492 emit(MOV(swizzled_result, src_reg(0.0f)));
2493 }
2494
2495 if (one_mask) {
2496 swizzled_result.writemask = one_mask;
2497 emit(MOV(swizzled_result, src_reg(1.0f)));
2498 }
2499 }
2500
2501 void
2502 vec4_visitor::visit(ir_return *ir)
2503 {
2504 assert(!"not reached");
2505 }
2506
2507 void
2508 vec4_visitor::visit(ir_discard *ir)
2509 {
2510 assert(!"not reached");
2511 }
2512
2513 void
2514 vec4_visitor::visit(ir_if *ir)
2515 {
2516 /* Don't point the annotation at the if statement, because then the
2517 * printed annotation would cover the whole if plus its then and else blocks.
2518 */
2519 this->base_ir = ir->condition;
2520
2521 if (brw->gen == 6) {
2522 emit_if_gen6(ir);
2523 } else {
2524 uint32_t predicate;
2525 emit_bool_to_cond_code(ir->condition, &predicate);
2526 emit(IF(predicate));
2527 }
2528
2529 visit_instructions(&ir->then_instructions);
2530
2531 if (!ir->else_instructions.is_empty()) {
2532 this->base_ir = ir->condition;
2533 emit(BRW_OPCODE_ELSE);
2534
2535 visit_instructions(&ir->else_instructions);
2536 }
2537
2538 this->base_ir = ir->condition;
2539 emit(BRW_OPCODE_ENDIF);
2540 }
2541
2542 void
2543 vec4_visitor::visit(ir_emit_vertex *)
2544 {
2545 assert(!"not reached");
2546 }
2547
2548 void
2549 vec4_visitor::visit(ir_end_primitive *)
2550 {
2551 assert(!"not reached");
2552 }
2553
2554 void
2555 vec4_visitor::emit_ndc_computation()
2556 {
2557 /* Get the position */
2558 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2559
2560 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2561 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2562 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2563
2564 current_annotation = "NDC";
2565 dst_reg ndc_w = ndc;
2566 ndc_w.writemask = WRITEMASK_W;
2567 src_reg pos_w = pos;
2568 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2569 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2570
2571 dst_reg ndc_xyz = ndc;
2572 ndc_xyz.writemask = WRITEMASK_XYZ;
2573
2574 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2575 }
2576
2577 void
2578 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2579 {
2580 if (brw->gen < 6 &&
2581 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2582 key->userclip_active || brw->has_negative_rhw_bug)) {
2583 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2584 dst_reg header1_w = header1;
2585 header1_w.writemask = WRITEMASK_W;
2586
2587 emit(MOV(header1, 0u));
2588
2589 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2590 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2591
2592 current_annotation = "Point size";
2593 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2594 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2595 }
2596
2597 if (key->userclip_active) {
2598 current_annotation = "Clipping flags";
2599 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2600 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2601
2602 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2603 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2604 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2605
2606 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2607 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2608 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2609 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2610 }
2611
2612 /* i965 clipping workaround:
2613 * 1) Test for -ve rhw
2614 * 2) If set,
2615 * set ndc = (0,0,0,0)
2616 * set ucp[6] = 1
2617 *
2618 * Later, clipping will detect ucp[6] and ensure the primitive is
2619 * clipped against all fixed planes.
2620 */
2621 if (brw->has_negative_rhw_bug) {
2622 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2623 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2624 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2625 vec4_instruction *inst;
2626 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2627 inst->predicate = BRW_PREDICATE_NORMAL;
2628 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2629 inst->predicate = BRW_PREDICATE_NORMAL;
2630 }
2631
2632 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2633 } else if (brw->gen < 6) {
2634 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2635 } else {
2636 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2637 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2638 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2639 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2640 }
2641 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2642 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2643 src_reg(output_reg[VARYING_SLOT_LAYER])));
2644 }
2645 }
2646 }
2647
2648 void
2649 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2650 {
2651 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2652 *
2653 * "If a linked set of shaders forming the vertex stage contains no
2654 * static write to gl_ClipVertex or gl_ClipDistance, but the
2655 * application has requested clipping against user clip planes through
2656 * the API, then the coordinate written to gl_Position is used for
2657 * comparison against the user clip planes."
2658 *
2659 * This function is only called if the shader didn't write to
2660 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2661 * if the user wrote to it; otherwise we use gl_Position.
2662 */
2663 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2664 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2665 clip_vertex = VARYING_SLOT_POS;
2666 }
2667
2668 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2669 ++i) {
2670 reg.writemask = 1 << i;
2671 emit(DP4(reg,
2672 src_reg(output_reg[clip_vertex]),
2673 src_reg(this->userplane[i + offset])));
2674 }
2675 }
2676
2677 void
2678 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2679 {
2680 assert (varying < VARYING_SLOT_MAX);
2681 reg.type = output_reg[varying].type;
2682 current_annotation = output_reg_annotation[varying];
2683 /* Copy the register, saturating if necessary */
2684 vec4_instruction *inst = emit(MOV(reg,
2685 src_reg(output_reg[varying])));
2686 if ((varying == VARYING_SLOT_COL0 ||
2687 varying == VARYING_SLOT_COL1 ||
2688 varying == VARYING_SLOT_BFC0 ||
2689 varying == VARYING_SLOT_BFC1) &&
2690 key->clamp_vertex_color) {
2691 inst->saturate = true;
2692 }
2693 }
2694
2695 void
2696 vec4_visitor::emit_urb_slot(int mrf, int varying)
2697 {
2698 struct brw_reg hw_reg = brw_message_reg(mrf);
2699 dst_reg reg = dst_reg(MRF, mrf);
2700 reg.type = BRW_REGISTER_TYPE_F;
2701
2702 switch (varying) {
2703 case VARYING_SLOT_PSIZ:
2704 /* PSIZ is always in slot 0, and is coupled with other flags. */
2705 current_annotation = "indices, point width, clip flags";
2706 emit_psiz_and_flags(hw_reg);
2707 break;
2708 case BRW_VARYING_SLOT_NDC:
2709 current_annotation = "NDC";
2710 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2711 break;
2712 case VARYING_SLOT_POS:
2713 current_annotation = "gl_Position";
2714 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2715 break;
2716 case VARYING_SLOT_EDGE:
2717 /* This is present when doing unfilled polygons. We're supposed to copy
2718 * the edge flag from the user-provided vertex array
2719 * (glEdgeFlagPointer); otherwise we'll copy from the current value
2720 * of that attribute (starts as 1.0f). This is then used in clipping to
2721 * determine which edges should be drawn as wireframe.
2722 */
2723 current_annotation = "edge flag";
2724 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2725 glsl_type::float_type, WRITEMASK_XYZW))));
2726 break;
2727 case BRW_VARYING_SLOT_PAD:
2728 /* No need to write to this slot */
2729 break;
2730 default:
2731 emit_generic_urb_slot(reg, varying);
2732 break;
2733 }
2734 }
2735
2736 static int
2737 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2738 {
2739 if (brw->gen >= 6) {
2740 /* URB data written (does not include the message header reg) must
2741 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2742 * section 5.4.3.2.2: URB_INTERLEAVED.
2743 *
2744 * URB entries are allocated on a multiple of 1024 bits, so an
2745 * extra 128 bits written here to make the end align to 256 is
2746 * no problem.
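 * For example, an mlen of 4 (header plus three data registers) is padded
 * to 5 so that four data registers are written.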
2747 */
2748 if ((mlen % 2) != 1)
2749 mlen++;
2750 }
2751
2752 return mlen;
2753 }
2754
2755
2756 /**
2757 * Generates the VUE payload plus the necessary URB write instructions to
2758 * output it.
2759 *
2760 * The VUE layout is documented in Volume 2a.
2761 */
2762 void
2763 vec4_visitor::emit_vertex()
2764 {
2765 /* MRF 0 is reserved for the debugger, so start with message header
2766 * in MRF 1.
2767 */
2768 int base_mrf = 1;
2769 int mrf = base_mrf;
2770 /* In the process of generating our URB write message contents, we
2771 * may need to unspill a register or load from an array. Those
2772 * reads would use MRFs 14-15.
2773 */
2774 int max_usable_mrf = 13;
2775
2776 /* The following assertion verifies that max_usable_mrf causes an
2777 * even number of URB write data registers, which will meet gen6's
2778 * requirements for length alignment.
2779 */
2780 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2781
2782 /* First mrf is the g0-based message header containing URB handles and
2783 * such.
2784 */
2785 emit_urb_write_header(mrf++);
2786
2787 if (brw->gen < 6) {
2788 emit_ndc_computation();
2789 }
2790
2791 /* Lower legacy ff and ClipVertex clipping to clip distances */
2792 if (key->userclip_active && !prog->UsesClipDistanceOut) {
2793 current_annotation = "user clip distances";
2794
2795 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2796 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2797
2798 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2799 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2800 }
2801
2802 /* We may need to split this up into several URB writes, so do them in a
2803 * loop.
2804 */
2805 int slot = 0;
2806 bool complete = false;
2807 do {
2808 /* URB offset is in URB row increments, and each of our MRFs is half of
2809 * one of those, since we're doing interleaved writes.
2810 */
2811 int offset = slot / 2;
2812
2813 mrf = base_mrf + 1;
2814 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2815 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2816
2817 /* If this was max_usable_mrf, we can't fit anything more into this
2818 * URB WRITE.
2819 */
2820 if (mrf > max_usable_mrf) {
2821 slot++;
2822 break;
2823 }
2824 }
2825
2826 complete = slot >= prog_data->vue_map.num_slots;
2827 current_annotation = "URB write";
2828 vec4_instruction *inst = emit_urb_write_opcode(complete);
2829 inst->base_mrf = base_mrf;
2830 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2831 inst->offset += offset;
2832 } while(!complete);
2833 }
2834
2835
2836 src_reg
2837 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2838 src_reg *reladdr, int reg_offset)
2839 {
2840 /* Because we store the values to scratch interleaved like our
2841 * vertex data, we need to scale the vec4 index by 2.
2842 */
2843 int message_header_scale = 2;
2844
2845 /* Pre-gen6, the message header uses byte offsets instead of vec4
2846 * (16-byte) offset units.
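 * E.g. a reg_offset of 3 becomes a message offset of 6 vec4 rows on Gen6+,
 * or 96 bytes (3 * 2 * 16) on a pre-gen6 part.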
2847 */
2848 if (brw->gen < 6)
2849 message_header_scale *= 16;
2850
2851 if (reladdr) {
2852 src_reg index = src_reg(this, glsl_type::int_type);
2853
2854 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2855 emit_before(inst, MUL(dst_reg(index),
2856 index, src_reg(message_header_scale)));
2857
2858 return index;
2859 } else {
2860 return src_reg(reg_offset * message_header_scale);
2861 }
2862 }
2863
2864 src_reg
2865 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2866 src_reg *reladdr, int reg_offset)
2867 {
2868 if (reladdr) {
2869 src_reg index = src_reg(this, glsl_type::int_type);
2870
2871 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2872
2873 /* Pre-gen6, the message header uses byte offsets instead of vec4
2874 * (16-byte) offset units.
2875 */
2876 if (brw->gen < 6) {
2877 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2878 }
2879
2880 return index;
2881 } else {
2882 int message_header_scale = brw->gen < 6 ? 16 : 1;
2883 return src_reg(reg_offset * message_header_scale);
2884 }
2885 }
2886
2887 /**
2888 * Emits an instruction before @inst to load the value named by @orig_src
2889 * from scratch space at @base_offset to @temp.
2890 *
2891 * @base_offset is measured in 32-byte units (the size of a register).
2892 */
2893 void
2894 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2895 dst_reg temp, src_reg orig_src,
2896 int base_offset)
2897 {
2898 int reg_offset = base_offset + orig_src.reg_offset;
2899 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2900
2901 emit_before(inst, SCRATCH_READ(temp, index));
2902 }
2903
2904 /**
2905 * Emits an instruction after @inst to store the value to be written
2906 * to @orig_dst to scratch space at @base_offset, from @temp.
2907 *
2908 * @base_offset is measured in 32-byte units (the size of a register).
2909 */
2910 void
2911 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2912 {
2913 int reg_offset = base_offset + inst->dst.reg_offset;
2914 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2915
2916 /* Create a temporary register to store *inst's result in.
2917 *
2918 * We have to be careful in MOVing from our temporary result register in
2919 * the scratch write. If we swizzle from channels of the temporary that
2920 * weren't initialized, it will confuse live interval analysis, which will
2921 * make spilling fail to make progress.
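 * E.g. for a .xz writemask the temporary is read with swizzle .xxzx, so
 * only channels the instruction actually wrote are ever sourced.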
2922 */
2923 src_reg temp = src_reg(this, glsl_type::vec4_type);
2924 temp.type = inst->dst.type;
2925 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2926 int swizzles[4];
2927 for (int i = 0; i < 4; i++)
2928 if (inst->dst.writemask & (1 << i))
2929 swizzles[i] = i;
2930 else
2931 swizzles[i] = first_writemask_chan;
2932 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2933 swizzles[2], swizzles[3]);
2934
2935 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2936 inst->dst.writemask));
2937 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2938 write->predicate = inst->predicate;
2939 write->ir = inst->ir;
2940 write->annotation = inst->annotation;
2941 inst->insert_after(write);
2942
2943 inst->dst.file = temp.file;
2944 inst->dst.reg = temp.reg;
2945 inst->dst.reg_offset = temp.reg_offset;
2946 inst->dst.reladdr = NULL;
2947 }
2948
2949 /**
2950 * We can't generally support array access in GRF space, because a
2951 * single instruction's destination can only span 2 contiguous
2952 * registers. So, we send all GRF arrays that get variable index
2953 * access to scratch space.
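 * For example, a local array indexed with a non-constant subscript gets a
 * scratch location assigned to its virtual GRF, and every access of that
 * GRF is then rewritten as a scratch read or write message.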
2954 */
2955 void
2956 vec4_visitor::move_grf_array_access_to_scratch()
2957 {
2958 int scratch_loc[this->virtual_grf_count];
2959
2960 for (int i = 0; i < this->virtual_grf_count; i++) {
2961 scratch_loc[i] = -1;
2962 }
2963
2964 /* First, calculate the set of virtual GRFs that need to be punted
2965 * to scratch due to having any array access on them, and where in
2966 * scratch.
2967 */
2968 foreach_list(node, &this->instructions) {
2969 vec4_instruction *inst = (vec4_instruction *)node;
2970
2971 if (inst->dst.file == GRF && inst->dst.reladdr &&
2972 scratch_loc[inst->dst.reg] == -1) {
2973 scratch_loc[inst->dst.reg] = c->last_scratch;
2974 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2975 }
2976
2977 for (int i = 0 ; i < 3; i++) {
2978 src_reg *src = &inst->src[i];
2979
2980 if (src->file == GRF && src->reladdr &&
2981 scratch_loc[src->reg] == -1) {
2982 scratch_loc[src->reg] = c->last_scratch;
2983 c->last_scratch += this->virtual_grf_sizes[src->reg];
2984 }
2985 }
2986 }
2987
2988 /* Now, for anything that will be accessed through scratch, rewrite
2989 * it to load/store. Note that this is a _safe list walk, because
2990 * we may generate a new scratch_write instruction after the one
2991 * we're processing.
2992 */
2993 foreach_list_safe(node, &this->instructions) {
2994 vec4_instruction *inst = (vec4_instruction *)node;
2995
2996 /* Set up the annotation tracking for new generated instructions. */
2997 base_ir = inst->ir;
2998 current_annotation = inst->annotation;
2999
3000 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3001 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3002 }
3003
3004 for (int i = 0 ; i < 3; i++) {
3005 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3006 continue;
3007
3008 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3009
3010 emit_scratch_read(inst, temp, inst->src[i],
3011 scratch_loc[inst->src[i].reg]);
3012
3013 inst->src[i].file = temp.file;
3014 inst->src[i].reg = temp.reg;
3015 inst->src[i].reg_offset = temp.reg_offset;
3016 inst->src[i].reladdr = NULL;
3017 }
3018 }
3019 }
3020
3021 /**
3022 * Emits an instruction before @inst to load the value named by @orig_src
3023 * from the pull constant buffer (surface) at @base_offset to @temp.
3024 */
3025 void
3026 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3027 dst_reg temp, src_reg orig_src,
3028 int base_offset)
3029 {
3030 int reg_offset = base_offset + orig_src.reg_offset;
3031 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3032 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3033 vec4_instruction *load;
3034
3035 if (brw->gen >= 7) {
3036 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3037 grf_offset.type = offset.type;
3038 emit_before(inst, MOV(grf_offset, offset));
3039
3040 load = new(mem_ctx) vec4_instruction(this,
3041 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3042 temp, index, src_reg(grf_offset));
3043 } else {
3044 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3045 temp, index, offset);
3046 load->base_mrf = 14;
3047 load->mlen = 1;
3048 }
3049 emit_before(inst, load);
3050 }
3051
3052 /**
3053 * Implements array access of uniforms by inserting a
3054 * PULL_CONSTANT_LOAD instruction.
3055 *
3056 * Unlike temporary GRF array access (which we don't support because of
3057 * the difficulty of doing relative addressing on instruction
3058 * destinations), we could potentially do array access of uniforms
3059 * that were loaded in GRF space as push constants. In real-world
3060 * usage we've seen, though, the arrays being used are always larger
3061 * than we could load as push constants, so just always move all
3062 * uniform array access out to a pull constant buffer.
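 * For example, a variably-indexed "uniform vec4 colors[64]" has its values
 * appended to pull_param, and each such access becomes a pull constant load
 * whose message offset incorporates the reladdr.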
3063 */
3064 void
3065 vec4_visitor::move_uniform_array_access_to_pull_constants()
3066 {
3067 int pull_constant_loc[this->uniforms];
3068
3069 for (int i = 0; i < this->uniforms; i++) {
3070 pull_constant_loc[i] = -1;
3071 }
3072
3073 /* Walk through and find array access of uniforms. Put a copy of that
3074 * uniform in the pull constant buffer.
3075 *
3076 * Note that we don't move constant-indexed accesses to arrays. No
3077 * testing has been done of the performance impact of this choice.
3078 */
3079 foreach_list_safe(node, &this->instructions) {
3080 vec4_instruction *inst = (vec4_instruction *)node;
3081
3082 for (int i = 0 ; i < 3; i++) {
3083 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3084 continue;
3085
3086 int uniform = inst->src[i].reg;
3087
3088 /* If this array isn't already present in the pull constant buffer,
3089 * add it.
3090 */
3091 if (pull_constant_loc[uniform] == -1) {
3092 const float **values = &prog_data->param[uniform * 4];
3093
3094 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
3095
3096 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3097 prog_data->pull_param[prog_data->nr_pull_params++]
3098 = values[j];
3099 }
3100 }
3101
3102 /* Set up the annotation tracking for new generated instructions. */
3103 base_ir = inst->ir;
3104 current_annotation = inst->annotation;
3105
3106 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3107
3108 emit_pull_constant_load(inst, temp, inst->src[i],
3109 pull_constant_loc[uniform]);
3110
3111 inst->src[i].file = temp.file;
3112 inst->src[i].reg = temp.reg;
3113 inst->src[i].reg_offset = temp.reg_offset;
3114 inst->src[i].reladdr = NULL;
3115 }
3116 }
3117
3118 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3119 * no need to track them as larger-than-vec4 objects. This will be
3120 * relied on in cutting out unused uniform vectors from push
3121 * constants.
3122 */
3123 split_uniform_registers();
3124 }
3125
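/* If a UD-typed source has its negate flag set, resolve it into a fresh
 * unsigned temporary with an explicit MOV and use that temporary instead,
 * so later code never sees a negated unsigned operand.
 */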
3126 void
3127 vec4_visitor::resolve_ud_negate(src_reg *reg)
3128 {
3129 if (reg->type != BRW_REGISTER_TYPE_UD ||
3130 !reg->negate)
3131 return;
3132
3133 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3134 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3135 *reg = temp;
3136 }
3137
3138 vec4_visitor::vec4_visitor(struct brw_context *brw,
3139 struct brw_vec4_compile *c,
3140 struct gl_program *prog,
3141 const struct brw_vec4_prog_key *key,
3142 struct brw_vec4_prog_data *prog_data,
3143 struct gl_shader_program *shader_prog,
3144 struct brw_shader *shader,
3145 void *mem_ctx,
3146 bool debug_flag,
3147 bool no_spills)
3148 : debug_flag(debug_flag), no_spills(no_spills)
3149 {
3150 this->brw = brw;
3151 this->ctx = &brw->ctx;
3152 this->shader_prog = shader_prog;
3153 this->shader = shader;
3154
3155 this->mem_ctx = mem_ctx;
3156 this->failed = false;
3157
3158 this->base_ir = NULL;
3159 this->current_annotation = NULL;
3160 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3161
3162 this->c = c;
3163 this->prog = prog;
3164 this->key = key;
3165 this->prog_data = prog_data;
3166 this->stage_prog_data = &prog_data->base;
3167
3168 this->variable_ht = hash_table_ctor(0,
3169 hash_table_pointer_hash,
3170 hash_table_pointer_compare);
3171
3172 this->virtual_grf_start = NULL;
3173 this->virtual_grf_end = NULL;
3174 this->virtual_grf_sizes = NULL;
3175 this->virtual_grf_count = 0;
3176 this->virtual_grf_reg_map = NULL;
3177 this->virtual_grf_reg_count = 0;
3178 this->virtual_grf_array_size = 0;
3179 this->live_intervals_valid = false;
3180
3181 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3182
3183 this->uniforms = 0;
3184 }
3185
3186 vec4_visitor::~vec4_visitor()
3187 {
3188 hash_table_dtor(this->variable_ht);
3189 }
3190
3191
3192 void
3193 vec4_visitor::fail(const char *format, ...)
3194 {
3195 va_list va;
3196 char *msg;
3197
3198 if (failed)
3199 return;
3200
3201 failed = true;
3202
3203 va_start(va, format);
3204 msg = ralloc_vasprintf(mem_ctx, format, va);
3205 va_end(va);
3206 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3207
3208 this->fail_msg = msg;
3209
3210 if (debug_flag) {
3211 fprintf(stderr, "%s", msg);
3212 }
3213 }
3214
3215 } /* namespace brw */