i965: For color clears, only disable writes to components that exist.
mesa.git: src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->sampler = 0;
47 this->texture_offset = 0;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = v->base_ir;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_present = false;
53 this->mlen = 0;
54 this->base_mrf = 0;
55 this->offset = 0;
56 this->annotation = v->current_annotation;
57 }
58
59 vec4_instruction *
60 vec4_visitor::emit(vec4_instruction *inst)
61 {
62 this->instructions.push_tail(inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
69 {
70 new_inst->ir = inst->ir;
71 new_inst->annotation = inst->annotation;
72
73 inst->insert_before(new_inst);
74
75 return inst;
76 }
77
78 vec4_instruction *
79 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
80 src_reg src0, src_reg src1, src_reg src2)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
83 src0, src1, src2));
84 }
85
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
89 {
90 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
91 }
92
93 vec4_instruction *
94 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
95 {
96 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
101 {
102 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode)
107 {
108 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
109 }
110
111 #define ALU1(op) \
112 vec4_instruction * \
113 vec4_visitor::op(dst_reg dst, src_reg src0) \
114 { \
115 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
116 src0); \
117 }
118
119 #define ALU2(op) \
120 vec4_instruction * \
121 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
122 { \
123 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
124 src0, src1); \
125 }
126
127 #define ALU3(op) \
128 vec4_instruction * \
129 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
130 { \
131 assert(brw->gen >= 6); \
132 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
133 src0, src1, src2); \
134 }
135
136 ALU1(NOT)
137 ALU1(MOV)
138 ALU1(FRC)
139 ALU1(RNDD)
140 ALU1(RNDE)
141 ALU1(RNDZ)
142 ALU1(F32TO16)
143 ALU1(F16TO32)
144 ALU2(ADD)
145 ALU2(MUL)
146 ALU2(MACH)
147 ALU2(AND)
148 ALU2(OR)
149 ALU2(XOR)
150 ALU2(DP3)
151 ALU2(DP4)
152 ALU2(DPH)
153 ALU2(SHL)
154 ALU2(SHR)
155 ALU2(ASR)
156 ALU3(LRP)
157 ALU1(BFREV)
158 ALU3(BFE)
159 ALU2(BFI1)
160 ALU3(BFI2)
161 ALU1(FBH)
162 ALU1(FBL)
163 ALU1(CBIT)
164 ALU3(MAD)
165 ALU2(ADDC)
166 ALU2(SUBB)
167
168 /** Gen4 predicated IF. */
169 vec4_instruction *
170 vec4_visitor::IF(uint32_t predicate)
171 {
172 vec4_instruction *inst;
173
174 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
175 inst->predicate = predicate;
176
177 return inst;
178 }
179
180 /** Gen6 IF with embedded comparison. */
181 vec4_instruction *
182 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
183 {
184 assert(brw->gen == 6);
185
186 vec4_instruction *inst;
187
188 resolve_ud_negate(&src0);
189 resolve_ud_negate(&src1);
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
192 src0, src1);
193 inst->conditional_mod = condition;
194
195 return inst;
196 }
197
198 /**
199 * CMP: Sets the low bit of the destination channels with the result
200 * of the comparison, while the upper bits are undefined, and updates
201 * the flag register with the packed 16 bits of the result.
202 */
203 vec4_instruction *
204 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
205 {
206 vec4_instruction *inst;
207
208 /* original gen4 does type conversion to the destination type
209 * before comparison, producing garbage results for floating
210 * point comparisons.
211 */
212 if (brw->gen == 4) {
213 dst.type = src0.type;
214 if (dst.file == HW_REG)
215 dst.fixed_hw_reg.type = dst.type;
216 }
217
218 resolve_ud_negate(&src0);
219 resolve_ud_negate(&src1);
220
221 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
222 inst->conditional_mod = condition;
223
224 return inst;
225 }
226
227 vec4_instruction *
228 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
229 {
230 vec4_instruction *inst;
231
232 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
233 dst, index);
234 inst->base_mrf = 14;
235 inst->mlen = 2;
236
237 return inst;
238 }
239
240 vec4_instruction *
241 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
242 {
243 vec4_instruction *inst;
244
245 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
246 dst, src, index);
247 inst->base_mrf = 13;
248 inst->mlen = 3;
249
250 return inst;
251 }
252
253 void
254 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
255 {
256 static enum opcode dot_opcodes[] = {
257 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
258 };
259
260 emit(dot_opcodes[elements - 2], dst, src0, src1);
261 }
262
263 src_reg
264 vec4_visitor::fix_3src_operand(src_reg src)
265 {
266 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
267 * able to use vertical stride of zero to replicate the vec4 uniform, like
268 *
269 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
270 *
271 * But you can't, since vertical stride is always four in three-source
272 * instructions. Instead, insert a MOV instruction to do the replication so
273 * that the three-source instruction can consume it.
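 *
 * Illustrative sketch (register names are made up): a MAD with a vec4
 * uniform operand becomes
 *
 *    MOV tmp, u0
 *    MAD dst, src0, tmp, src2
 *
 * so the three-source instruction reads the replicated value from a GRF.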
274 */
275
276 /* The MOV is only needed if the source is a uniform or immediate. */
277 if (src.file != UNIFORM && src.file != IMM)
278 return src;
279
280 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
281 return src;
282
283 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
284 expanded.type = src.type;
285 emit(MOV(expanded, src));
286 return src_reg(expanded);
287 }
288
289 src_reg
290 vec4_visitor::fix_math_operand(src_reg src)
291 {
292 /* The gen6 math instruction ignores the source modifiers --
293 * swizzle, abs, negate, and at least some parts of the register
294 * region description.
295 *
296 * Rather than trying to enumerate all these cases, *always* expand the
297 * operand to a temp GRF for gen6.
298 *
299 * For gen7, keep the operand as-is, except if immediate, which gen7 still
300 * can't use.
301 */
302
303 if (brw->gen == 7 && src.file != IMM)
304 return src;
305
306 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
307 expanded.type = src.type;
308 emit(MOV(expanded, src));
309 return src_reg(expanded);
310 }
311
312 void
313 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
314 {
315 src = fix_math_operand(src);
316
317 if (dst.writemask != WRITEMASK_XYZW) {
318 /* The gen6 math instruction must be align1, so we can't do
319 * writemasks.
320 */
321 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
322
323 emit(opcode, temp_dst, src);
324
325 emit(MOV(dst, src_reg(temp_dst)));
326 } else {
327 emit(opcode, dst, src);
328 }
329 }
330
331 void
332 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
333 {
334 vec4_instruction *inst = emit(opcode, dst, src);
335 inst->base_mrf = 1;
336 inst->mlen = 1;
337 }
338
339 void
340 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
341 {
342 switch (opcode) {
343 case SHADER_OPCODE_RCP:
344 case SHADER_OPCODE_RSQ:
345 case SHADER_OPCODE_SQRT:
346 case SHADER_OPCODE_EXP2:
347 case SHADER_OPCODE_LOG2:
348 case SHADER_OPCODE_SIN:
349 case SHADER_OPCODE_COS:
350 break;
351 default:
352 assert(!"not reached: bad math opcode");
353 return;
354 }
355
356 if (brw->gen >= 6) {
357 return emit_math1_gen6(opcode, dst, src);
358 } else {
359 return emit_math1_gen4(opcode, dst, src);
360 }
361 }
362
363 void
364 vec4_visitor::emit_math2_gen6(enum opcode opcode,
365 dst_reg dst, src_reg src0, src_reg src1)
366 {
367 src0 = fix_math_operand(src0);
368 src1 = fix_math_operand(src1);
369
370 if (dst.writemask != WRITEMASK_XYZW) {
371 /* The gen6 math instruction must be align1, so we can't do
372 * writemasks.
373 */
374 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
375 temp_dst.type = dst.type;
376
377 emit(opcode, temp_dst, src0, src1);
378
379 emit(MOV(dst, src_reg(temp_dst)));
380 } else {
381 emit(opcode, dst, src0, src1);
382 }
383 }
384
385 void
386 vec4_visitor::emit_math2_gen4(enum opcode opcode,
387 dst_reg dst, src_reg src0, src_reg src1)
388 {
389 vec4_instruction *inst = emit(opcode, dst, src0, src1);
390 inst->base_mrf = 1;
391 inst->mlen = 2;
392 }
393
394 void
395 vec4_visitor::emit_math(enum opcode opcode,
396 dst_reg dst, src_reg src0, src_reg src1)
397 {
398 switch (opcode) {
399 case SHADER_OPCODE_POW:
400 case SHADER_OPCODE_INT_QUOTIENT:
401 case SHADER_OPCODE_INT_REMAINDER:
402 break;
403 default:
404 assert(!"not reached: unsupported binary math opcode");
405 return;
406 }
407
408 if (brw->gen >= 6) {
409 return emit_math2_gen6(opcode, dst, src0, src1);
410 } else {
411 return emit_math2_gen4(opcode, dst, src0, src1);
412 }
413 }
414
415 void
416 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
417 {
418 if (brw->gen < 7)
419 assert(!"ir_unop_pack_half_2x16 should be lowered");
420
421 assert(dst.type == BRW_REGISTER_TYPE_UD);
422 assert(src0.type == BRW_REGISTER_TYPE_F);
423
424 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
425 *
426 * Because this instruction does not have a 16-bit floating-point type,
427 * the destination data type must be Word (W).
428 *
429 * The destination must be DWord-aligned and specify a horizontal stride
430 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
431 * each destination channel and the upper word is not modified.
432 *
433 * The above restriction implies that the f32to16 instruction must use
434 * align1 mode, because only in align1 mode is it possible to specify
435 * horizontal stride. We choose here to defy the hardware docs and emit
436 * align16 instructions.
437 *
438 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
439 * instructions. I was partially successful in that the code passed all
440 * tests. However, the code was dubiously correct and fragile, and the
441 * tests were not harsh enough to probe that frailty. Not trusting the
442 * code, I chose instead to remain in align16 mode in defiance of the hw
443 * docs).
444 *
445 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
446 * simulator, emitting a f32to16 in align16 mode with UD as destination
447 * data type is safe. The behavior differs from that specified in the PRM
448 * in that the upper word of each destination channel is cleared to 0.
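 *
 * Illustrative example: packHalf2x16(vec2(1.0, -2.0)) should produce
 * 0xc0003c00, since 1.0 and -2.0 are 0x3c00 and 0xc000 in half precision.
 * The f32to16 below leaves 0x00003c00 in tmp.x and 0x0000c000 in tmp.y,
 * and the SHL/OR pair combines them into 0xc0003c00.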
449 */
450
451 dst_reg tmp_dst(this, glsl_type::uvec2_type);
452 src_reg tmp_src(tmp_dst);
453
454 #if 0
455 /* Verify the undocumented behavior on which the following instructions
456 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
457 * then the result of the bit-or instruction below will be incorrect.
458 *
459 * You should inspect the disasm output in order to verify that the MOV is
460 * not optimized away.
461 */
462 emit(MOV(tmp_dst, src_reg(0x12345678u)));
463 #endif
464
465 /* Give tmp the form below, where "." means untouched.
466 *
467 * w z y x w z y x
468 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
469 *
470 * That the upper word of each write-channel be 0 is required for the
471 * following bit-shift and bit-or instructions to work. Note that this
472 * relies on the undocumented hardware behavior mentioned above.
473 */
474 tmp_dst.writemask = WRITEMASK_XY;
475 emit(F32TO16(tmp_dst, src0));
476
477 /* Give the write-channels of dst the form:
478 * 0xhhhh0000
479 */
480 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
481 emit(SHL(dst, tmp_src, src_reg(16u)));
482
483 /* Finally, give the write-channels of dst the form of packHalf2x16's
484 * output:
485 * 0xhhhhllll
486 */
487 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
488 emit(OR(dst, src_reg(dst), tmp_src));
489 }
490
491 void
492 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
493 {
494 if (brw->gen < 7)
495 assert(!"ir_unop_unpack_half_2x16 should be lowered");
496
497 assert(dst.type == BRW_REGISTER_TYPE_F);
498 assert(src0.type == BRW_REGISTER_TYPE_UD);
499
500 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
501 *
502 * Because this instruction does not have a 16-bit floating-point type,
503 * the source data type must be Word (W). The destination type must be
504 * F (Float).
505 *
506 * To use W as the source data type, we must adjust horizontal strides,
507 * which is only possible in align1 mode. All my [chadv] attempts at
508 * emitting align1 instructions for unpackHalf2x16 failed to pass the
509 * Piglit tests, so I gave up.
510 *
511 * I've verified that, on gen7 hardware and the simulator, it is safe to
512 * emit f16to32 in align16 mode with UD as source data type.
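 *
 * Illustrative example: for src0 = 0xc0003c00, the AND/SHR below leave
 * 0x00003c00 in tmp.x and 0x0000c000 in tmp.y, and the f16to32 expands
 * those to dst.xy = (1.0, -2.0), matching unpackHalf2x16().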
513 */
514
515 dst_reg tmp_dst(this, glsl_type::uvec2_type);
516 src_reg tmp_src(tmp_dst);
517
518 tmp_dst.writemask = WRITEMASK_X;
519 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
520
521 tmp_dst.writemask = WRITEMASK_Y;
522 emit(SHR(tmp_dst, src0, src_reg(16u)));
523
524 dst.writemask = WRITEMASK_XY;
525 emit(F16TO32(dst, tmp_src));
526 }
527
528 void
529 vec4_visitor::visit_instructions(const exec_list *list)
530 {
531 foreach_list(node, list) {
532 ir_instruction *ir = (ir_instruction *)node;
533
534 base_ir = ir;
535 ir->accept(this);
536 }
537 }
538
539
540 static int
541 type_size(const struct glsl_type *type)
542 {
543 unsigned int i;
544 int size;
545
546 switch (type->base_type) {
547 case GLSL_TYPE_UINT:
548 case GLSL_TYPE_INT:
549 case GLSL_TYPE_FLOAT:
550 case GLSL_TYPE_BOOL:
551 if (type->is_matrix()) {
552 return type->matrix_columns;
553 } else {
554 /* Regardless of the size of the vector, it gets a vec4. This is bad
555 * packing for things like floats, but otherwise arrays become a
556 * mess. Hopefully a later pass over the code can pack scalars
557 * down if appropriate.
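 *
 * For example, a float and a vec4 each take one vec4 slot here, while
 * a mat3 (handled above) takes 3 slots and a float[4] array (handled
 * below) takes 4.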
558 */
559 return 1;
560 }
561 case GLSL_TYPE_ARRAY:
562 assert(type->length > 0);
563 return type_size(type->fields.array) * type->length;
564 case GLSL_TYPE_STRUCT:
565 size = 0;
566 for (i = 0; i < type->length; i++) {
567 size += type_size(type->fields.structure[i].type);
568 }
569 return size;
570 case GLSL_TYPE_SAMPLER:
571 /* Samplers take up one slot in UNIFORMS[], but they're baked in
572 * at link time.
573 */
574 return 1;
575 case GLSL_TYPE_ATOMIC_UINT:
576 return 0;
577 case GLSL_TYPE_IMAGE:
578 case GLSL_TYPE_VOID:
579 case GLSL_TYPE_ERROR:
580 case GLSL_TYPE_INTERFACE:
581 assert(0);
582 break;
583 }
584
585 return 0;
586 }
587
588 int
589 vec4_visitor::virtual_grf_alloc(int size)
590 {
591 if (virtual_grf_array_size <= virtual_grf_count) {
592 if (virtual_grf_array_size == 0)
593 virtual_grf_array_size = 16;
594 else
595 virtual_grf_array_size *= 2;
596 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
597 virtual_grf_array_size);
598 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
599 virtual_grf_array_size);
600 }
601 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
602 virtual_grf_reg_count += size;
603 virtual_grf_sizes[virtual_grf_count] = size;
604 return virtual_grf_count++;
605 }
606
607 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
608 {
609 init();
610
611 this->file = GRF;
612 this->reg = v->virtual_grf_alloc(type_size(type));
613
614 if (type->is_array() || type->is_record()) {
615 this->swizzle = BRW_SWIZZLE_NOOP;
616 } else {
617 this->swizzle = swizzle_for_size(type->vector_elements);
618 }
619
620 this->type = brw_type_for_base_type(type);
621 }
622
623 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
624 {
625 init();
626
627 this->file = GRF;
628 this->reg = v->virtual_grf_alloc(type_size(type));
629
630 if (type->is_array() || type->is_record()) {
631 this->writemask = WRITEMASK_XYZW;
632 } else {
633 this->writemask = (1 << type->vector_elements) - 1;
634 }
635
636 this->type = brw_type_for_base_type(type);
637 }
638
639 /* Our support for uniforms is piggy-backed on the struct
640 * gl_fragment_program, because that's where the values actually
641 * get stored, rather than in some global gl_shader_program uniform
642 * store.
643 */
644 void
645 vec4_visitor::setup_uniform_values(ir_variable *ir)
646 {
647 int namelen = strlen(ir->name);
648
649 /* The data for our (non-builtin) uniforms is stored in a series of
650 * gl_uniform_driver_storage structs for each subcomponent that
651 * glGetUniformLocation() could name. We know it's been set up in the same
652 * order we'd walk the type, so walk the list of storage and find anything
653 * with our name, or a subcomponent whose name starts with our name.
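 *
 * Illustrative example: for "uniform mat2 m[2]", the matching storage
 * entry covers MAX2(2, 1) array elements * 2 matrix columns = 4 vec4
 * slots; each slot below records a vector_size of 2 and points the
 * unused z/w channels at zero.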
654 */
655 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
656 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
657
658 if (strncmp(ir->name, storage->name, namelen) != 0 ||
659 (storage->name[namelen] != 0 &&
660 storage->name[namelen] != '.' &&
661 storage->name[namelen] != '[')) {
662 continue;
663 }
664
665 gl_constant_value *components = storage->storage;
666 unsigned vector_count = (MAX2(storage->array_elements, 1) *
667 storage->type->matrix_columns);
668
669 for (unsigned s = 0; s < vector_count; s++) {
670 assert(uniforms < uniform_array_size);
671 uniform_vector_size[uniforms] = storage->type->vector_elements;
672
673 int i;
674 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
675 stage_prog_data->param[uniforms * 4 + i] = &components->f;
676 components++;
677 }
678 for (; i < 4; i++) {
679 static float zero = 0;
680 stage_prog_data->param[uniforms * 4 + i] = &zero;
681 }
682
683 uniforms++;
684 }
685 }
686 }
687
688 void
689 vec4_visitor::setup_uniform_clipplane_values()
690 {
691 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
692
693 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
694 assert(this->uniforms < uniform_array_size);
695 this->uniform_vector_size[this->uniforms] = 4;
696 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
697 this->userplane[i].type = BRW_REGISTER_TYPE_F;
698 for (int j = 0; j < 4; ++j) {
699 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
700 }
701 ++this->uniforms;
702 }
703 }
704
705 /* Our support for builtin uniforms is even scarier than non-builtin.
706 * It sits on top of the PROG_STATE_VAR parameters that are
707 * automatically updated from GL context state.
708 */
709 void
710 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
711 {
712 const ir_state_slot *const slots = ir->state_slots;
713 assert(ir->state_slots != NULL);
714
715 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
716 /* This state reference has already been setup by ir_to_mesa,
717 * but we'll get the same index back here. We can reference
718 * ParameterValues directly, since unlike brw_fs.cpp, we never
719 * add new state references during compile.
720 */
721 int index = _mesa_add_state_reference(this->prog->Parameters,
722 (gl_state_index *)slots[i].tokens);
723 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
724
725 assert(this->uniforms < uniform_array_size);
726 this->uniform_vector_size[this->uniforms] = 0;
727 /* Add each of the unique swizzled channels of the element.
728 * This will end up matching the size of the glsl_type of this field.
729 */
730 int last_swiz = -1;
731 for (unsigned int j = 0; j < 4; j++) {
732 int swiz = GET_SWZ(slots[i].swizzle, j);
733 last_swiz = swiz;
734
735 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
736 assert(this->uniforms < uniform_array_size);
737 if (swiz <= last_swiz)
738 this->uniform_vector_size[this->uniforms]++;
739 }
740 this->uniforms++;
741 }
742 }
743
744 dst_reg *
745 vec4_visitor::variable_storage(ir_variable *var)
746 {
747 return (dst_reg *)hash_table_find(this->variable_ht, var);
748 }
749
750 void
751 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
752 {
753 ir_expression *expr = ir->as_expression();
754
755 *predicate = BRW_PREDICATE_NORMAL;
756
757 if (expr) {
758 src_reg op[2];
759 vec4_instruction *inst;
760
761 assert(expr->get_num_operands() <= 2);
762 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
763 expr->operands[i]->accept(this);
764 op[i] = this->result;
765
766 resolve_ud_negate(&op[i]);
767 }
768
769 switch (expr->operation) {
770 case ir_unop_logic_not:
771 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
772 inst->conditional_mod = BRW_CONDITIONAL_Z;
773 break;
774
775 case ir_binop_logic_xor:
776 inst = emit(XOR(dst_null_d(), op[0], op[1]));
777 inst->conditional_mod = BRW_CONDITIONAL_NZ;
778 break;
779
780 case ir_binop_logic_or:
781 inst = emit(OR(dst_null_d(), op[0], op[1]));
782 inst->conditional_mod = BRW_CONDITIONAL_NZ;
783 break;
784
785 case ir_binop_logic_and:
786 inst = emit(AND(dst_null_d(), op[0], op[1]));
787 inst->conditional_mod = BRW_CONDITIONAL_NZ;
788 break;
789
790 case ir_unop_f2b:
791 if (brw->gen >= 6) {
792 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
793 } else {
794 inst = emit(MOV(dst_null_f(), op[0]));
795 inst->conditional_mod = BRW_CONDITIONAL_NZ;
796 }
797 break;
798
799 case ir_unop_i2b:
800 if (brw->gen >= 6) {
801 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
802 } else {
803 inst = emit(MOV(dst_null_d(), op[0]));
804 inst->conditional_mod = BRW_CONDITIONAL_NZ;
805 }
806 break;
807
808 case ir_binop_all_equal:
809 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
810 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
811 break;
812
813 case ir_binop_any_nequal:
814 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
815 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
816 break;
817
818 case ir_unop_any:
819 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
820 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
821 break;
822
823 case ir_binop_greater:
824 case ir_binop_gequal:
825 case ir_binop_less:
826 case ir_binop_lequal:
827 case ir_binop_equal:
828 case ir_binop_nequal:
829 emit(CMP(dst_null_d(), op[0], op[1],
830 brw_conditional_for_comparison(expr->operation)));
831 break;
832
833 default:
834 assert(!"not reached");
835 break;
836 }
837 return;
838 }
839
840 ir->accept(this);
841
842 resolve_ud_negate(&this->result);
843
844 if (brw->gen >= 6) {
845 vec4_instruction *inst = emit(AND(dst_null_d(),
846 this->result, src_reg(1)));
847 inst->conditional_mod = BRW_CONDITIONAL_NZ;
848 } else {
849 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
850 inst->conditional_mod = BRW_CONDITIONAL_NZ;
851 }
852 }
853
854 /**
855 * Emit a gen6 IF statement with the comparison folded into the IF
856 * instruction.
857 */
858 void
859 vec4_visitor::emit_if_gen6(ir_if *ir)
860 {
861 ir_expression *expr = ir->condition->as_expression();
862
863 if (expr) {
864 src_reg op[2];
865 dst_reg temp;
866
867 assert(expr->get_num_operands() <= 2);
868 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
869 expr->operands[i]->accept(this);
870 op[i] = this->result;
871 }
872
873 switch (expr->operation) {
874 case ir_unop_logic_not:
875 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
876 return;
877
878 case ir_binop_logic_xor:
879 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
880 return;
881
882 case ir_binop_logic_or:
883 temp = dst_reg(this, glsl_type::bool_type);
884 emit(OR(temp, op[0], op[1]));
885 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
886 return;
887
888 case ir_binop_logic_and:
889 temp = dst_reg(this, glsl_type::bool_type);
890 emit(AND(temp, op[0], op[1]));
891 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
892 return;
893
894 case ir_unop_f2b:
895 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
896 return;
897
898 case ir_unop_i2b:
899 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
900 return;
901
902 case ir_binop_greater:
903 case ir_binop_gequal:
904 case ir_binop_less:
905 case ir_binop_lequal:
906 case ir_binop_equal:
907 case ir_binop_nequal:
908 emit(IF(op[0], op[1],
909 brw_conditional_for_comparison(expr->operation)));
910 return;
911
912 case ir_binop_all_equal:
913 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
914 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
915 return;
916
917 case ir_binop_any_nequal:
918 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
919 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
920 return;
921
922 case ir_unop_any:
923 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
924 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
925 return;
926
927 default:
928 assert(!"not reached");
929 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
930 return;
931 }
932 return;
933 }
934
935 ir->condition->accept(this);
936
937 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
938 }
939
940 void
941 vec4_visitor::visit(ir_variable *ir)
942 {
943 dst_reg *reg = NULL;
944
945 if (variable_storage(ir))
946 return;
947
948 switch (ir->data.mode) {
949 case ir_var_shader_in:
950 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
951 break;
952
953 case ir_var_shader_out:
954 reg = new(mem_ctx) dst_reg(this, ir->type);
955
956 for (int i = 0; i < type_size(ir->type); i++) {
957 output_reg[ir->data.location + i] = *reg;
958 output_reg[ir->data.location + i].reg_offset = i;
959 output_reg[ir->data.location + i].type =
960 brw_type_for_base_type(ir->type->get_scalar_type());
961 output_reg_annotation[ir->data.location + i] = ir->name;
962 }
963 break;
964
965 case ir_var_auto:
966 case ir_var_temporary:
967 reg = new(mem_ctx) dst_reg(this, ir->type);
968 break;
969
970 case ir_var_uniform:
971 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
972
973 /* Thanks to the lower_ubo_reference pass, we will see only
974 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
975 * variables, so no need for them to be in variable_ht.
976 *
977 * Atomic counters take no uniform storage, no need to do
978 * anything here.
979 */
980 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
981 return;
982
983 /* Track how big the whole uniform variable is, in case we need to put a
984 * copy of its data into pull constants for array access.
985 */
986 assert(this->uniforms < uniform_array_size);
987 this->uniform_size[this->uniforms] = type_size(ir->type);
988
989 if (!strncmp(ir->name, "gl_", 3)) {
990 setup_builtin_uniform_values(ir);
991 } else {
992 setup_uniform_values(ir);
993 }
994 break;
995
996 case ir_var_system_value:
997 reg = make_reg_for_system_value(ir);
998 break;
999
1000 default:
1001 assert(!"not reached");
1002 }
1003
1004 reg->type = brw_type_for_base_type(ir->type);
1005 hash_table_insert(this->variable_ht, reg, ir);
1006 }
1007
1008 void
1009 vec4_visitor::visit(ir_loop *ir)
1010 {
1011 /* We don't want debugging output to print the whole body of the
1012 * loop as the annotation.
1013 */
1014 this->base_ir = NULL;
1015
1016 emit(BRW_OPCODE_DO);
1017
1018 visit_instructions(&ir->body_instructions);
1019
1020 emit(BRW_OPCODE_WHILE);
1021 }
1022
1023 void
1024 vec4_visitor::visit(ir_loop_jump *ir)
1025 {
1026 switch (ir->mode) {
1027 case ir_loop_jump::jump_break:
1028 emit(BRW_OPCODE_BREAK);
1029 break;
1030 case ir_loop_jump::jump_continue:
1031 emit(BRW_OPCODE_CONTINUE);
1032 break;
1033 }
1034 }
1035
1036
1037 void
1038 vec4_visitor::visit(ir_function_signature *ir)
1039 {
1040 assert(0);
1041 (void)ir;
1042 }
1043
1044 void
1045 vec4_visitor::visit(ir_function *ir)
1046 {
1047 /* Ignore function bodies other than main() -- we shouldn't see calls to
1048 * them since they should all be inlined.
1049 */
1050 if (strcmp(ir->name, "main") == 0) {
1051 const ir_function_signature *sig;
1052 exec_list empty;
1053
1054 sig = ir->matching_signature(NULL, &empty);
1055
1056 assert(sig);
1057
1058 visit_instructions(&sig->body);
1059 }
1060 }
1061
1062 bool
1063 vec4_visitor::try_emit_sat(ir_expression *ir)
1064 {
1065 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1066 if (!sat_src)
1067 return false;
1068
1069 sat_src->accept(this);
1070 src_reg src = this->result;
1071
1072 this->result = src_reg(this, ir->type);
1073 vec4_instruction *inst;
1074 inst = emit(MOV(dst_reg(this->result), src));
1075 inst->saturate = true;
1076
1077 return true;
1078 }
1079
1080 bool
1081 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1082 {
1083 /* 3-src instructions were introduced in gen6. */
1084 if (brw->gen < 6)
1085 return false;
1086
1087 /* MAD can only handle floating-point data. */
1088 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1089 return false;
1090
1091 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1092 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1093
1094 if (!mul || mul->operation != ir_binop_mul)
1095 return false;
1096
1097 nonmul->accept(this);
1098 src_reg src0 = fix_3src_operand(this->result);
1099
1100 mul->operands[0]->accept(this);
1101 src_reg src1 = fix_3src_operand(this->result);
1102
1103 mul->operands[1]->accept(this);
1104 src_reg src2 = fix_3src_operand(this->result);
1105
1106 this->result = src_reg(this, ir->type);
1107 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1108
1109 return true;
1110 }
1111
1112 void
1113 vec4_visitor::emit_bool_comparison(unsigned int op,
1114 dst_reg dst, src_reg src0, src_reg src1)
1115 {
1116 /* original gen4 does destination conversion before comparison. */
1117 if (brw->gen < 5)
1118 dst.type = src0.type;
1119
1120 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1121
1122 dst.type = BRW_REGISTER_TYPE_D;
1123 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1124 }
1125
1126 void
1127 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1128 src_reg src0, src_reg src1)
1129 {
1130 vec4_instruction *inst;
1131
1132 if (brw->gen >= 6) {
1133 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1134 inst->conditional_mod = conditionalmod;
1135 } else {
1136 emit(CMP(dst, src0, src1, conditionalmod));
1137
1138 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1139 inst->predicate = BRW_PREDICATE_NORMAL;
1140 }
1141 }
1142
1143 void
1144 vec4_visitor::emit_lrp(const dst_reg &dst,
1145 const src_reg &x, const src_reg &y, const src_reg &a)
1146 {
1147 if (brw->gen >= 6) {
1148 /* Note that the instruction's argument order is reversed from GLSL
1149 * and the IR.
1150 */
1151 emit(LRP(dst,
1152 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1153 } else {
1154 /* Earlier generations don't support three source operations, so we
1155 * need to emit x*(1-a) + y*a.
1156 *
1157 * A better way to do this would be:
1158 * ADD one_minus_a, negate(a), 1.0f
1159 * MUL null, y, a
1160 * MAC dst, x, one_minus_a
1161 * but we would need to support MAC and implicit accumulator.
1162 */
1163 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1164 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1165 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1166 y_times_a.writemask = dst.writemask;
1167 one_minus_a.writemask = dst.writemask;
1168 x_times_one_minus_a.writemask = dst.writemask;
1169
1170 emit(MUL(y_times_a, y, a));
1171 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1172 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1173 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1174 }
1175 }
1176
1177 static bool
1178 is_16bit_constant(ir_rvalue *rvalue)
1179 {
1180 ir_constant *constant = rvalue->as_constant();
1181 if (!constant)
1182 return false;
1183
1184 if (constant->type != glsl_type::int_type &&
1185 constant->type != glsl_type::uint_type)
1186 return false;
1187
1188 return constant->value.u[0] < (1 << 16);
1189 }
1190
1191 void
1192 vec4_visitor::visit(ir_expression *ir)
1193 {
1194 unsigned int operand;
1195 src_reg op[Elements(ir->operands)];
1196 src_reg result_src;
1197 dst_reg result_dst;
1198 vec4_instruction *inst;
1199
1200 if (try_emit_sat(ir))
1201 return;
1202
1203 if (ir->operation == ir_binop_add) {
1204 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1205 return;
1206 }
1207
1208 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1209 this->result.file = BAD_FILE;
1210 ir->operands[operand]->accept(this);
1211 if (this->result.file == BAD_FILE) {
1212 fprintf(stderr, "Failed to get tree for expression operand:\n");
1213 ir->operands[operand]->fprint(stderr);
1214 exit(1);
1215 }
1216 op[operand] = this->result;
1217
1218 /* Matrix expression operands should have been broken down to vector
1219 * operations already.
1220 */
1221 assert(!ir->operands[operand]->type->is_matrix());
1222 }
1223
1224 int vector_elements = ir->operands[0]->type->vector_elements;
1225 if (ir->operands[1]) {
1226 vector_elements = MAX2(vector_elements,
1227 ir->operands[1]->type->vector_elements);
1228 }
1229
1230 this->result.file = BAD_FILE;
1231
1232 /* Storage for our result. Ideally for an assignment we'd be using
1233 * the actual storage for the result here, instead.
1234 */
1235 result_src = src_reg(this, ir->type);
1236 /* convenience for the emit functions below. */
1237 result_dst = dst_reg(result_src);
1238 /* If nothing special happens, this is the result. */
1239 this->result = result_src;
1240 /* Limit writes to the channels that will be used by result_src later.
1241 * This does limit this temp's use as a temporary for multi-instruction
1242 * sequences.
1243 */
1244 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1245
1246 switch (ir->operation) {
1247 case ir_unop_logic_not:
1248 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1249 * ones complement of the whole register, not just bit 0.
1250 */
1251 emit(XOR(result_dst, op[0], src_reg(1)));
1252 break;
1253 case ir_unop_neg:
1254 op[0].negate = !op[0].negate;
1255 emit(MOV(result_dst, op[0]));
1256 break;
1257 case ir_unop_abs:
1258 op[0].abs = true;
1259 op[0].negate = false;
1260 emit(MOV(result_dst, op[0]));
1261 break;
1262
1263 case ir_unop_sign:
1264 if (ir->type->is_float()) {
1265 /* AND(val, 0x80000000) gives the sign bit.
1266 *
1267 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1268 * zero.
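 *
 * Illustrative example: for a channel holding -3.5f, the CMP below sets
 * the flag (the value is nonzero), the AND leaves 0x80000000, and the
 * predicated OR produces 0x80000000 | 0x3f800000 = 0xbf800000 = -1.0f.
 * A zero input skips the OR and stays 0x00000000 = 0.0f.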
1269 */
1270 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1271
1272 op[0].type = BRW_REGISTER_TYPE_UD;
1273 result_dst.type = BRW_REGISTER_TYPE_UD;
1274 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1275
1276 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1277 inst->predicate = BRW_PREDICATE_NORMAL;
1278
1279 this->result.type = BRW_REGISTER_TYPE_F;
1280 } else {
1281 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1282 * -> non-negative val generates 0x00000000.
1283 * Predicated OR sets 1 if val is positive.
1284 */
1285 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1286
1287 emit(ASR(result_dst, op[0], src_reg(31)));
1288
1289 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1290 inst->predicate = BRW_PREDICATE_NORMAL;
1291 }
1292 break;
1293
1294 case ir_unop_rcp:
1295 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1296 break;
1297
1298 case ir_unop_exp2:
1299 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1300 break;
1301 case ir_unop_log2:
1302 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1303 break;
1304 case ir_unop_exp:
1305 case ir_unop_log:
1306 assert(!"not reached: should be handled by ir_explog_to_explog2");
1307 break;
1308 case ir_unop_sin:
1309 case ir_unop_sin_reduced:
1310 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1311 break;
1312 case ir_unop_cos:
1313 case ir_unop_cos_reduced:
1314 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1315 break;
1316
1317 case ir_unop_dFdx:
1318 case ir_unop_dFdy:
1319 assert(!"derivatives not valid in vertex shader");
1320 break;
1321
1322 case ir_unop_bitfield_reverse:
1323 emit(BFREV(result_dst, op[0]));
1324 break;
1325 case ir_unop_bit_count:
1326 emit(CBIT(result_dst, op[0]));
1327 break;
1328 case ir_unop_find_msb: {
1329 src_reg temp = src_reg(this, glsl_type::uint_type);
1330
1331 inst = emit(FBH(dst_reg(temp), op[0]));
1332 inst->dst.writemask = WRITEMASK_XYZW;
1333
1334 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1335 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1336 * subtract the result from 31 to convert the MSB count into an LSB count.
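 *
 * Illustrative example: for an input of 0x00000100, FBH reports the set
 * bit as 23 positions down from bit 31, and 31 - 23 = 8 matches
 * findMSB().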
1337 */
1338
1339 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1340 temp.swizzle = BRW_SWIZZLE_NOOP;
1341 emit(MOV(result_dst, temp));
1342
1343 src_reg src_tmp = src_reg(result_dst);
1344 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1345
1346 src_tmp.negate = true;
1347 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1348 inst->predicate = BRW_PREDICATE_NORMAL;
1349 break;
1350 }
1351 case ir_unop_find_lsb:
1352 emit(FBL(result_dst, op[0]));
1353 break;
1354
1355 case ir_unop_noise:
1356 assert(!"not reached: should be handled by lower_noise");
1357 break;
1358
1359 case ir_binop_add:
1360 emit(ADD(result_dst, op[0], op[1]));
1361 break;
1362 case ir_binop_sub:
1363 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1364 break;
1365
1366 case ir_binop_mul:
1367 if (brw->gen < 8 && ir->type->is_integer()) {
1368 /* For integer multiplication, the MUL uses the low 16 bits of one of
1369 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1370 * accumulates the contribution of the upper 16 bits of that
1371 * operand. If we can determine that one of the args is in the low
1372 * 16 bits, though, we can just emit a single MUL.
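 *
 * Illustrative example: "x * 42" qualifies (42 < 1 << 16), so a single
 * MUL with 42 routed to the 16-bit source slot is enough; a general
 * "a * b" needs the MUL/MACH/MOV sequence below.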
1373 */
1374 if (is_16bit_constant(ir->operands[0])) {
1375 if (brw->gen < 7)
1376 emit(MUL(result_dst, op[0], op[1]));
1377 else
1378 emit(MUL(result_dst, op[1], op[0]));
1379 } else if (is_16bit_constant(ir->operands[1])) {
1380 if (brw->gen < 7)
1381 emit(MUL(result_dst, op[1], op[0]));
1382 else
1383 emit(MUL(result_dst, op[0], op[1]));
1384 } else {
1385 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1386
1387 emit(MUL(acc, op[0], op[1]));
1388 emit(MACH(dst_null_d(), op[0], op[1]));
1389 emit(MOV(result_dst, src_reg(acc)));
1390 }
1391 } else {
1392 emit(MUL(result_dst, op[0], op[1]));
1393 }
1394 break;
1395 case ir_binop_imul_high: {
1396 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1397
1398 emit(MUL(acc, op[0], op[1]));
1399 emit(MACH(result_dst, op[0], op[1]));
1400 break;
1401 }
1402 case ir_binop_div:
1403 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1404 assert(ir->type->is_integer());
1405 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1406 break;
1407 case ir_binop_carry: {
1408 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1409
1410 emit(ADDC(dst_null_ud(), op[0], op[1]));
1411 emit(MOV(result_dst, src_reg(acc)));
1412 break;
1413 }
1414 case ir_binop_borrow: {
1415 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1416
1417 emit(SUBB(dst_null_ud(), op[0], op[1]));
1418 emit(MOV(result_dst, src_reg(acc)));
1419 break;
1420 }
1421 case ir_binop_mod:
1422 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1423 assert(ir->type->is_integer());
1424 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1425 break;
1426
1427 case ir_binop_less:
1428 case ir_binop_greater:
1429 case ir_binop_lequal:
1430 case ir_binop_gequal:
1431 case ir_binop_equal:
1432 case ir_binop_nequal: {
1433 emit(CMP(result_dst, op[0], op[1],
1434 brw_conditional_for_comparison(ir->operation)));
1435 emit(AND(result_dst, result_src, src_reg(0x1)));
1436 break;
1437 }
1438
1439 case ir_binop_all_equal:
1440 /* "==" operator producing a scalar boolean. */
1441 if (ir->operands[0]->type->is_vector() ||
1442 ir->operands[1]->type->is_vector()) {
1443 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1444 emit(MOV(result_dst, src_reg(0)));
1445 inst = emit(MOV(result_dst, src_reg(1)));
1446 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1447 } else {
1448 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1449 emit(AND(result_dst, result_src, src_reg(0x1)));
1450 }
1451 break;
1452 case ir_binop_any_nequal:
1453 /* "!=" operator producing a scalar boolean. */
1454 if (ir->operands[0]->type->is_vector() ||
1455 ir->operands[1]->type->is_vector()) {
1456 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1457
1458 emit(MOV(result_dst, src_reg(0)));
1459 inst = emit(MOV(result_dst, src_reg(1)));
1460 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1461 } else {
1462 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1463 emit(AND(result_dst, result_src, src_reg(0x1)));
1464 }
1465 break;
1466
1467 case ir_unop_any:
1468 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1469 emit(MOV(result_dst, src_reg(0)));
1470
1471 inst = emit(MOV(result_dst, src_reg(1)));
1472 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1473 break;
1474
1475 case ir_binop_logic_xor:
1476 emit(XOR(result_dst, op[0], op[1]));
1477 break;
1478
1479 case ir_binop_logic_or:
1480 emit(OR(result_dst, op[0], op[1]));
1481 break;
1482
1483 case ir_binop_logic_and:
1484 emit(AND(result_dst, op[0], op[1]));
1485 break;
1486
1487 case ir_binop_dot:
1488 assert(ir->operands[0]->type->is_vector());
1489 assert(ir->operands[0]->type == ir->operands[1]->type);
1490 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1491 break;
1492
1493 case ir_unop_sqrt:
1494 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1495 break;
1496 case ir_unop_rsq:
1497 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1498 break;
1499
1500 case ir_unop_bitcast_i2f:
1501 case ir_unop_bitcast_u2f:
1502 this->result = op[0];
1503 this->result.type = BRW_REGISTER_TYPE_F;
1504 break;
1505
1506 case ir_unop_bitcast_f2i:
1507 this->result = op[0];
1508 this->result.type = BRW_REGISTER_TYPE_D;
1509 break;
1510
1511 case ir_unop_bitcast_f2u:
1512 this->result = op[0];
1513 this->result.type = BRW_REGISTER_TYPE_UD;
1514 break;
1515
1516 case ir_unop_i2f:
1517 case ir_unop_i2u:
1518 case ir_unop_u2i:
1519 case ir_unop_u2f:
1520 case ir_unop_b2f:
1521 case ir_unop_b2i:
1522 case ir_unop_f2i:
1523 case ir_unop_f2u:
1524 emit(MOV(result_dst, op[0]));
1525 break;
1526 case ir_unop_f2b:
1527 case ir_unop_i2b: {
1528 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1529 emit(AND(result_dst, result_src, src_reg(1)));
1530 break;
1531 }
1532
1533 case ir_unop_trunc:
1534 emit(RNDZ(result_dst, op[0]));
1535 break;
1536 case ir_unop_ceil:
1537 op[0].negate = !op[0].negate;
1538 inst = emit(RNDD(result_dst, op[0]));
1539 this->result.negate = true;
1540 break;
1541 case ir_unop_floor:
1542 inst = emit(RNDD(result_dst, op[0]));
1543 break;
1544 case ir_unop_fract:
1545 inst = emit(FRC(result_dst, op[0]));
1546 break;
1547 case ir_unop_round_even:
1548 emit(RNDE(result_dst, op[0]));
1549 break;
1550
1551 case ir_binop_min:
1552 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1553 break;
1554 case ir_binop_max:
1555 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1556 break;
1557
1558 case ir_binop_pow:
1559 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1560 break;
1561
1562 case ir_unop_bit_not:
1563 inst = emit(NOT(result_dst, op[0]));
1564 break;
1565 case ir_binop_bit_and:
1566 inst = emit(AND(result_dst, op[0], op[1]));
1567 break;
1568 case ir_binop_bit_xor:
1569 inst = emit(XOR(result_dst, op[0], op[1]));
1570 break;
1571 case ir_binop_bit_or:
1572 inst = emit(OR(result_dst, op[0], op[1]));
1573 break;
1574
1575 case ir_binop_lshift:
1576 inst = emit(SHL(result_dst, op[0], op[1]));
1577 break;
1578
1579 case ir_binop_rshift:
1580 if (ir->type->base_type == GLSL_TYPE_INT)
1581 inst = emit(ASR(result_dst, op[0], op[1]));
1582 else
1583 inst = emit(SHR(result_dst, op[0], op[1]));
1584 break;
1585
1586 case ir_binop_bfm:
1587 emit(BFI1(result_dst, op[0], op[1]));
1588 break;
1589
1590 case ir_binop_ubo_load: {
1591 ir_constant *uniform_block = ir->operands[0]->as_constant();
1592 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1593 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1594 src_reg offset;
1595
1596 /* Now, load the vector from that offset. */
1597 assert(ir->type->is_vector() || ir->type->is_scalar());
1598
1599 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1600 packed_consts.type = result.type;
1601 src_reg surf_index =
1602 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1603 if (const_offset_ir) {
1604 if (brw->gen >= 8) {
1605 /* Store the offset in a GRF so we can send-from-GRF. */
1606 offset = src_reg(this, glsl_type::int_type);
1607 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1608 } else {
1609 /* Immediates are fine on older generations since they'll be moved
1610 * to a (potentially fake) MRF at the generator level.
1611 */
1612 offset = src_reg(const_offset / 16);
1613 }
1614 } else {
1615 offset = src_reg(this, glsl_type::uint_type);
1616 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1617 }
1618
1619 if (brw->gen >= 7) {
1620 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1621 grf_offset.type = offset.type;
1622
1623 emit(MOV(grf_offset, offset));
1624
1625 emit(new(mem_ctx) vec4_instruction(this,
1626 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1627 dst_reg(packed_consts),
1628 surf_index,
1629 src_reg(grf_offset)));
1630 } else {
1631 vec4_instruction *pull =
1632 emit(new(mem_ctx) vec4_instruction(this,
1633 VS_OPCODE_PULL_CONSTANT_LOAD,
1634 dst_reg(packed_consts),
1635 surf_index,
1636 offset));
1637 pull->base_mrf = 14;
1638 pull->mlen = 1;
1639 }
1640
1641 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1642 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1643 const_offset % 16 / 4,
1644 const_offset % 16 / 4,
1645 const_offset % 16 / 4);
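/* Illustrative example: a vec2 at const_offset 24 is read from the second
 * 16-byte slot of the block (24 / 16 == 1, folded into the offset above)
 * and starts at component 2 (24 % 16 / 4), so the swizzle becomes ZWWW.
 */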
1646
1647 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1648 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1649 emit(CMP(result_dst, packed_consts, src_reg(0u),
1650 BRW_CONDITIONAL_NZ));
1651 emit(AND(result_dst, result, src_reg(0x1)));
1652 } else {
1653 emit(MOV(result_dst, packed_consts));
1654 }
1655 break;
1656 }
1657
1658 case ir_binop_vector_extract:
1659 assert(!"should have been lowered by vec_index_to_cond_assign");
1660 break;
1661
1662 case ir_triop_fma:
1663 op[0] = fix_3src_operand(op[0]);
1664 op[1] = fix_3src_operand(op[1]);
1665 op[2] = fix_3src_operand(op[2]);
1666 /* Note that the instruction's argument order is reversed from GLSL
1667 * and the IR.
1668 */
1669 emit(MAD(result_dst, op[2], op[1], op[0]));
1670 break;
1671
1672 case ir_triop_lrp:
1673 emit_lrp(result_dst, op[0], op[1], op[2]);
1674 break;
1675
1676 case ir_triop_csel:
1677 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1678 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1679 inst->predicate = BRW_PREDICATE_NORMAL;
1680 break;
1681
1682 case ir_triop_bfi:
1683 op[0] = fix_3src_operand(op[0]);
1684 op[1] = fix_3src_operand(op[1]);
1685 op[2] = fix_3src_operand(op[2]);
1686 emit(BFI2(result_dst, op[0], op[1], op[2]));
1687 break;
1688
1689 case ir_triop_bitfield_extract:
1690 op[0] = fix_3src_operand(op[0]);
1691 op[1] = fix_3src_operand(op[1]);
1692 op[2] = fix_3src_operand(op[2]);
1693 /* Note that the instruction's argument order is reversed from GLSL
1694 * and the IR.
1695 */
1696 emit(BFE(result_dst, op[2], op[1], op[0]));
1697 break;
1698
1699 case ir_triop_vector_insert:
1700 assert(!"should have been lowered by lower_vector_insert");
1701 break;
1702
1703 case ir_quadop_bitfield_insert:
1704 assert(!"not reached: should be handled by "
1705 "bitfield_insert_to_bfm_bfi\n");
1706 break;
1707
1708 case ir_quadop_vector:
1709 assert(!"not reached: should be handled by lower_quadop_vector");
1710 break;
1711
1712 case ir_unop_pack_half_2x16:
1713 emit_pack_half_2x16(result_dst, op[0]);
1714 break;
1715 case ir_unop_unpack_half_2x16:
1716 emit_unpack_half_2x16(result_dst, op[0]);
1717 break;
1718 case ir_unop_pack_snorm_2x16:
1719 case ir_unop_pack_snorm_4x8:
1720 case ir_unop_pack_unorm_2x16:
1721 case ir_unop_pack_unorm_4x8:
1722 case ir_unop_unpack_snorm_2x16:
1723 case ir_unop_unpack_snorm_4x8:
1724 case ir_unop_unpack_unorm_2x16:
1725 case ir_unop_unpack_unorm_4x8:
1726 assert(!"not reached: should be handled by lower_packing_builtins");
1727 break;
1728 case ir_unop_unpack_half_2x16_split_x:
1729 case ir_unop_unpack_half_2x16_split_y:
1730 case ir_binop_pack_half_2x16_split:
1731 assert(!"not reached: should not occur in vertex shader");
1732 break;
1733 case ir_binop_ldexp:
1734 assert(!"not reached: should be handled by ldexp_to_arith()");
1735 break;
1736 }
1737 }
1738
1739
1740 void
1741 vec4_visitor::visit(ir_swizzle *ir)
1742 {
1743 src_reg src;
1744 int i = 0;
1745 int swizzle[4];
1746
1747 /* Note that this is only swizzles in expressions, not those on the left
1748 * hand side of an assignment, which do write masking. See ir_assignment
1749 * for that.
1750 */
1751
1752 ir->val->accept(this);
1753 src = this->result;
1754 assert(src.file != BAD_FILE);
1755
1756 for (i = 0; i < ir->type->vector_elements; i++) {
1757 switch (i) {
1758 case 0:
1759 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1760 break;
1761 case 1:
1762 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1763 break;
1764 case 2:
1765 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1766 break;
1767 case 3:
1768 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1769 break;
1770 }
1771 }
1772 for (; i < 4; i++) {
1773 /* Replicate the last channel out. */
1774 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1775 }
1776
1777 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1778
1779 this->result = src;
1780 }
1781
1782 void
1783 vec4_visitor::visit(ir_dereference_variable *ir)
1784 {
1785 const struct glsl_type *type = ir->type;
1786 dst_reg *reg = variable_storage(ir->var);
1787
1788 if (!reg) {
1789 fail("Failed to find variable storage for %s\n", ir->var->name);
1790 this->result = src_reg(brw_null_reg());
1791 return;
1792 }
1793
1794 this->result = src_reg(*reg);
1795
1796 /* System values get their swizzle from the dst_reg writemask */
1797 if (ir->var->data.mode == ir_var_system_value)
1798 return;
1799
1800 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1801 this->result.swizzle = swizzle_for_size(type->vector_elements);
1802 }
1803
1804
1805 int
1806 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1807 {
1808 /* Under normal circumstances array elements are stored consecutively, so
1809 * the stride is equal to the size of the array element.
1810 */
1811 return type_size(ir->type);
1812 }
1813
1814
1815 void
1816 vec4_visitor::visit(ir_dereference_array *ir)
1817 {
1818 ir_constant *constant_index;
1819 src_reg src;
1820 int array_stride = compute_array_stride(ir);
1821
1822 constant_index = ir->array_index->constant_expression_value();
1823
1824 ir->array->accept(this);
1825 src = this->result;
1826
1827 if (constant_index) {
1828 src.reg_offset += constant_index->value.i[0] * array_stride;
1829 } else {
1830 /* Variable index array dereference. It eats the "vec4" of the
1831 * base of the array and an index that offsets the Mesa register
1832 * index.
1833 */
1834 ir->array_index->accept(this);
1835
1836 src_reg index_reg;
1837
1838 if (array_stride == 1) {
1839 index_reg = this->result;
1840 } else {
1841 index_reg = src_reg(this, glsl_type::int_type);
1842
1843 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1844 }
1845
1846 if (src.reladdr) {
1847 src_reg temp = src_reg(this, glsl_type::int_type);
1848
1849 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1850
1851 index_reg = temp;
1852 }
1853
1854 src.reladdr = ralloc(mem_ctx, src_reg);
1855 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1856 }
1857
1858 /* If the type is smaller than a vec4, replicate the last channel out. */
1859 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1860 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1861 else
1862 src.swizzle = BRW_SWIZZLE_NOOP;
1863 src.type = brw_type_for_base_type(ir->type);
1864
1865 this->result = src;
1866 }
1867
1868 void
1869 vec4_visitor::visit(ir_dereference_record *ir)
1870 {
1871 unsigned int i;
1872 const glsl_type *struct_type = ir->record->type;
1873 int offset = 0;
1874
1875 ir->record->accept(this);
1876
1877 for (i = 0; i < struct_type->length; i++) {
1878 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1879 break;
1880 offset += type_size(struct_type->fields.structure[i].type);
1881 }
1882
1883 /* If the type is smaller than a vec4, replicate the last channel out. */
1884 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1885 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1886 else
1887 this->result.swizzle = BRW_SWIZZLE_NOOP;
1888 this->result.type = brw_type_for_base_type(ir->type);
1889
1890 this->result.reg_offset += offset;
1891 }
1892
1893 /**
1894 * We want to be careful in assignment setup to hit the actual storage
1895 * instead of potentially using a temporary like we might with the
1896 * ir_dereference handler.
1897 */
1898 static dst_reg
1899 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1900 {
1901 /* The LHS must be a dereference. If the LHS is a variable indexed array
1902 * access of a vector, it must be separated into a series of conditional moves
1903 * before reaching this point (see ir_vec_index_to_cond_assign).
1904 */
1905 assert(ir->as_dereference());
1906 ir_dereference_array *deref_array = ir->as_dereference_array();
1907 if (deref_array) {
1908 assert(!deref_array->array->type->is_vector());
1909 }
1910
1911 /* Use the rvalue deref handler for the most part. We'll ignore
1912 * swizzles in it and write swizzles using writemask, though.
1913 */
1914 ir->accept(v);
1915 return dst_reg(v->result);
1916 }
1917
1918 void
1919 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1920 const struct glsl_type *type, uint32_t predicate)
1921 {
1922 if (type->base_type == GLSL_TYPE_STRUCT) {
1923 for (unsigned int i = 0; i < type->length; i++) {
1924 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1925 }
1926 return;
1927 }
1928
1929 if (type->is_array()) {
1930 for (unsigned int i = 0; i < type->length; i++) {
1931 emit_block_move(dst, src, type->fields.array, predicate);
1932 }
1933 return;
1934 }
1935
1936 if (type->is_matrix()) {
1937 const struct glsl_type *vec_type;
1938
1939 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1940 type->vector_elements, 1);
1941
1942 for (int i = 0; i < type->matrix_columns; i++) {
1943 emit_block_move(dst, src, vec_type, predicate);
1944 }
1945 return;
1946 }
1947
1948 assert(type->is_scalar() || type->is_vector());
1949
1950 dst->type = brw_type_for_base_type(type);
1951 src->type = dst->type;
1952
1953 dst->writemask = (1 << type->vector_elements) - 1;
1954
1955 src->swizzle = swizzle_for_size(type->vector_elements);
1956
1957 vec4_instruction *inst = emit(MOV(*dst, *src));
1958 inst->predicate = predicate;
1959
1960 dst->reg_offset++;
1961 src->reg_offset++;
1962 }
1963
1964
1965 /* If the RHS processing resulted in an instruction generating a
1966 * temporary value, and it would be easy to rewrite the instruction to
1967 * generate its result right into the LHS instead, do so. This ends
1968 * up reliably removing instructions where it can be tricky to do so
1969 * later without real UD chain information.
1970 */
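/* For example, for "a = b + c;" the ADD generated while visiting the RHS
 * first targets a temporary GRF; pointing that ADD's destination at "a"
 * instead makes the trailing copy MOV unnecessary.
 */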
1971 bool
1972 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1973 dst_reg dst,
1974 src_reg src,
1975 vec4_instruction *pre_rhs_inst,
1976 vec4_instruction *last_rhs_inst)
1977 {
1978 /* This could be supported, but it would take more smarts. */
1979 if (ir->condition)
1980 return false;
1981
1982 if (pre_rhs_inst == last_rhs_inst)
1983 return false; /* No instructions generated to work with. */
1984
1985 /* Make sure the last instruction generated our source reg. */
1986 if (src.file != GRF ||
1987 src.file != last_rhs_inst->dst.file ||
1988 src.reg != last_rhs_inst->dst.reg ||
1989 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1990 src.reladdr ||
1991 src.abs ||
1992 src.negate ||
1993 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1994 return false;
1995
1996 /* Check that the last instruction fully initialized the channels
1997 * we want to use, in the order we want to use them. We could
1998 * potentially reswizzle the operands of many instructions so that
1999 * we could handle out of order channels, but don't yet.
2000 */
2001
2002 for (unsigned i = 0; i < 4; i++) {
2003 if (dst.writemask & (1 << i)) {
2004 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2005 return false;
2006
2007 if (BRW_GET_SWZ(src.swizzle, i) != i)
2008 return false;
2009 }
2010 }
2011
2012 /* Success! Rewrite the instruction. */
2013 last_rhs_inst->dst.file = dst.file;
2014 last_rhs_inst->dst.reg = dst.reg;
2015 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2016 last_rhs_inst->dst.reladdr = dst.reladdr;
2017 last_rhs_inst->dst.writemask &= dst.writemask;
2018
2019 return true;
2020 }
2021
2022 void
2023 vec4_visitor::visit(ir_assignment *ir)
2024 {
2025 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2026 uint32_t predicate = BRW_PREDICATE_NONE;
2027
2028 if (!ir->lhs->type->is_scalar() &&
2029 !ir->lhs->type->is_vector()) {
2030 ir->rhs->accept(this);
2031 src_reg src = this->result;
2032
2033 if (ir->condition) {
2034 emit_bool_to_cond_code(ir->condition, &predicate);
2035 }
2036
2037 /* emit_block_move doesn't account for swizzles in the source register.
2038 * This should be ok, since the source register is a structure or an
2039 * array, and those can't be swizzled. But double-check to be sure.
2040 */
2041 assert(src.swizzle ==
2042 (ir->rhs->type->is_matrix()
2043 ? swizzle_for_size(ir->rhs->type->vector_elements)
2044 : BRW_SWIZZLE_NOOP));
2045
2046 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2047 return;
2048 }
2049
2050 /* Now we're down to just a scalar/vector with writemasks. */
2051 int i;
2052
2053 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2054 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2055
2056 ir->rhs->accept(this);
2057
2058 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2059
2060 src_reg src = this->result;
2061
2062 int swizzles[4];
2063 int first_enabled_chan = 0;
2064 int src_chan = 0;
2065
2066 assert(ir->lhs->type->is_vector() ||
2067 ir->lhs->type->is_scalar());
2068 dst.writemask = ir->write_mask;
2069
2070 for (int i = 0; i < 4; i++) {
2071 if (dst.writemask & (1 << i)) {
2072 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2073 break;
2074 }
2075 }
2076
2077 /* Swizzle a small RHS vector into the channels being written.
2078 *
2079 * glsl ir treats write_mask as dictating how many channels are
2080 * present on the RHS while in our instructions we need to make
2081 * those channels appear in the slots of the vec4 they're written to.
2082 */
2083 for (int i = 0; i < 4; i++) {
2084 if (dst.writemask & (1 << i))
2085 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2086 else
2087 swizzles[i] = first_enabled_chan;
2088 }
2089 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2090 swizzles[2], swizzles[3]);
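/* Worked example: for "v.yz = u" with a vec2 RHS, the RHS arrives with
 * swizzle .xyyy and dst.writemask is YZ, so the loop above produces the
 * swizzle .yxyy -- u.x feeds channel y, u.y feeds channel z, and the
 * unwritten channels read a don't-care component.
 */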
2091
2092 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2093 return;
2094 }
2095
2096 if (ir->condition) {
2097 emit_bool_to_cond_code(ir->condition, &predicate);
2098 }
2099
2100 for (i = 0; i < type_size(ir->lhs->type); i++) {
2101 vec4_instruction *inst = emit(MOV(dst, src));
2102 inst->predicate = predicate;
2103
2104 dst.reg_offset++;
2105 src.reg_offset++;
2106 }
2107 }
2108
2109 void
2110 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2111 {
2112 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2113 foreach_list(node, &ir->components) {
2114 ir_constant *field_value = (ir_constant *)node;
2115
2116 emit_constant_values(dst, field_value);
2117 }
2118 return;
2119 }
2120
2121 if (ir->type->is_array()) {
2122 for (unsigned int i = 0; i < ir->type->length; i++) {
2123 emit_constant_values(dst, ir->array_elements[i]);
2124 }
2125 return;
2126 }
2127
2128 if (ir->type->is_matrix()) {
2129 for (int i = 0; i < ir->type->matrix_columns; i++) {
2130 float *vec = &ir->value.f[i * ir->type->vector_elements];
2131
2132 for (int j = 0; j < ir->type->vector_elements; j++) {
2133 dst->writemask = 1 << j;
2134 dst->type = BRW_REGISTER_TYPE_F;
2135
2136 emit(MOV(*dst, src_reg(vec[j])));
2137 }
2138 dst->reg_offset++;
2139 }
2140 return;
2141 }
2142
2143 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2144
2145 for (int i = 0; i < ir->type->vector_elements; i++) {
2146 if (!(remaining_writemask & (1 << i)))
2147 continue;
2148
2149 dst->writemask = 1 << i;
2150 dst->type = brw_type_for_base_type(ir->type);
2151
2152 /* Find other components that match the one we're about to
2153 * write. Emits fewer instructions for things like vec4(0.5,
2154 * 1.5, 1.5, 1.5).
2155 */
2156 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2157 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2158 if (ir->value.b[i] == ir->value.b[j])
2159 dst->writemask |= (1 << j);
2160 } else {
2161 /* u, i, and f storage all line up, so no need for a
2162 * switch case for comparing each type.
2163 */
2164 if (ir->value.u[i] == ir->value.u[j])
2165 dst->writemask |= (1 << j);
2166 }
2167 }
2168
2169 switch (ir->type->base_type) {
2170 case GLSL_TYPE_FLOAT:
2171 emit(MOV(*dst, src_reg(ir->value.f[i])));
2172 break;
2173 case GLSL_TYPE_INT:
2174 emit(MOV(*dst, src_reg(ir->value.i[i])));
2175 break;
2176 case GLSL_TYPE_UINT:
2177 emit(MOV(*dst, src_reg(ir->value.u[i])));
2178 break;
2179 case GLSL_TYPE_BOOL:
2180 emit(MOV(*dst, src_reg(ir->value.b[i])));
2181 break;
2182 default:
2183 assert(!"Non-float/uint/int/bool constant");
2184 break;
2185 }
2186
2187 remaining_writemask &= ~dst->writemask;
2188 }
2189 dst->reg_offset++;
2190 }
2191
2192 void
2193 vec4_visitor::visit(ir_constant *ir)
2194 {
2195 dst_reg dst = dst_reg(this, ir->type);
2196 this->result = src_reg(dst);
2197
2198 emit_constant_values(&dst, ir);
2199 }
2200
2201 void
2202 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2203 {
2204 ir_dereference *deref = static_cast<ir_dereference *>(
2205 ir->actual_parameters.get_head());
2206 ir_variable *location = deref->variable_referenced();
2207 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2208 location->data.atomic.buffer_index);
2209
2210 /* Calculate the surface offset */
2211 src_reg offset(this, glsl_type::uint_type);
2212 ir_dereference_array *deref_array = deref->as_dereference_array();
2213 if (deref_array) {
2214 deref_array->array_index->accept(this);
2215
2216 src_reg tmp(this, glsl_type::uint_type);
2217 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2218 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2219 } else {
2220 offset = location->data.atomic.offset;
2221 }
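/* e.g. "counters[i]" yields offset = i * ATOMIC_COUNTER_SIZE plus the
 * counter's own offset, while a non-array counter uses its offset directly.
 */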
2222
2223 /* Emit the appropriate machine instruction */
2224 const char *callee = ir->callee->function_name();
2225 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2226
2227 if (!strcmp("__intrinsic_atomic_read", callee)) {
2228 emit_untyped_surface_read(surf_index, dst, offset);
2229
2230 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2231 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2232 src_reg(), src_reg());
2233
2234 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2235 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2236 src_reg(), src_reg());
2237 }
2238 }
2239
2240 void
2241 vec4_visitor::visit(ir_call *ir)
2242 {
2243 const char *callee = ir->callee->function_name();
2244
2245 if (!strcmp("__intrinsic_atomic_read", callee) ||
2246 !strcmp("__intrinsic_atomic_increment", callee) ||
2247 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2248 visit_atomic_counter_intrinsic(ir);
2249 } else {
2250 assert(!"Unsupported intrinsic.");
2251 }
2252 }
2253
2254 src_reg
2255 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2256 {
2257 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2258 inst->base_mrf = 2;
2259 inst->mlen = 1;
2260 inst->sampler = sampler;
2261 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2262 inst->dst.writemask = WRITEMASK_XYZW;
2263
2264 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2265 int param_base = inst->base_mrf;
2266 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2267 int zero_mask = 0xf & ~coord_mask;
2268
2269 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2270 coordinate));
2271
2272 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2273 src_reg(0)));
2274
2275 emit(inst);
2276 return src_reg(inst->dst);
2277 }
2278
2279 void
2280 vec4_visitor::visit(ir_texture *ir)
2281 {
2282 int sampler =
2283 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2284
2285 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2286 * emitting anything other than setting up the constant result.
2287 */
2288 if (ir->op == ir_tg4) {
2289 ir_constant *chan = ir->lod_info.component->as_constant();
2290 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2291 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2292 dst_reg result(this, ir->type);
2293 this->result = src_reg(result);
2294 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2295 return;
2296 }
2297 }
2298
2299 /* Should be lowered by do_lower_texture_projection */
2300 assert(!ir->projector);
2301
2302 /* Should be lowered */
2303 assert(!ir->offset || !ir->offset->type->is_array());
2304
2305 /* Generate code to compute all the subexpression trees. This has to be
2306 * done before loading any values into MRFs for the sampler message since
2307 * generating these values may involve SEND messages that need the MRFs.
2308 */
2309 src_reg coordinate;
2310 if (ir->coordinate) {
2311 ir->coordinate->accept(this);
2312 coordinate = this->result;
2313 }
2314
2315 src_reg shadow_comparitor;
2316 if (ir->shadow_comparitor) {
2317 ir->shadow_comparitor->accept(this);
2318 shadow_comparitor = this->result;
2319 }
2320
2321 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2322 src_reg offset_value;
2323 if (has_nonconstant_offset) {
2324 ir->offset->accept(this);
2325 offset_value = src_reg(this->result);
2326 }
2327
2328 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2329 src_reg lod, dPdx, dPdy, sample_index, mcs;
2330 switch (ir->op) {
2331 case ir_tex:
2332 lod = src_reg(0.0f);
2333 lod_type = glsl_type::float_type;
2334 break;
2335 case ir_txf:
2336 case ir_txl:
2337 case ir_txs:
2338 ir->lod_info.lod->accept(this);
2339 lod = this->result;
2340 lod_type = ir->lod_info.lod->type;
2341 break;
2342 case ir_query_levels:
2343 lod = src_reg(0);
2344 lod_type = glsl_type::int_type;
2345 break;
2346 case ir_txf_ms:
2347 ir->lod_info.sample_index->accept(this);
2348 sample_index = this->result;
2349 sample_index_type = ir->lod_info.sample_index->type;
2350
2351 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2352 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2353 else
2354 mcs = src_reg(0u);
2355 break;
2356 case ir_txd:
2357 ir->lod_info.grad.dPdx->accept(this);
2358 dPdx = this->result;
2359
2360 ir->lod_info.grad.dPdy->accept(this);
2361 dPdy = this->result;
2362
2363 lod_type = ir->lod_info.grad.dPdx->type;
2364 break;
2365 case ir_txb:
2366 case ir_lod:
2367 case ir_tg4:
2368 break;
2369 }
2370
2371 vec4_instruction *inst = NULL;
2372 switch (ir->op) {
2373 case ir_tex:
2374 case ir_txl:
2375 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2376 break;
2377 case ir_txd:
2378 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2379 break;
2380 case ir_txf:
2381 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2382 break;
2383 case ir_txf_ms:
2384 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2385 break;
2386 case ir_txs:
2387 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2388 break;
2389 case ir_tg4:
2390 if (has_nonconstant_offset)
2391 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2392 else
2393 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2394 break;
2395 case ir_query_levels:
2396 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2397 break;
2398 case ir_txb:
2399 assert(!"TXB is not valid for vertex shaders.");
2400 break;
2401 case ir_lod:
2402 assert(!"LOD is not valid for vertex shaders.");
2403 break;
2404 default:
2405 assert(!"Unrecognized tex op");
2406 }
2407
2408 if (ir->offset != NULL && ir->op != ir_txf)
2409 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2410
2411 /* Stuff the channel select bits in the top of the texture offset */
2412 if (ir->op == ir_tg4)
2413 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2414
2415 /* The message header is necessary for:
2416 * - Gen4 (always)
2417 * - Texel offsets
2418 * - Gather channel selection
2419 * - Sampler indices too large to fit in a 4-bit value.
2420 */
2421 inst->header_present =
2422 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2423 sampler >= 16;
2424 inst->base_mrf = 2;
2425 inst->mlen = inst->header_present + 1; /* always at least one */
2426 inst->sampler = sampler;
2427 inst->dst = dst_reg(this, ir->type);
2428 inst->dst.writemask = WRITEMASK_XYZW;
2429 inst->shadow_compare = ir->shadow_comparitor != NULL;
2430
2431 /* MRF for the first parameter */
2432 int param_base = inst->base_mrf + inst->header_present;
2433
2434 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2435 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2436 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2437 } else {
2438 /* Load the coordinate */
2439 /* FINISHME: gl_clamp_mask and saturate */
2440 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2441 int zero_mask = 0xf & ~coord_mask;
2442
2443 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2444 coordinate));
2445
2446 if (zero_mask != 0) {
2447 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2448 src_reg(0)));
2449 }
2450 /* Load the shadow comparitor */
2451 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2452 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2453 WRITEMASK_X),
2454 shadow_comparitor));
2455 inst->mlen++;
2456 }
2457
2458 /* Load the LOD info */
2459 if (ir->op == ir_tex || ir->op == ir_txl) {
2460 int mrf, writemask;
2461 if (brw->gen >= 5) {
2462 mrf = param_base + 1;
2463 if (ir->shadow_comparitor) {
2464 writemask = WRITEMASK_Y;
2465 /* mlen already incremented */
2466 } else {
2467 writemask = WRITEMASK_X;
2468 inst->mlen++;
2469 }
2470 } else /* brw->gen == 4 */ {
2471 mrf = param_base;
2472 writemask = WRITEMASK_W;
2473 }
2474 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2475 } else if (ir->op == ir_txf) {
2476 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2477 } else if (ir->op == ir_txf_ms) {
2478 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2479 sample_index));
2480 if (brw->gen >= 7)
2481 /* MCS data is in the first channel of `mcs`, but we need to get it into
2482 * the .y channel of the second vec4 of params, so replicate .x across
2483 * the whole vec4 and then mask off everything except .y
2484 */
2485 mcs.swizzle = BRW_SWIZZLE_XXXX;
2486 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2487 mcs));
2488 inst->mlen++;
2489 } else if (ir->op == ir_txd) {
2490 const glsl_type *type = lod_type;
2491
2492 if (brw->gen >= 5) {
2493 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2494 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2495 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2496 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2497 inst->mlen++;
2498
2499 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2500 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2501 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2502 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2503 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2504 inst->mlen++;
2505
2506 if (ir->shadow_comparitor) {
2507 emit(MOV(dst_reg(MRF, param_base + 2,
2508 ir->shadow_comparitor->type, WRITEMASK_Z),
2509 shadow_comparitor));
2510 }
2511 }
2512 } else /* brw->gen == 4 */ {
2513 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2514 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2515 inst->mlen += 2;
2516 }
2517 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2518 if (ir->shadow_comparitor) {
2519 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2520 shadow_comparitor));
2521 }
2522
2523 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2524 offset_value));
2525 inst->mlen++;
2526 }
2527 }
2528
2529 emit(inst);
2530
2531 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2532 * spec requires layers.
2533 */
2534 if (ir->op == ir_txs) {
2535 glsl_type const *type = ir->sampler->type;
2536 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2537 type->sampler_array) {
2538 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2539 writemask(inst->dst, WRITEMASK_Z),
2540 src_reg(inst->dst), src_reg(6));
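/* e.g. a cube map array with 2 layers reports 12 (6 faces * 2 layers)
 * from the sampler; the integer divide by 6 above gives back the 2
 * layers GL expects.
 */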
2541 }
2542 }
2543
2544 if (brw->gen == 6 && ir->op == ir_tg4) {
2545 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2546 }
2547
2548 swizzle_result(ir, src_reg(inst->dst), sampler);
2549 }
2550
2551 /**
2552 * Apply workarounds for Gen6 gather with UINT/SINT
2553 */
2554 void
2555 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2556 {
2557 if (!wa)
2558 return;
2559
2560 int width = (wa & WA_8BIT) ? 8 : 16;
2561 dst_reg dst_f = dst;
2562 dst_f.type = BRW_REGISTER_TYPE_F;
2563
2564 /* Convert from UNORM to UINT */
2565 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2566 emit(MOV(dst, src_reg(dst_f)));
2567
2568 if (wa & WA_SIGN) {
2569 /* Reinterpret the UINT value as a signed INT value by
2570 * shifting the sign bit into place, then shifting back
2571 * preserving sign.
2572 */
2573 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2574 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
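/* For the 8-bit case this shifts left then right by 24, turning the
 * recovered 0..255 value into a properly sign-extended 8-bit integer.
 */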
2575 }
2576 }
2577
2578 /**
2579 * Set up the gather channel based on the swizzle, for gather4.
2580 */
2581 uint32_t
2582 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2583 {
2584 ir_constant *chan = ir->lod_info.component->as_constant();
2585 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2586 switch (swiz) {
2587 case SWIZZLE_X: return 0;
2588 case SWIZZLE_Y:
2589 /* gather4 sampler is broken for green channel on RG32F --
2590 * we must ask for blue instead.
2591 */
2592 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2593 return 2;
2594 return 1;
2595 case SWIZZLE_Z: return 2;
2596 case SWIZZLE_W: return 3;
2597 default:
2598 assert(!"Not reached"); /* zero, one swizzles handled already */
2599 return 0;
2600 }
2601 }
2602
2603 void
2604 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2605 {
2606 int s = key->tex.swizzles[sampler];
2607
2608 this->result = src_reg(this, ir->type);
2609 dst_reg swizzled_result(this->result);
2610
2611 if (ir->op == ir_query_levels) {
2612 /* # levels is in .w */
2613 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2614 emit(MOV(swizzled_result, orig_val));
2615 return;
2616 }
2617
2618 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2619 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2620 emit(MOV(swizzled_result, orig_val));
2621 return;
2622 }
2623
2624
2625 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2626 int swizzle[4] = {0};
2627
2628 for (int i = 0; i < 4; i++) {
2629 switch (GET_SWZ(s, i)) {
2630 case SWIZZLE_ZERO:
2631 zero_mask |= (1 << i);
2632 break;
2633 case SWIZZLE_ONE:
2634 one_mask |= (1 << i);
2635 break;
2636 default:
2637 copy_mask |= (1 << i);
2638 swizzle[i] = GET_SWZ(s, i);
2639 break;
2640 }
2641 }
2642
2643 if (copy_mask) {
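/* Illustrative example: a swizzle of (BLUE, ALPHA, ZERO, ONE) gives
 * copy_mask XY reading .zw of the sampler result, zero_mask Z and
 * one_mask W, so up to three masked MOVs are emitted below.
 */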
2644 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2645 swizzled_result.writemask = copy_mask;
2646 emit(MOV(swizzled_result, orig_val));
2647 }
2648
2649 if (zero_mask) {
2650 swizzled_result.writemask = zero_mask;
2651 emit(MOV(swizzled_result, src_reg(0.0f)));
2652 }
2653
2654 if (one_mask) {
2655 swizzled_result.writemask = one_mask;
2656 emit(MOV(swizzled_result, src_reg(1.0f)));
2657 }
2658 }
2659
2660 void
2661 vec4_visitor::visit(ir_return *ir)
2662 {
2663 assert(!"not reached");
2664 }
2665
2666 void
2667 vec4_visitor::visit(ir_discard *ir)
2668 {
2669 assert(!"not reached");
2670 }
2671
2672 void
2673 vec4_visitor::visit(ir_if *ir)
2674 {
2675 /* Don't point the annotation at the if statement, because then it plus
2676 * the then and else blocks get printed.
2677 */
2678 this->base_ir = ir->condition;
2679
2680 if (brw->gen == 6) {
2681 emit_if_gen6(ir);
2682 } else {
2683 uint32_t predicate;
2684 emit_bool_to_cond_code(ir->condition, &predicate);
2685 emit(IF(predicate));
2686 }
2687
2688 visit_instructions(&ir->then_instructions);
2689
2690 if (!ir->else_instructions.is_empty()) {
2691 this->base_ir = ir->condition;
2692 emit(BRW_OPCODE_ELSE);
2693
2694 visit_instructions(&ir->else_instructions);
2695 }
2696
2697 this->base_ir = ir->condition;
2698 emit(BRW_OPCODE_ENDIF);
2699 }
2700
2701 void
2702 vec4_visitor::visit(ir_emit_vertex *)
2703 {
2704 assert(!"not reached");
2705 }
2706
2707 void
2708 vec4_visitor::visit(ir_end_primitive *)
2709 {
2710 assert(!"not reached");
2711 }
2712
2713 void
2714 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2715 dst_reg dst, src_reg offset,
2716 src_reg src0, src_reg src1)
2717 {
2718 unsigned mlen = 0;
2719
2720 /* Set the atomic operation offset. */
2721 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2722 mlen++;
2723
2724 /* Set the atomic operation arguments. */
2725 if (src0.file != BAD_FILE) {
2726 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2727 mlen++;
2728 }
2729
2730 if (src1.file != BAD_FILE) {
2731 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2732 mlen++;
2733 }
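/* e.g. an atomic increment passes only the offset (mlen 1); an operation
 * with one data operand would also fill src0, giving mlen 2.
 */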
2734
2735 /* Emit the instruction. Note that this maps to the normal SIMD8
2736 * untyped atomic message on Ivy Bridge, but that's OK because
2737 * unused channels will be masked out.
2738 */
2739 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2740 src_reg(atomic_op), src_reg(surf_index));
2741 inst->base_mrf = 0;
2742 inst->mlen = mlen;
2743 }
2744
2745 void
2746 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2747 src_reg offset)
2748 {
2749 /* Set the surface read offset. */
2750 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2751
2752 /* Emit the instruction. Note that this maps to the normal SIMD8
2753 * untyped surface read message, but that's OK because unused
2754 * channels will be masked out.
2755 */
2756 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2757 dst, src_reg(surf_index));
2758 inst->base_mrf = 0;
2759 inst->mlen = 1;
2760 }
2761
2762 void
2763 vec4_visitor::emit_ndc_computation()
2764 {
2765 /* Get the position */
2766 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2767
2768 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2769 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2770 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2771
2772 current_annotation = "NDC";
2773 dst_reg ndc_w = ndc;
2774 ndc_w.writemask = WRITEMASK_W;
2775 src_reg pos_w = pos;
2776 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2777 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2778
2779 dst_reg ndc_xyz = ndc;
2780 ndc_xyz.writemask = WRITEMASK_XYZ;
2781
2782 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2783 }
2784
2785 void
2786 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2787 {
2788 if (brw->gen < 6 &&
2789 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2790 key->userclip_active || brw->has_negative_rhw_bug)) {
2791 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2792 dst_reg header1_w = header1;
2793 header1_w.writemask = WRITEMASK_W;
2794
2795 emit(MOV(header1, 0u));
2796
2797 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2798 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2799
2800 current_annotation = "Point size";
2801 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2802 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2803 }
2804
2805 if (key->userclip_active) {
2806 current_annotation = "Clipping flags";
2807 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2808 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2809
2810 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2811 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2812 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2813
2814 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2815 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2816 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2817 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2818 }
2819
2820 /* i965 clipping workaround:
2821 * 1) Test for -ve rhw
2822 * 2) If set,
2823 * set ndc = (0,0,0,0)
2824 * set ucp[6] = 1
2825 *
2826 * Later, clipping will detect ucp[6] and ensure the primitive is
2827 * clipped against all fixed planes.
2828 */
2829 if (brw->has_negative_rhw_bug) {
2830 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2831 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2832 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2833 vec4_instruction *inst;
2834 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2835 inst->predicate = BRW_PREDICATE_NORMAL;
2836 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2837 inst->predicate = BRW_PREDICATE_NORMAL;
2838 }
2839
2840 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2841 } else if (brw->gen < 6) {
2842 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2843 } else {
2844 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2845 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2846 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2847 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2848 }
2849 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2850 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2851 src_reg(output_reg[VARYING_SLOT_LAYER])));
2852 }
2853 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2854 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2855 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2856 }
2857 }
2858 }
2859
2860 void
2861 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2862 {
2863 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2864 *
2865 * "If a linked set of shaders forming the vertex stage contains no
2866 * static write to gl_ClipVertex or gl_ClipDistance, but the
2867 * application has requested clipping against user clip planes through
2868 * the API, then the coordinate written to gl_Position is used for
2869 * comparison against the user clip planes."
2870 *
2871 * This function is only called if the shader didn't write to
2872 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2873 * if the user wrote to it; otherwise we use gl_Position.
2874 */
2875 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2876 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2877 clip_vertex = VARYING_SLOT_POS;
2878 }
2879
2880 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
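/* Each iteration below writes one channel of the clip-distance slot:
 * channel i gets DP4(clip_vertex, userplane[i + offset]).
 */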
2881 ++i) {
2882 reg.writemask = 1 << i;
2883 emit(DP4(reg,
2884 src_reg(output_reg[clip_vertex]),
2885 src_reg(this->userplane[i + offset])));
2886 }
2887 }
2888
2889 void
2890 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2891 {
2892 assert (varying < VARYING_SLOT_MAX);
2893 reg.type = output_reg[varying].type;
2894 current_annotation = output_reg_annotation[varying];
2895 /* Copy the register, saturating if necessary */
2896 vec4_instruction *inst = emit(MOV(reg,
2897 src_reg(output_reg[varying])));
2898 if ((varying == VARYING_SLOT_COL0 ||
2899 varying == VARYING_SLOT_COL1 ||
2900 varying == VARYING_SLOT_BFC0 ||
2901 varying == VARYING_SLOT_BFC1) &&
2902 key->clamp_vertex_color) {
2903 inst->saturate = true;
2904 }
2905 }
2906
2907 void
2908 vec4_visitor::emit_urb_slot(int mrf, int varying)
2909 {
2910 struct brw_reg hw_reg = brw_message_reg(mrf);
2911 dst_reg reg = dst_reg(MRF, mrf);
2912 reg.type = BRW_REGISTER_TYPE_F;
2913
2914 switch (varying) {
2915 case VARYING_SLOT_PSIZ:
2916 /* PSIZ is always in slot 0, and is coupled with other flags. */
2917 current_annotation = "indices, point width, clip flags";
2918 emit_psiz_and_flags(hw_reg);
2919 break;
2920 case BRW_VARYING_SLOT_NDC:
2921 current_annotation = "NDC";
2922 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2923 break;
2924 case VARYING_SLOT_POS:
2925 current_annotation = "gl_Position";
2926 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2927 break;
2928 case VARYING_SLOT_EDGE:
2929 /* This is present when doing unfilled polygons. We're supposed to copy
2930 * the edge flag from the user-provided vertex array
2931 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2932 * of that attribute (starts as 1.0f). This is then used in clipping to
2933 * determine which edges should be drawn as wireframe.
2934 */
2935 current_annotation = "edge flag";
2936 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2937 glsl_type::float_type, WRITEMASK_XYZW))));
2938 break;
2939 case BRW_VARYING_SLOT_PAD:
2940 /* No need to write to this slot */
2941 break;
2942 default:
2943 emit_generic_urb_slot(reg, varying);
2944 break;
2945 }
2946 }
2947
2948 static int
2949 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2950 {
2951 if (brw->gen >= 6) {
2952 /* URB data written (does not include the message header reg) must
2953 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2954 * section 5.4.3.2.2: URB_INTERLEAVED.
2955 *
2956 * URB entries are allocated on a multiple of 1024 bits, so an
2957 * extra 128 bits written here to make the end align to 256 is
2958 * no problem.
2959 */
2960 if ((mlen % 2) != 1)
2961 mlen++;
2962 }
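/* mlen includes the message header, so an even mlen here means an odd
 * number of data registers; bumping it keeps the data portion a multiple
 * of two registers (256 bits). e.g. a header plus 3 slots (mlen 4)
 * becomes mlen 5.
 */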
2963
2964 return mlen;
2965 }
2966
2967
2968 /**
2969 * Generates the VUE payload plus the necessary URB write instructions to
2970 * output it.
2971 *
2972 * The VUE layout is documented in Volume 2a.
2973 */
2974 void
2975 vec4_visitor::emit_vertex()
2976 {
2977 /* MRF 0 is reserved for the debugger, so start with message header
2978 * in MRF 1.
2979 */
2980 int base_mrf = 1;
2981 int mrf = base_mrf;
2982 /* In the process of generating our URB write message contents, we
2983 * may need to unspill a register or load from an array. Those
2984 * reads would use MRFs 14-15.
2985 */
2986 int max_usable_mrf = 13;
2987
2988 /* The following assertion verifies that max_usable_mrf causes an
2989 * even-numbered amount of URB write data, which will meet gen6's
2990 * requirements for length alignment.
2991 */
2992 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2993
2994 /* First mrf is the g0-based message header containing URB handles and
2995 * such.
2996 */
2997 emit_urb_write_header(mrf++);
2998
2999 if (brw->gen < 6) {
3000 emit_ndc_computation();
3001 }
3002
3003 /* Lower legacy ff and ClipVertex clipping to clip distances */
3004 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3005 current_annotation = "user clip distances";
3006
3007 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3008 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3009
3010 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3011 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3012 }
3013
3014 /* We may need to split this up into several URB writes, so do them in a
3015 * loop.
3016 */
3017 int slot = 0;
3018 bool complete = false;
3019 do {
3020 /* URB offset is in URB row increments, and each of our MRFs is half of
3021 * one of those, since we're doing interleaved writes.
3022 */
3023 int offset = slot / 2;
3024
3025 mrf = base_mrf + 1;
3026 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3027 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3028
3029 /* If this was max_usable_mrf, we can't fit anything more into this
3030 * URB WRITE.
3031 */
3032 if (mrf > max_usable_mrf) {
3033 slot++;
3034 break;
3035 }
3036 }
3037
3038 complete = slot >= prog_data->vue_map.num_slots;
3039 current_annotation = "URB write";
3040 vec4_instruction *inst = emit_urb_write_opcode(complete);
3041 inst->base_mrf = base_mrf;
3042 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3043 inst->offset += offset;
3044 } while (!complete);
3045 }
3046
3047
3048 src_reg
3049 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3050 src_reg *reladdr, int reg_offset)
3051 {
3052 /* Because we store the values to scratch interleaved like our
3053 * vertex data, we need to scale the vec4 index by 2.
3054 */
3055 int message_header_scale = 2;
3056
3057 /* Pre-gen6, the message header uses byte offsets instead of vec4
3058 * (16-byte) offset units.
3059 */
3060 if (brw->gen < 6)
3061 message_header_scale *= 16;
3062
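/* e.g. vec4 slot 3 of a spilled array becomes offset 6 on gen6+ (two
 * interleaved vec4s per scratch register row), or byte offset 96 on
 * gen4-5.
 */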
3063 if (reladdr) {
3064 src_reg index = src_reg(this, glsl_type::int_type);
3065
3066 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3067 emit_before(inst, MUL(dst_reg(index),
3068 index, src_reg(message_header_scale)));
3069
3070 return index;
3071 } else {
3072 return src_reg(reg_offset * message_header_scale);
3073 }
3074 }
3075
3076 src_reg
3077 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3078 src_reg *reladdr, int reg_offset)
3079 {
3080 if (reladdr) {
3081 src_reg index = src_reg(this, glsl_type::int_type);
3082
3083 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3084
3085 /* Pre-gen6, the message header uses byte offsets instead of vec4
3086 * (16-byte) offset units.
3087 */
3088 if (brw->gen < 6) {
3089 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3090 }
3091
3092 return index;
3093 } else if (brw->gen >= 8) {
3094 /* Store the offset in a GRF so we can send-from-GRF. */
3095 src_reg offset = src_reg(this, glsl_type::int_type);
3096 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3097 return offset;
3098 } else {
3099 int message_header_scale = brw->gen < 6 ? 16 : 1;
3100 return src_reg(reg_offset * message_header_scale);
3101 }
3102 }
3103
3104 /**
3105 * Emits an instruction before @inst to load the value named by @orig_src
3106 * from scratch space at @base_offset to @temp.
3107 *
3108 * @base_offset is measured in 32-byte units (the size of a register).
3109 */
3110 void
3111 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3112 dst_reg temp, src_reg orig_src,
3113 int base_offset)
3114 {
3115 int reg_offset = base_offset + orig_src.reg_offset;
3116 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3117
3118 emit_before(inst, SCRATCH_READ(temp, index));
3119 }
3120
3121 /**
3122 * Emits an instruction after @inst to store the value to be written
3123 * to @orig_dst to scratch space at @base_offset, from @temp.
3124 *
3125 * @base_offset is measured in 32-byte units (the size of a register).
3126 */
3127 void
3128 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3129 {
3130 int reg_offset = base_offset + inst->dst.reg_offset;
3131 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3132
3133 /* Create a temporary register to store *inst's result in.
3134 *
3135 * We have to be careful in MOVing from our temporary result register in
3136 * the scratch write. If we swizzle from channels of the temporary that
3137 * weren't initialized, it will confuse live interval analysis, which will
3138 * make spilling fail to make progress.
3139 */
3140 src_reg temp = src_reg(this, glsl_type::vec4_type);
3141 temp.type = inst->dst.type;
3142 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3143 int swizzles[4];
3144 for (int i = 0; i < 4; i++)
3145 if (inst->dst.writemask & (1 << i))
3146 swizzles[i] = i;
3147 else
3148 swizzles[i] = first_writemask_chan;
3149 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3150 swizzles[2], swizzles[3]);
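/* e.g. if the instruction only wrote .z, temp is read back as .zzzz so
 * the scratch write below never references an uninitialized channel.
 */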
3151
3152 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3153 inst->dst.writemask));
3154 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3155 write->predicate = inst->predicate;
3156 write->ir = inst->ir;
3157 write->annotation = inst->annotation;
3158 inst->insert_after(write);
3159
3160 inst->dst.file = temp.file;
3161 inst->dst.reg = temp.reg;
3162 inst->dst.reg_offset = temp.reg_offset;
3163 inst->dst.reladdr = NULL;
3164 }
3165
3166 /**
3167 * We can't generally support array access in GRF space, because a
3168 * single instruction's destination can only span 2 contiguous
3169 * registers. So, we send all GRF arrays that get variable index
3170 * access to scratch space.
3171 */
3172 void
3173 vec4_visitor::move_grf_array_access_to_scratch()
3174 {
3175 int scratch_loc[this->virtual_grf_count];
3176
3177 for (int i = 0; i < this->virtual_grf_count; i++) {
3178 scratch_loc[i] = -1;
3179 }
3180
3181 /* First, calculate the set of virtual GRFs that need to be punted
3182 * to scratch due to having any array access on them, and where in
3183 * scratch.
3184 */
3185 foreach_list(node, &this->instructions) {
3186 vec4_instruction *inst = (vec4_instruction *)node;
3187
3188 if (inst->dst.file == GRF && inst->dst.reladdr &&
3189 scratch_loc[inst->dst.reg] == -1) {
3190 scratch_loc[inst->dst.reg] = c->last_scratch;
3191 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3192 }
3193
3194 for (int i = 0 ; i < 3; i++) {
3195 src_reg *src = &inst->src[i];
3196
3197 if (src->file == GRF && src->reladdr &&
3198 scratch_loc[src->reg] == -1) {
3199 scratch_loc[src->reg] = c->last_scratch;
3200 c->last_scratch += this->virtual_grf_sizes[src->reg];
3201 }
3202 }
3203 }
3204
3205 /* Now, for anything that will be accessed through scratch, rewrite
3206 * it to load/store. Note that this is a _safe list walk, because
3207 * we may generate a new scratch_write instruction after the one
3208 * we're processing.
3209 */
3210 foreach_list_safe(node, &this->instructions) {
3211 vec4_instruction *inst = (vec4_instruction *)node;
3212
3213 /* Set up the annotation tracking for new generated instructions. */
3214 base_ir = inst->ir;
3215 current_annotation = inst->annotation;
3216
3217 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3218 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3219 }
3220
3221 for (int i = 0 ; i < 3; i++) {
3222 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3223 continue;
3224
3225 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3226
3227 emit_scratch_read(inst, temp, inst->src[i],
3228 scratch_loc[inst->src[i].reg]);
3229
3230 inst->src[i].file = temp.file;
3231 inst->src[i].reg = temp.reg;
3232 inst->src[i].reg_offset = temp.reg_offset;
3233 inst->src[i].reladdr = NULL;
3234 }
3235 }
3236 }
3237
3238 /**
3239 * Emits an instruction before @inst to load the value named by @orig_src
3240 * from the pull constant buffer (surface) at @base_offset to @temp.
3241 */
3242 void
3243 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3244 dst_reg temp, src_reg orig_src,
3245 int base_offset)
3246 {
3247 int reg_offset = base_offset + orig_src.reg_offset;
3248 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3249 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3250 vec4_instruction *load;
3251
3252 if (brw->gen >= 7) {
3253 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3254 grf_offset.type = offset.type;
3255 emit_before(inst, MOV(grf_offset, offset));
3256
3257 load = new(mem_ctx) vec4_instruction(this,
3258 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3259 temp, index, src_reg(grf_offset));
3260 } else {
3261 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3262 temp, index, offset);
3263 load->base_mrf = 14;
3264 load->mlen = 1;
3265 }
3266 emit_before(inst, load);
3267 }
3268
3269 /**
3270 * Implements array access of uniforms by inserting a
3271 * PULL_CONSTANT_LOAD instruction.
3272 *
3273 * Unlike temporary GRF array access (where we don't support it due to
3274 * the difficulty of doing relative addressing on instruction
3275 * destinations), we could potentially do array access of uniforms
3276 * that were loaded in GRF space as push constants. In real-world
3277 * usage we've seen, though, the arrays being used are always larger
3278 * than we could load as push constants, so just always move all
3279 * uniform array access out to a pull constant buffer.
3280 */
3281 void
3282 vec4_visitor::move_uniform_array_access_to_pull_constants()
3283 {
3284 int pull_constant_loc[this->uniforms];
3285
3286 for (int i = 0; i < this->uniforms; i++) {
3287 pull_constant_loc[i] = -1;
3288 }
3289
3290 /* Walk through and find array access of uniforms. Put a copy of that
3291 * uniform in the pull constant buffer.
3292 *
3293 * Note that we don't move constant-indexed accesses to arrays. No
3294 * testing has been done of the performance impact of this choice.
3295 */
3296 foreach_list_safe(node, &this->instructions) {
3297 vec4_instruction *inst = (vec4_instruction *)node;
3298
3299 for (int i = 0 ; i < 3; i++) {
3300 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3301 continue;
3302
3303 int uniform = inst->src[i].reg;
3304
3305 /* If this array isn't already present in the pull constant buffer,
3306 * add it.
3307 */
3308 if (pull_constant_loc[uniform] == -1) {
3309 const float **values = &stage_prog_data->param[uniform * 4];
3310
3311 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3312
3313 assert(uniform < uniform_array_size);
3314 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3315 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3316 = values[j];
3317 }
3318 }
3319
3320 /* Set up the annotation tracking for new generated instructions. */
3321 base_ir = inst->ir;
3322 current_annotation = inst->annotation;
3323
3324 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3325
3326 emit_pull_constant_load(inst, temp, inst->src[i],
3327 pull_constant_loc[uniform]);
3328
3329 inst->src[i].file = temp.file;
3330 inst->src[i].reg = temp.reg;
3331 inst->src[i].reg_offset = temp.reg_offset;
3332 inst->src[i].reladdr = NULL;
3333 }
3334 }
3335
3336 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3337 * no need to track them as larger-than-vec4 objects. This will be
3338 * relied on in cutting out unused uniform vectors from push
3339 * constants.
3340 */
3341 split_uniform_registers();
3342 }
3343
3344 void
3345 vec4_visitor::resolve_ud_negate(src_reg *reg)
3346 {
3347 if (reg->type != BRW_REGISTER_TYPE_UD ||
3348 !reg->negate)
3349 return;
3350
3351 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3352 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3353 *reg = temp;
3354 }
3355
3356 vec4_visitor::vec4_visitor(struct brw_context *brw,
3357 struct brw_vec4_compile *c,
3358 struct gl_program *prog,
3359 const struct brw_vec4_prog_key *key,
3360 struct brw_vec4_prog_data *prog_data,
3361 struct gl_shader_program *shader_prog,
3362 gl_shader_stage stage,
3363 void *mem_ctx,
3364 bool debug_flag,
3365 bool no_spills,
3366 shader_time_shader_type st_base,
3367 shader_time_shader_type st_written,
3368 shader_time_shader_type st_reset)
3369 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3370 c(c),
3371 key(key),
3372 prog_data(prog_data),
3373 sanity_param_count(0),
3374 fail_msg(NULL),
3375 first_non_payload_grf(0),
3376 need_all_constants_in_pull_buffer(false),
3377 debug_flag(debug_flag),
3378 no_spills(no_spills),
3379 st_base(st_base),
3380 st_written(st_written),
3381 st_reset(st_reset)
3382 {
3383 this->mem_ctx = mem_ctx;
3384 this->failed = false;
3385
3386 this->base_ir = NULL;
3387 this->current_annotation = NULL;
3388 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3389
3390 this->variable_ht = hash_table_ctor(0,
3391 hash_table_pointer_hash,
3392 hash_table_pointer_compare);
3393
3394 this->virtual_grf_start = NULL;
3395 this->virtual_grf_end = NULL;
3396 this->virtual_grf_sizes = NULL;
3397 this->virtual_grf_count = 0;
3398 this->virtual_grf_reg_map = NULL;
3399 this->virtual_grf_reg_count = 0;
3400 this->virtual_grf_array_size = 0;
3401 this->live_intervals_valid = false;
3402
3403 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3404
3405 this->uniforms = 0;
3406
3407 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3408 * at least one. See setup_uniforms() in brw_vec4.cpp.
3409 */
3410 this->uniform_array_size = 1;
3411 if (prog_data) {
3412 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3413 }
3414
3415 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3416 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3417 }
3418
3419 vec4_visitor::~vec4_visitor()
3420 {
3421 hash_table_dtor(this->variable_ht);
3422 }
3423
3424
3425 void
3426 vec4_visitor::fail(const char *format, ...)
3427 {
3428 va_list va;
3429 char *msg;
3430
3431 if (failed)
3432 return;
3433
3434 failed = true;
3435
3436 va_start(va, format);
3437 msg = ralloc_vasprintf(mem_ctx, format, va);
3438 va_end(va);
3439 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3440
3441 this->fail_msg = msg;
3442
3443 if (debug_flag) {
3444 fprintf(stderr, "%s", msg);
3445 }
3446 }
3447
3448 } /* namespace brw */