glsl: Make is_16bit_constant from i965 an ir_constant method.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->sampler = 0;
47 this->texture_offset = 0;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = v->base_ir;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_present = false;
53 this->mlen = 0;
54 this->base_mrf = 0;
55 this->offset = 0;
56 this->annotation = v->current_annotation;
57 }
58
59 vec4_instruction *
60 vec4_visitor::emit(vec4_instruction *inst)
61 {
62 this->instructions.push_tail(inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
69 {
70 new_inst->ir = inst->ir;
71 new_inst->annotation = inst->annotation;
72
73 inst->insert_before(new_inst);
74
75 return inst;
76 }
77
78 vec4_instruction *
79 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
80 src_reg src0, src_reg src1, src_reg src2)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
83 src0, src1, src2));
84 }
85
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
89 {
90 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
91 }
92
93 vec4_instruction *
94 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
95 {
96 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
101 {
102 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode)
107 {
108 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
109 }
110
111 #define ALU1(op) \
112 vec4_instruction * \
113 vec4_visitor::op(dst_reg dst, src_reg src0) \
114 { \
115 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
116 src0); \
117 }
118
119 #define ALU2(op) \
120 vec4_instruction * \
121 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
122 { \
123 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
124 src0, src1); \
125 }
126
127 #define ALU3(op) \
128 vec4_instruction * \
129 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
130 { \
131 assert(brw->gen >= 6); \
132 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
133 src0, src1, src2); \
134 }
135
136 ALU1(NOT)
137 ALU1(MOV)
138 ALU1(FRC)
139 ALU1(RNDD)
140 ALU1(RNDE)
141 ALU1(RNDZ)
142 ALU1(F32TO16)
143 ALU1(F16TO32)
144 ALU2(ADD)
145 ALU2(MUL)
146 ALU2(MACH)
147 ALU2(AND)
148 ALU2(OR)
149 ALU2(XOR)
150 ALU2(DP3)
151 ALU2(DP4)
152 ALU2(DPH)
153 ALU2(SHL)
154 ALU2(SHR)
155 ALU2(ASR)
156 ALU3(LRP)
157 ALU1(BFREV)
158 ALU3(BFE)
159 ALU2(BFI1)
160 ALU3(BFI2)
161 ALU1(FBH)
162 ALU1(FBL)
163 ALU1(CBIT)
164 ALU3(MAD)
165 ALU2(ADDC)
166 ALU2(SUBB)
167
168 /** Gen4 predicated IF. */
169 vec4_instruction *
170 vec4_visitor::IF(uint32_t predicate)
171 {
172 vec4_instruction *inst;
173
174 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
175 inst->predicate = predicate;
176
177 return inst;
178 }
179
180 /** Gen6 IF with embedded comparison. */
181 vec4_instruction *
182 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
183 {
184 assert(brw->gen == 6);
185
186 vec4_instruction *inst;
187
188 resolve_ud_negate(&src0);
189 resolve_ud_negate(&src1);
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
192 src0, src1);
193 inst->conditional_mod = condition;
194
195 return inst;
196 }
197
198 /**
199 * CMP: Sets the low bit of the destination channels with the result
200 * of the comparison, while the upper bits are undefined, and updates
201 * the flag register with the packed 16 bits of the result.
202 */
203 vec4_instruction *
204 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
205 {
206 vec4_instruction *inst;
207
208 /* original gen4 does type conversion to the destination type
209 * before comparison, producing garbage results for floating
210 * point comparisons.
211 */
212 if (brw->gen == 4) {
213 dst.type = src0.type;
214 if (dst.file == HW_REG)
215 dst.fixed_hw_reg.type = dst.type;
216 }
217
218 resolve_ud_negate(&src0);
219 resolve_ud_negate(&src1);
220
221 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
222 inst->conditional_mod = condition;
223
224 return inst;
225 }
226
227 vec4_instruction *
228 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
229 {
230 vec4_instruction *inst;
231
232 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
233 dst, index);
234 inst->base_mrf = 14;
235 inst->mlen = 2;
236
237 return inst;
238 }
239
240 vec4_instruction *
241 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
242 {
243 vec4_instruction *inst;
244
245 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
246 dst, src, index);
247 inst->base_mrf = 13;
248 inst->mlen = 3;
249
250 return inst;
251 }
252
253 void
254 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
255 {
256 static enum opcode dot_opcodes[] = {
257 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
258 };
259
260 emit(dot_opcodes[elements - 2], dst, src0, src1);
261 }
262
263 src_reg
264 vec4_visitor::fix_3src_operand(src_reg src)
265 {
266 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
267 * able to use vertical stride of zero to replicate the vec4 uniform, like
268 *
269 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
270 *
271 * But you can't, since vertical stride is always four in three-source
272 * instructions. Instead, insert a MOV instruction to do the replication so
273 * that the three-source instruction can consume it.
274 */
275
276 /* The MOV is only needed if the source is a uniform or immediate. */
277 if (src.file != UNIFORM && src.file != IMM)
278 return src;
279
280 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
281 return src;
282
283 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
284 expanded.type = src.type;
285 emit(MOV(expanded, src));
286 return src_reg(expanded);
287 }
288
289 src_reg
290 vec4_visitor::fix_math_operand(src_reg src)
291 {
292 /* The gen6 math instruction ignores the source modifiers --
293 * swizzle, abs, negate, and at least some parts of the register
294 * region description.
295 *
296 * Rather than trying to enumerate all these cases, *always* expand the
297 * operand to a temp GRF for gen6.
298 *
299 * For gen7, keep the operand as-is, except if immediate, which gen7 still
300 * can't use.
301 */
302
303 if (brw->gen == 7 && src.file != IMM)
304 return src;
305
306 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
307 expanded.type = src.type;
308 emit(MOV(expanded, src));
309 return src_reg(expanded);
310 }
311
312 void
313 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
314 {
315 src = fix_math_operand(src);
316
317 if (dst.writemask != WRITEMASK_XYZW) {
318 /* The gen6 math instruction must be align1, so we can't do
319 * writemasks.
320 */
321 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
322
323 emit(opcode, temp_dst, src);
324
325 emit(MOV(dst, src_reg(temp_dst)));
326 } else {
327 emit(opcode, dst, src);
328 }
329 }
330
331 void
332 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
333 {
334 vec4_instruction *inst = emit(opcode, dst, src);
335 inst->base_mrf = 1;
336 inst->mlen = 1;
337 }
338
339 void
340 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
341 {
342 switch (opcode) {
343 case SHADER_OPCODE_RCP:
344 case SHADER_OPCODE_RSQ:
345 case SHADER_OPCODE_SQRT:
346 case SHADER_OPCODE_EXP2:
347 case SHADER_OPCODE_LOG2:
348 case SHADER_OPCODE_SIN:
349 case SHADER_OPCODE_COS:
350 break;
351 default:
352 assert(!"not reached: bad math opcode");
353 return;
354 }
355
356 if (brw->gen >= 6) {
357 return emit_math1_gen6(opcode, dst, src);
358 } else {
359 return emit_math1_gen4(opcode, dst, src);
360 }
361 }
362
363 void
364 vec4_visitor::emit_math2_gen6(enum opcode opcode,
365 dst_reg dst, src_reg src0, src_reg src1)
366 {
367 src0 = fix_math_operand(src0);
368 src1 = fix_math_operand(src1);
369
370 if (dst.writemask != WRITEMASK_XYZW) {
371 /* The gen6 math instruction must be align1, so we can't do
372 * writemasks.
373 */
374 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
375 temp_dst.type = dst.type;
376
377 emit(opcode, temp_dst, src0, src1);
378
379 emit(MOV(dst, src_reg(temp_dst)));
380 } else {
381 emit(opcode, dst, src0, src1);
382 }
383 }
384
385 void
386 vec4_visitor::emit_math2_gen4(enum opcode opcode,
387 dst_reg dst, src_reg src0, src_reg src1)
388 {
389 vec4_instruction *inst = emit(opcode, dst, src0, src1);
390 inst->base_mrf = 1;
391 inst->mlen = 2;
392 }
393
394 void
395 vec4_visitor::emit_math(enum opcode opcode,
396 dst_reg dst, src_reg src0, src_reg src1)
397 {
398 switch (opcode) {
399 case SHADER_OPCODE_POW:
400 case SHADER_OPCODE_INT_QUOTIENT:
401 case SHADER_OPCODE_INT_REMAINDER:
402 break;
403 default:
404 assert(!"not reached: unsupported binary math opcode");
405 return;
406 }
407
408 if (brw->gen >= 6) {
409 return emit_math2_gen6(opcode, dst, src0, src1);
410 } else {
411 return emit_math2_gen4(opcode, dst, src0, src1);
412 }
413 }
414
415 void
416 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
417 {
418 if (brw->gen < 7)
419 assert(!"ir_unop_pack_half_2x16 should be lowered");
420
421 assert(dst.type == BRW_REGISTER_TYPE_UD);
422 assert(src0.type == BRW_REGISTER_TYPE_F);
423
424 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
425 *
426 * Because this instruction does not have a 16-bit floating-point type,
427 * the destination data type must be Word (W).
428 *
429 * The destination must be DWord-aligned and specify a horizontal stride
430 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
431 * each destination channel and the upper word is not modified.
432 *
433 * The above restriction implies that the f32to16 instruction must use
434 * align1 mode, because only in align1 mode is it possible to specify
435 * horizontal stride. We choose here to defy the hardware docs and emit
436 * align16 instructions.
437 *
438 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
439 * instructions. I was partially successful in that the code passed all
440 * tests. However, the code was dubiously correct and fragile, and the
441 * tests were not harsh enough to probe that frailty. Not trusting the
442 * code, I chose instead to remain in align16 mode in defiance of the hw
443 * docs).
444 *
445 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
446 * simulator, emitting a f32to16 in align16 mode with UD as destination
447 * data type is safe. The behavior differs from that specified in the PRM
448 * in that the upper word of each destination channel is cleared to 0.
449 */
450
451 dst_reg tmp_dst(this, glsl_type::uvec2_type);
452 src_reg tmp_src(tmp_dst);
453
454 #if 0
455 /* Verify the undocumented behavior on which the following instructions
456 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
457 * then the result of the bit-or instruction below will be incorrect.
458 *
459 * You should inspect the disasm output in order to verify that the MOV is
460 * not optimized away.
461 */
462 emit(MOV(tmp_dst, src_reg(0x12345678u)));
463 #endif
464
465 /* Give tmp the form below, where "." means untouched.
466 *
467 * w z y x w z y x
468 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
469 *
470 * That the upper word of each write-channel be 0 is required for the
471 * following bit-shift and bit-or instructions to work. Note that this
472 * relies on the undocumented hardware behavior mentioned above.
473 */
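/* Worked example: if src0.xy = (1.0f, -2.0f), f32to16 leaves
 * tmp.x = 0x00003c00 and tmp.y = 0x0000c000. The SHL of tmp.yyyy by 16
 * below then produces 0xc0000000 in dst, and the final OR with tmp.xxxx
 * yields 0xc0003c00, i.e. packHalf2x16(vec2(1.0, -2.0)).
 */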
474 tmp_dst.writemask = WRITEMASK_XY;
475 emit(F32TO16(tmp_dst, src0));
476
477 /* Give the write-channels of dst the form:
478 * 0xhhhh0000
479 */
480 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
481 emit(SHL(dst, tmp_src, src_reg(16u)));
482
483 /* Finally, give the write-channels of dst the form of packHalf2x16's
484 * output:
485 * 0xhhhhllll
486 */
487 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
488 emit(OR(dst, src_reg(dst), tmp_src));
489 }
490
491 void
492 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
493 {
494 if (brw->gen < 7)
495 assert(!"ir_unop_unpack_half_2x16 should be lowered");
496
497 assert(dst.type == BRW_REGISTER_TYPE_F);
498 assert(src0.type == BRW_REGISTER_TYPE_UD);
499
500 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
501 *
502 * Because this instruction does not have a 16-bit floating-point type,
503 * the source data type must be Word (W). The destination type must be
504 * F (Float).
505 *
506 * To use W as the source data type, we must adjust horizontal strides,
507 * which is only possible in align1 mode. All my [chadv] attempts at
508 * emitting align1 instructions for unpackHalf2x16 failed to pass the
509 * Piglit tests, so I gave up.
510 *
511 * I've verified that, on gen7 hardware and the simulator, it is safe to
512 * emit f16to32 in align16 mode with UD as source data type.
513 */
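/* Worked example: for src0 = 0xc0003c00, the AND and SHR below leave
 * tmp.x = 0x00003c00 and tmp.y = 0x0000c000, and f16to32 then writes
 * dst.xy = (1.0f, -2.0f), matching unpackHalf2x16(0xc0003c00).
 */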
514
515 dst_reg tmp_dst(this, glsl_type::uvec2_type);
516 src_reg tmp_src(tmp_dst);
517
518 tmp_dst.writemask = WRITEMASK_X;
519 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
520
521 tmp_dst.writemask = WRITEMASK_Y;
522 emit(SHR(tmp_dst, src0, src_reg(16u)));
523
524 dst.writemask = WRITEMASK_XY;
525 emit(F16TO32(dst, tmp_src));
526 }
527
528 void
529 vec4_visitor::visit_instructions(const exec_list *list)
530 {
531 foreach_list(node, list) {
532 ir_instruction *ir = (ir_instruction *)node;
533
534 base_ir = ir;
535 ir->accept(this);
536 }
537 }
538
539
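/* Returns the number of vec4-sized slots a variable of the given type
 * occupies in this backend. For example, float, vec2 and vec4 each take
 * one slot, mat3 takes three, vec4[2] takes two, and
 * struct { vec3 v; mat2 m; } takes 1 + 2 = 3 slots.
 */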
540 static int
541 type_size(const struct glsl_type *type)
542 {
543 unsigned int i;
544 int size;
545
546 switch (type->base_type) {
547 case GLSL_TYPE_UINT:
548 case GLSL_TYPE_INT:
549 case GLSL_TYPE_FLOAT:
550 case GLSL_TYPE_BOOL:
551 if (type->is_matrix()) {
552 return type->matrix_columns;
553 } else {
554 /* Regardless of the size of the vector, it gets a vec4. This is bad
555 * packing for things like floats, but otherwise arrays become a
556 * mess. Hopefully a later pass over the code can pack scalars
557 * down if appropriate.
558 */
559 return 1;
560 }
561 case GLSL_TYPE_ARRAY:
562 assert(type->length > 0);
563 return type_size(type->fields.array) * type->length;
564 case GLSL_TYPE_STRUCT:
565 size = 0;
566 for (i = 0; i < type->length; i++) {
567 size += type_size(type->fields.structure[i].type);
568 }
569 return size;
570 case GLSL_TYPE_SAMPLER:
571 /* Samplers take up one slot in UNIFORMS[], but they're baked in
572 * at link time.
573 */
574 return 1;
575 case GLSL_TYPE_ATOMIC_UINT:
576 return 0;
577 case GLSL_TYPE_IMAGE:
578 case GLSL_TYPE_VOID:
579 case GLSL_TYPE_ERROR:
580 case GLSL_TYPE_INTERFACE:
581 assert(0);
582 break;
583 }
584
585 return 0;
586 }
587
588 int
589 vec4_visitor::virtual_grf_alloc(int size)
590 {
591 if (virtual_grf_array_size <= virtual_grf_count) {
592 if (virtual_grf_array_size == 0)
593 virtual_grf_array_size = 16;
594 else
595 virtual_grf_array_size *= 2;
596 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
597 virtual_grf_array_size);
598 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
599 virtual_grf_array_size);
600 }
601 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
602 virtual_grf_reg_count += size;
603 virtual_grf_sizes[virtual_grf_count] = size;
604 return virtual_grf_count++;
605 }
606
607 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
608 {
609 init();
610
611 this->file = GRF;
612 this->reg = v->virtual_grf_alloc(type_size(type));
613
614 if (type->is_array() || type->is_record()) {
615 this->swizzle = BRW_SWIZZLE_NOOP;
616 } else {
617 this->swizzle = swizzle_for_size(type->vector_elements);
618 }
619
620 this->type = brw_type_for_base_type(type);
621 }
622
623 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
624 {
625 init();
626
627 this->file = GRF;
628 this->reg = v->virtual_grf_alloc(type_size(type));
629
630 if (type->is_array() || type->is_record()) {
631 this->writemask = WRITEMASK_XYZW;
632 } else {
633 this->writemask = (1 << type->vector_elements) - 1;
634 }
635
636 this->type = brw_type_for_base_type(type);
637 }
638
639 /* Our support for uniforms is piggy-backed on the struct
640 * gl_fragment_program, because that's where the values actually
641 * get stored, rather than in some global gl_shader_program uniform
642 * store.
643 */
644 void
645 vec4_visitor::setup_uniform_values(ir_variable *ir)
646 {
647 int namelen = strlen(ir->name);
648
649 /* The data for our (non-builtin) uniforms is stored in a series of
650 * gl_uniform_driver_storage structs for each subcomponent that
651 * glGetUniformLocation() could name. We know it's been set up in the same
652 * order we'd walk the type, so walk the list of storage and find anything
653 * with our name, or the prefix of a component that starts with our name.
654 */
655 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
656 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
657
658 if (strncmp(ir->name, storage->name, namelen) != 0 ||
659 (storage->name[namelen] != 0 &&
660 storage->name[namelen] != '.' &&
661 storage->name[namelen] != '[')) {
662 continue;
663 }
664
665 gl_constant_value *components = storage->storage;
666 unsigned vector_count = (MAX2(storage->array_elements, 1) *
667 storage->type->matrix_columns);
668
669 for (unsigned s = 0; s < vector_count; s++) {
670 assert(uniforms < uniform_array_size);
671 uniform_vector_size[uniforms] = storage->type->vector_elements;
672
673 int i;
674 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
675 stage_prog_data->param[uniforms * 4 + i] = &components->f;
676 components++;
677 }
678 for (; i < 4; i++) {
679 static float zero = 0;
680 stage_prog_data->param[uniforms * 4 + i] = &zero;
681 }
682
683 uniforms++;
684 }
685 }
686 }
687
688 void
689 vec4_visitor::setup_uniform_clipplane_values()
690 {
691 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
692
693 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
694 assert(this->uniforms < uniform_array_size);
695 this->uniform_vector_size[this->uniforms] = 4;
696 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
697 this->userplane[i].type = BRW_REGISTER_TYPE_F;
698 for (int j = 0; j < 4; ++j) {
699 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
700 }
701 ++this->uniforms;
702 }
703 }
704
705 /* Our support for builtin uniforms is even scarier than non-builtin.
706 * It sits on top of the PROG_STATE_VAR parameters that are
707 * automatically updated from GL context state.
708 */
709 void
710 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
711 {
712 const ir_state_slot *const slots = ir->state_slots;
713 assert(ir->state_slots != NULL);
714
715 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
716 /* This state reference has already been set up by ir_to_mesa,
717 * but we'll get the same index back here. We can reference
718 * ParameterValues directly, since unlike brw_fs.cpp, we never
719 * add new state references during compile.
720 */
721 int index = _mesa_add_state_reference(this->prog->Parameters,
722 (gl_state_index *)slots[i].tokens);
723 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
724
725 assert(this->uniforms < uniform_array_size);
726 this->uniform_vector_size[this->uniforms] = 0;
727 /* Add each of the unique swizzled channels of the element.
728 * This will end up matching the size of the glsl_type of this field.
729 */
730 int last_swiz = -1;
731 for (unsigned int j = 0; j < 4; j++) {
732 int swiz = GET_SWZ(slots[i].swizzle, j);
733 last_swiz = swiz;
734
735 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
736 assert(this->uniforms < uniform_array_size);
737 if (swiz <= last_swiz)
738 this->uniform_vector_size[this->uniforms]++;
739 }
740 this->uniforms++;
741 }
742 }
743
744 dst_reg *
745 vec4_visitor::variable_storage(ir_variable *var)
746 {
747 return (dst_reg *)hash_table_find(this->variable_ht, var);
748 }
749
750 void
751 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
752 {
753 ir_expression *expr = ir->as_expression();
754
755 *predicate = BRW_PREDICATE_NORMAL;
756
757 if (expr) {
758 src_reg op[2];
759 vec4_instruction *inst;
760
761 assert(expr->get_num_operands() <= 2);
762 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
763 expr->operands[i]->accept(this);
764 op[i] = this->result;
765
766 resolve_ud_negate(&op[i]);
767 }
768
769 switch (expr->operation) {
770 case ir_unop_logic_not:
771 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
772 inst->conditional_mod = BRW_CONDITIONAL_Z;
773 break;
774
775 case ir_binop_logic_xor:
776 inst = emit(XOR(dst_null_d(), op[0], op[1]));
777 inst->conditional_mod = BRW_CONDITIONAL_NZ;
778 break;
779
780 case ir_binop_logic_or:
781 inst = emit(OR(dst_null_d(), op[0], op[1]));
782 inst->conditional_mod = BRW_CONDITIONAL_NZ;
783 break;
784
785 case ir_binop_logic_and:
786 inst = emit(AND(dst_null_d(), op[0], op[1]));
787 inst->conditional_mod = BRW_CONDITIONAL_NZ;
788 break;
789
790 case ir_unop_f2b:
791 if (brw->gen >= 6) {
792 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
793 } else {
794 inst = emit(MOV(dst_null_f(), op[0]));
795 inst->conditional_mod = BRW_CONDITIONAL_NZ;
796 }
797 break;
798
799 case ir_unop_i2b:
800 if (brw->gen >= 6) {
801 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
802 } else {
803 inst = emit(MOV(dst_null_d(), op[0]));
804 inst->conditional_mod = BRW_CONDITIONAL_NZ;
805 }
806 break;
807
808 case ir_binop_all_equal:
809 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
810 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
811 break;
812
813 case ir_binop_any_nequal:
814 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
815 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
816 break;
817
818 case ir_unop_any:
819 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
820 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
821 break;
822
823 case ir_binop_greater:
824 case ir_binop_gequal:
825 case ir_binop_less:
826 case ir_binop_lequal:
827 case ir_binop_equal:
828 case ir_binop_nequal:
829 emit(CMP(dst_null_d(), op[0], op[1],
830 brw_conditional_for_comparison(expr->operation)));
831 break;
832
833 default:
834 assert(!"not reached");
835 break;
836 }
837 return;
838 }
839
840 ir->accept(this);
841
842 resolve_ud_negate(&this->result);
843
844 if (brw->gen >= 6) {
845 vec4_instruction *inst = emit(AND(dst_null_d(),
846 this->result, src_reg(1)));
847 inst->conditional_mod = BRW_CONDITIONAL_NZ;
848 } else {
849 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
850 inst->conditional_mod = BRW_CONDITIONAL_NZ;
851 }
852 }
853
854 /**
855 * Emit a gen6 IF statement with the comparison folded into the IF
856 * instruction.
857 */
858 void
859 vec4_visitor::emit_if_gen6(ir_if *ir)
860 {
861 ir_expression *expr = ir->condition->as_expression();
862
863 if (expr) {
864 src_reg op[2];
865 dst_reg temp;
866
867 assert(expr->get_num_operands() <= 2);
868 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
869 expr->operands[i]->accept(this);
870 op[i] = this->result;
871 }
872
873 switch (expr->operation) {
874 case ir_unop_logic_not:
875 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
876 return;
877
878 case ir_binop_logic_xor:
879 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
880 return;
881
882 case ir_binop_logic_or:
883 temp = dst_reg(this, glsl_type::bool_type);
884 emit(OR(temp, op[0], op[1]));
885 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
886 return;
887
888 case ir_binop_logic_and:
889 temp = dst_reg(this, glsl_type::bool_type);
890 emit(AND(temp, op[0], op[1]));
891 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
892 return;
893
894 case ir_unop_f2b:
895 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
896 return;
897
898 case ir_unop_i2b:
899 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
900 return;
901
902 case ir_binop_greater:
903 case ir_binop_gequal:
904 case ir_binop_less:
905 case ir_binop_lequal:
906 case ir_binop_equal:
907 case ir_binop_nequal:
908 emit(IF(op[0], op[1],
909 brw_conditional_for_comparison(expr->operation)));
910 return;
911
912 case ir_binop_all_equal:
913 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
914 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
915 return;
916
917 case ir_binop_any_nequal:
918 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
919 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
920 return;
921
922 case ir_unop_any:
923 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
924 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
925 return;
926
927 default:
928 assert(!"not reached");
929 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
930 return;
931 }
932 return;
933 }
934
935 ir->condition->accept(this);
936
937 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
938 }
939
940 void
941 vec4_visitor::visit(ir_variable *ir)
942 {
943 dst_reg *reg = NULL;
944
945 if (variable_storage(ir))
946 return;
947
948 switch (ir->data.mode) {
949 case ir_var_shader_in:
950 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
951 break;
952
953 case ir_var_shader_out:
954 reg = new(mem_ctx) dst_reg(this, ir->type);
955
956 for (int i = 0; i < type_size(ir->type); i++) {
957 output_reg[ir->data.location + i] = *reg;
958 output_reg[ir->data.location + i].reg_offset = i;
959 output_reg[ir->data.location + i].type =
960 brw_type_for_base_type(ir->type->get_scalar_type());
961 output_reg_annotation[ir->data.location + i] = ir->name;
962 }
963 break;
964
965 case ir_var_auto:
966 case ir_var_temporary:
967 reg = new(mem_ctx) dst_reg(this, ir->type);
968 break;
969
970 case ir_var_uniform:
971 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
972
973 /* Thanks to the lower_ubo_reference pass, we will see only
974 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
975 * variables, so no need for them to be in variable_ht.
976 *
977 * Atomic counters take no uniform storage, no need to do
978 * anything here.
979 */
980 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
981 return;
982
983 /* Track how big the whole uniform variable is, in case we need to put a
984 * copy of its data into pull constants for array access.
985 */
986 assert(this->uniforms < uniform_array_size);
987 this->uniform_size[this->uniforms] = type_size(ir->type);
988
989 if (!strncmp(ir->name, "gl_", 3)) {
990 setup_builtin_uniform_values(ir);
991 } else {
992 setup_uniform_values(ir);
993 }
994 break;
995
996 case ir_var_system_value:
997 reg = make_reg_for_system_value(ir);
998 break;
999
1000 default:
1001 assert(!"not reached");
1002 }
1003
1004 reg->type = brw_type_for_base_type(ir->type);
1005 hash_table_insert(this->variable_ht, reg, ir);
1006 }
1007
1008 void
1009 vec4_visitor::visit(ir_loop *ir)
1010 {
1011 /* We don't want debugging output to print the whole body of the
1012 * loop as the annotation.
1013 */
1014 this->base_ir = NULL;
1015
1016 emit(BRW_OPCODE_DO);
1017
1018 visit_instructions(&ir->body_instructions);
1019
1020 emit(BRW_OPCODE_WHILE);
1021 }
1022
1023 void
1024 vec4_visitor::visit(ir_loop_jump *ir)
1025 {
1026 switch (ir->mode) {
1027 case ir_loop_jump::jump_break:
1028 emit(BRW_OPCODE_BREAK);
1029 break;
1030 case ir_loop_jump::jump_continue:
1031 emit(BRW_OPCODE_CONTINUE);
1032 break;
1033 }
1034 }
1035
1036
1037 void
1038 vec4_visitor::visit(ir_function_signature *ir)
1039 {
1040 assert(0);
1041 (void)ir;
1042 }
1043
1044 void
1045 vec4_visitor::visit(ir_function *ir)
1046 {
1047 /* Ignore function bodies other than main() -- we shouldn't see calls to
1048 * them since they should all be inlined.
1049 */
1050 if (strcmp(ir->name, "main") == 0) {
1051 const ir_function_signature *sig;
1052 exec_list empty;
1053
1054 sig = ir->matching_signature(NULL, &empty);
1055
1056 assert(sig);
1057
1058 visit_instructions(&sig->body);
1059 }
1060 }
1061
1062 bool
1063 vec4_visitor::try_emit_sat(ir_expression *ir)
1064 {
1065 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1066 if (!sat_src)
1067 return false;
1068
1069 sat_src->accept(this);
1070 src_reg src = this->result;
1071
1072 this->result = src_reg(this, ir->type);
1073 vec4_instruction *inst;
1074 inst = emit(MOV(dst_reg(this->result), src));
1075 inst->saturate = true;
1076
1077 return true;
1078 }
1079
1080 bool
1081 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1082 {
1083 /* 3-src instructions were introduced in gen6. */
1084 if (brw->gen < 6)
1085 return false;
1086
1087 /* MAD can only handle floating-point data. */
1088 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1089 return false;
1090
1091 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1092 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1093
1094 if (!mul || mul->operation != ir_binop_mul)
1095 return false;
1096
1097 nonmul->accept(this);
1098 src_reg src0 = fix_3src_operand(this->result);
1099
1100 mul->operands[0]->accept(this);
1101 src_reg src1 = fix_3src_operand(this->result);
1102
1103 mul->operands[1]->accept(this);
1104 src_reg src2 = fix_3src_operand(this->result);
1105
1106 this->result = src_reg(this, ir->type);
1107 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1108
1109 return true;
1110 }
1111
1112 void
1113 vec4_visitor::emit_bool_comparison(unsigned int op,
1114 dst_reg dst, src_reg src0, src_reg src1)
1115 {
1116 /* original gen4 does destination conversion before comparison. */
1117 if (brw->gen < 5)
1118 dst.type = src0.type;
1119
1120 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1121
1122 dst.type = BRW_REGISTER_TYPE_D;
1123 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1124 }
1125
1126 void
1127 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1128 src_reg src0, src_reg src1)
1129 {
1130 vec4_instruction *inst;
1131
1132 if (brw->gen >= 6) {
1133 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1134 inst->conditional_mod = conditionalmod;
1135 } else {
1136 emit(CMP(dst, src0, src1, conditionalmod));
1137
1138 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1139 inst->predicate = BRW_PREDICATE_NORMAL;
1140 }
1141 }
1142
1143 void
1144 vec4_visitor::emit_lrp(const dst_reg &dst,
1145 const src_reg &x, const src_reg &y, const src_reg &a)
1146 {
1147 if (brw->gen >= 6) {
1148 /* Note that the instruction's argument order is reversed from GLSL
1149 * and the IR.
1150 */
1151 emit(LRP(dst,
1152 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1153 } else {
1154 /* Earlier generations don't support three source operations, so we
1155 * need to emit x*(1-a) + y*a.
1156 *
1157 * A better way to do this would be:
1158 * ADD one_minus_a, negate(a), 1.0f
1159 * MUL null, y, a
1160 * MAC dst, x, one_minus_a
1161 * but we would need to support MAC and implicit accumulator.
1162 */
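/* Worked example: for x = 2.0, y = 6.0, a = 0.25 the sequence below
 * computes y_times_a = 1.5, one_minus_a = 0.75, x_times_one_minus_a = 1.5,
 * and the final ADD yields 3.0, which equals mix(2.0, 6.0, 0.25).
 */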
1163 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1164 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1165 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1166 y_times_a.writemask = dst.writemask;
1167 one_minus_a.writemask = dst.writemask;
1168 x_times_one_minus_a.writemask = dst.writemask;
1169
1170 emit(MUL(y_times_a, y, a));
1171 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1172 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1173 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1174 }
1175 }
1176
1177 void
1178 vec4_visitor::visit(ir_expression *ir)
1179 {
1180 unsigned int operand;
1181 src_reg op[Elements(ir->operands)];
1182 src_reg result_src;
1183 dst_reg result_dst;
1184 vec4_instruction *inst;
1185
1186 if (try_emit_sat(ir))
1187 return;
1188
1189 if (ir->operation == ir_binop_add) {
1190 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1191 return;
1192 }
1193
1194 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1195 this->result.file = BAD_FILE;
1196 ir->operands[operand]->accept(this);
1197 if (this->result.file == BAD_FILE) {
1198 fprintf(stderr, "Failed to get tree for expression operand:\n");
1199 ir->operands[operand]->fprint(stderr);
1200 exit(1);
1201 }
1202 op[operand] = this->result;
1203
1204 /* Matrix expression operands should have been broken down to vector
1205 * operations already.
1206 */
1207 assert(!ir->operands[operand]->type->is_matrix());
1208 }
1209
1210 int vector_elements = ir->operands[0]->type->vector_elements;
1211 if (ir->operands[1]) {
1212 vector_elements = MAX2(vector_elements,
1213 ir->operands[1]->type->vector_elements);
1214 }
1215
1216 this->result.file = BAD_FILE;
1217
1218 /* Storage for our result. Ideally for an assignment we'd be using
1219 * the actual storage for the result here, instead.
1220 */
1221 result_src = src_reg(this, ir->type);
1222 /* convenience for the emit functions below. */
1223 result_dst = dst_reg(result_src);
1224 /* If nothing special happens, this is the result. */
1225 this->result = result_src;
1226 /* Limit writes to the channels that will be used by result_src later.
1227 * This does limit this temp's use as a temporary for multi-instruction
1228 * sequences.
1229 */
1230 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1231
1232 switch (ir->operation) {
1233 case ir_unop_logic_not:
1234 /* Note that BRW_OPCODE_NOT is not appropriate here, since it takes
1235 * the one's complement of the whole register, not just bit 0.
1236 */
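/* For example, NOT of 0x00000001 would give 0xfffffffe, which still
 * reads as "true"; XOR with 1 flips only bit 0, giving 0x00000000.
 */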
1237 emit(XOR(result_dst, op[0], src_reg(1)));
1238 break;
1239 case ir_unop_neg:
1240 op[0].negate = !op[0].negate;
1241 emit(MOV(result_dst, op[0]));
1242 break;
1243 case ir_unop_abs:
1244 op[0].abs = true;
1245 op[0].negate = false;
1246 emit(MOV(result_dst, op[0]));
1247 break;
1248
1249 case ir_unop_sign:
1250 if (ir->type->is_float()) {
1251 /* AND(val, 0x80000000) gives the sign bit.
1252 *
1253 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1254 * zero.
1255 */
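/* For example, op[0] = -2.5f (0xc0200000) keeps sign bit 0x80000000 after
 * the AND, and the predicated OR with 0x3f800000 yields 0xbf800000, i.e.
 * -1.0f. For op[0] = 0.0f the CMP leaves the predicate false, so the
 * result stays 0x00000000 = 0.0f.
 */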
1256 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1257
1258 op[0].type = BRW_REGISTER_TYPE_UD;
1259 result_dst.type = BRW_REGISTER_TYPE_UD;
1260 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1261
1262 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1263 inst->predicate = BRW_PREDICATE_NORMAL;
1264
1265 this->result.type = BRW_REGISTER_TYPE_F;
1266 } else {
1267 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1268 * -> non-negative val generates 0x00000000.
1269 * Predicated OR sets 1 if val is positive.
1270 */
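/* For example, op[0] = -7 fails the "greater than 0" CMP, so only the
 * ASR runs and produces 0xffffffff (-1); op[0] = 5 passes it, the ASR
 * gives 0, and the predicated OR turns that into 1; op[0] = 0 yields 0.
 */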
1271 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1272
1273 emit(ASR(result_dst, op[0], src_reg(31)));
1274
1275 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1276 inst->predicate = BRW_PREDICATE_NORMAL;
1277 }
1278 break;
1279
1280 case ir_unop_rcp:
1281 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1282 break;
1283
1284 case ir_unop_exp2:
1285 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1286 break;
1287 case ir_unop_log2:
1288 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1289 break;
1290 case ir_unop_exp:
1291 case ir_unop_log:
1292 assert(!"not reached: should be handled by ir_explog_to_explog2");
1293 break;
1294 case ir_unop_sin:
1295 case ir_unop_sin_reduced:
1296 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1297 break;
1298 case ir_unop_cos:
1299 case ir_unop_cos_reduced:
1300 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1301 break;
1302
1303 case ir_unop_dFdx:
1304 case ir_unop_dFdy:
1305 assert(!"derivatives not valid in vertex shader");
1306 break;
1307
1308 case ir_unop_bitfield_reverse:
1309 emit(BFREV(result_dst, op[0]));
1310 break;
1311 case ir_unop_bit_count:
1312 emit(CBIT(result_dst, op[0]));
1313 break;
1314 case ir_unop_find_msb: {
1315 src_reg temp = src_reg(this, glsl_type::uint_type);
1316
1317 inst = emit(FBH(dst_reg(temp), op[0]));
1318 inst->dst.writemask = WRITEMASK_XYZW;
1319
1320 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1321 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1322 * subtract the result from 31 to convert the MSB count into an LSB count.
1323 */
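/* For example, following that conversion, an input of 0x00000010 should
 * make FBH report 27 (counting down from bit 31), and 31 - 27 = 4 is the
 * value findMSB() expects.
 */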
1324
1325 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1326 temp.swizzle = BRW_SWIZZLE_NOOP;
1327 emit(MOV(result_dst, temp));
1328
1329 src_reg src_tmp = src_reg(result_dst);
1330 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1331
1332 src_tmp.negate = true;
1333 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1334 inst->predicate = BRW_PREDICATE_NORMAL;
1335 break;
1336 }
1337 case ir_unop_find_lsb:
1338 emit(FBL(result_dst, op[0]));
1339 break;
1340
1341 case ir_unop_noise:
1342 assert(!"not reached: should be handled by lower_noise");
1343 break;
1344
1345 case ir_binop_add:
1346 emit(ADD(result_dst, op[0], op[1]));
1347 break;
1348 case ir_binop_sub:
1349 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1350 break;
1351
1352 case ir_binop_mul:
1353 if (brw->gen < 8 && ir->type->is_integer()) {
1354 /* For integer multiplication, the MUL uses the low 16 bits of one of
1355 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1356 * accumulates in the contribution of the upper 16 bits of that
1357 * operand. If we can determine that one of the args is in the low
1358 * 16 bits, though, we can just emit a single MUL.
1359 */
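/* For example, when multiplying by a literal 7u, its upper 16 bits are
 * zero, so the low-16-bit MUL already produces the full 32-bit product
 * and the MACH/accumulator sequence below can be skipped.
 */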
1360 if (ir->operands[0]->is_uint16_constant()) {
1361 if (brw->gen < 7)
1362 emit(MUL(result_dst, op[0], op[1]));
1363 else
1364 emit(MUL(result_dst, op[1], op[0]));
1365 } else if (ir->operands[1]->is_uint16_constant()) {
1366 if (brw->gen < 7)
1367 emit(MUL(result_dst, op[1], op[0]));
1368 else
1369 emit(MUL(result_dst, op[0], op[1]));
1370 } else {
1371 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1372
1373 emit(MUL(acc, op[0], op[1]));
1374 emit(MACH(dst_null_d(), op[0], op[1]));
1375 emit(MOV(result_dst, src_reg(acc)));
1376 }
1377 } else {
1378 emit(MUL(result_dst, op[0], op[1]));
1379 }
1380 break;
1381 case ir_binop_imul_high: {
1382 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1383
1384 emit(MUL(acc, op[0], op[1]));
1385 emit(MACH(result_dst, op[0], op[1]));
1386 break;
1387 }
1388 case ir_binop_div:
1389 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1390 assert(ir->type->is_integer());
1391 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1392 break;
1393 case ir_binop_carry: {
1394 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1395
1396 emit(ADDC(dst_null_ud(), op[0], op[1]));
1397 emit(MOV(result_dst, src_reg(acc)));
1398 break;
1399 }
1400 case ir_binop_borrow: {
1401 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1402
1403 emit(SUBB(dst_null_ud(), op[0], op[1]));
1404 emit(MOV(result_dst, src_reg(acc)));
1405 break;
1406 }
1407 case ir_binop_mod:
1408 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1409 assert(ir->type->is_integer());
1410 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1411 break;
1412
1413 case ir_binop_less:
1414 case ir_binop_greater:
1415 case ir_binop_lequal:
1416 case ir_binop_gequal:
1417 case ir_binop_equal:
1418 case ir_binop_nequal: {
1419 emit(CMP(result_dst, op[0], op[1],
1420 brw_conditional_for_comparison(ir->operation)));
1421 emit(AND(result_dst, result_src, src_reg(0x1)));
1422 break;
1423 }
1424
1425 case ir_binop_all_equal:
1426 /* "==" operator producing a scalar boolean. */
1427 if (ir->operands[0]->type->is_vector() ||
1428 ir->operands[1]->type->is_vector()) {
1429 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1430 emit(MOV(result_dst, src_reg(0)));
1431 inst = emit(MOV(result_dst, src_reg(1)));
1432 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1433 } else {
1434 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1435 emit(AND(result_dst, result_src, src_reg(0x1)));
1436 }
1437 break;
1438 case ir_binop_any_nequal:
1439 /* "!=" operator producing a scalar boolean. */
1440 if (ir->operands[0]->type->is_vector() ||
1441 ir->operands[1]->type->is_vector()) {
1442 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1443
1444 emit(MOV(result_dst, src_reg(0)));
1445 inst = emit(MOV(result_dst, src_reg(1)));
1446 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1447 } else {
1448 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1449 emit(AND(result_dst, result_src, src_reg(0x1)));
1450 }
1451 break;
1452
1453 case ir_unop_any:
1454 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1455 emit(MOV(result_dst, src_reg(0)));
1456
1457 inst = emit(MOV(result_dst, src_reg(1)));
1458 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1459 break;
1460
1461 case ir_binop_logic_xor:
1462 emit(XOR(result_dst, op[0], op[1]));
1463 break;
1464
1465 case ir_binop_logic_or:
1466 emit(OR(result_dst, op[0], op[1]));
1467 break;
1468
1469 case ir_binop_logic_and:
1470 emit(AND(result_dst, op[0], op[1]));
1471 break;
1472
1473 case ir_binop_dot:
1474 assert(ir->operands[0]->type->is_vector());
1475 assert(ir->operands[0]->type == ir->operands[1]->type);
1476 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1477 break;
1478
1479 case ir_unop_sqrt:
1480 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1481 break;
1482 case ir_unop_rsq:
1483 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1484 break;
1485
1486 case ir_unop_bitcast_i2f:
1487 case ir_unop_bitcast_u2f:
1488 this->result = op[0];
1489 this->result.type = BRW_REGISTER_TYPE_F;
1490 break;
1491
1492 case ir_unop_bitcast_f2i:
1493 this->result = op[0];
1494 this->result.type = BRW_REGISTER_TYPE_D;
1495 break;
1496
1497 case ir_unop_bitcast_f2u:
1498 this->result = op[0];
1499 this->result.type = BRW_REGISTER_TYPE_UD;
1500 break;
1501
1502 case ir_unop_i2f:
1503 case ir_unop_i2u:
1504 case ir_unop_u2i:
1505 case ir_unop_u2f:
1506 case ir_unop_b2f:
1507 case ir_unop_b2i:
1508 case ir_unop_f2i:
1509 case ir_unop_f2u:
1510 emit(MOV(result_dst, op[0]));
1511 break;
1512 case ir_unop_f2b:
1513 case ir_unop_i2b: {
1514 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1515 emit(AND(result_dst, result_src, src_reg(1)));
1516 break;
1517 }
1518
1519 case ir_unop_trunc:
1520 emit(RNDZ(result_dst, op[0]));
1521 break;
1522 case ir_unop_ceil:
1523 op[0].negate = !op[0].negate;
1524 inst = emit(RNDD(result_dst, op[0]));
1525 this->result.negate = true;
1526 break;
1527 case ir_unop_floor:
1528 inst = emit(RNDD(result_dst, op[0]));
1529 break;
1530 case ir_unop_fract:
1531 inst = emit(FRC(result_dst, op[0]));
1532 break;
1533 case ir_unop_round_even:
1534 emit(RNDE(result_dst, op[0]));
1535 break;
1536
1537 case ir_binop_min:
1538 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1539 break;
1540 case ir_binop_max:
1541 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1542 break;
1543
1544 case ir_binop_pow:
1545 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1546 break;
1547
1548 case ir_unop_bit_not:
1549 inst = emit(NOT(result_dst, op[0]));
1550 break;
1551 case ir_binop_bit_and:
1552 inst = emit(AND(result_dst, op[0], op[1]));
1553 break;
1554 case ir_binop_bit_xor:
1555 inst = emit(XOR(result_dst, op[0], op[1]));
1556 break;
1557 case ir_binop_bit_or:
1558 inst = emit(OR(result_dst, op[0], op[1]));
1559 break;
1560
1561 case ir_binop_lshift:
1562 inst = emit(SHL(result_dst, op[0], op[1]));
1563 break;
1564
1565 case ir_binop_rshift:
1566 if (ir->type->base_type == GLSL_TYPE_INT)
1567 inst = emit(ASR(result_dst, op[0], op[1]));
1568 else
1569 inst = emit(SHR(result_dst, op[0], op[1]));
1570 break;
1571
1572 case ir_binop_bfm:
1573 emit(BFI1(result_dst, op[0], op[1]));
1574 break;
1575
1576 case ir_binop_ubo_load: {
1577 ir_constant *uniform_block = ir->operands[0]->as_constant();
1578 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1579 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1580 src_reg offset;
1581
1582 /* Now, load the vector from that offset. */
1583 assert(ir->type->is_vector() || ir->type->is_scalar());
1584
1585 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1586 packed_consts.type = result.type;
1587 src_reg surf_index =
1588 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1589 if (const_offset_ir) {
1590 if (brw->gen >= 8) {
1591 /* Store the offset in a GRF so we can send-from-GRF. */
1592 offset = src_reg(this, glsl_type::int_type);
1593 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1594 } else {
1595 /* Immediates are fine on older generations since they'll be moved
1596 * to a (potentially fake) MRF at the generator level.
1597 */
1598 offset = src_reg(const_offset / 16);
1599 }
1600 } else {
1601 offset = src_reg(this, glsl_type::uint_type);
1602 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1603 }
1604
1605 if (brw->gen >= 7) {
1606 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1607 grf_offset.type = offset.type;
1608
1609 emit(MOV(grf_offset, offset));
1610
1611 emit(new(mem_ctx) vec4_instruction(this,
1612 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1613 dst_reg(packed_consts),
1614 surf_index,
1615 src_reg(grf_offset)));
1616 } else {
1617 vec4_instruction *pull =
1618 emit(new(mem_ctx) vec4_instruction(this,
1619 VS_OPCODE_PULL_CONSTANT_LOAD,
1620 dst_reg(packed_consts),
1621 surf_index,
1622 offset));
1623 pull->base_mrf = 14;
1624 pull->mlen = 1;
1625 }
1626
1627 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1628 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1629 const_offset % 16 / 4,
1630 const_offset % 16 / 4,
1631 const_offset % 16 / 4);
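/* For example, a constant byte offset of 20 fetches the vec4 at offset
 * 20 / 16 = 1, and (20 % 16) / 4 = 1 shifts the swizzle up by one
 * channel, so a scalar load ends up reading the Y component of that
 * vec4.
 */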
1632
1633 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1634 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1635 emit(CMP(result_dst, packed_consts, src_reg(0u),
1636 BRW_CONDITIONAL_NZ));
1637 emit(AND(result_dst, result, src_reg(0x1)));
1638 } else {
1639 emit(MOV(result_dst, packed_consts));
1640 }
1641 break;
1642 }
1643
1644 case ir_binop_vector_extract:
1645 assert(!"should have been lowered by vec_index_to_cond_assign");
1646 break;
1647
1648 case ir_triop_fma:
1649 op[0] = fix_3src_operand(op[0]);
1650 op[1] = fix_3src_operand(op[1]);
1651 op[2] = fix_3src_operand(op[2]);
1652 /* Note that the instruction's argument order is reversed from GLSL
1653 * and the IR.
1654 */
1655 emit(MAD(result_dst, op[2], op[1], op[0]));
1656 break;
1657
1658 case ir_triop_lrp:
1659 emit_lrp(result_dst, op[0], op[1], op[2]);
1660 break;
1661
1662 case ir_triop_csel:
1663 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1664 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1665 inst->predicate = BRW_PREDICATE_NORMAL;
1666 break;
1667
1668 case ir_triop_bfi:
1669 op[0] = fix_3src_operand(op[0]);
1670 op[1] = fix_3src_operand(op[1]);
1671 op[2] = fix_3src_operand(op[2]);
1672 emit(BFI2(result_dst, op[0], op[1], op[2]));
1673 break;
1674
1675 case ir_triop_bitfield_extract:
1676 op[0] = fix_3src_operand(op[0]);
1677 op[1] = fix_3src_operand(op[1]);
1678 op[2] = fix_3src_operand(op[2]);
1679 /* Note that the instruction's argument order is reversed from GLSL
1680 * and the IR.
1681 */
1682 emit(BFE(result_dst, op[2], op[1], op[0]));
1683 break;
1684
1685 case ir_triop_vector_insert:
1686 assert(!"should have been lowered by lower_vector_insert");
1687 break;
1688
1689 case ir_quadop_bitfield_insert:
1690 assert(!"not reached: should be handled by "
1691 "bitfield_insert_to_bfm_bfi\n");
1692 break;
1693
1694 case ir_quadop_vector:
1695 assert(!"not reached: should be handled by lower_quadop_vector");
1696 break;
1697
1698 case ir_unop_pack_half_2x16:
1699 emit_pack_half_2x16(result_dst, op[0]);
1700 break;
1701 case ir_unop_unpack_half_2x16:
1702 emit_unpack_half_2x16(result_dst, op[0]);
1703 break;
1704 case ir_unop_pack_snorm_2x16:
1705 case ir_unop_pack_snorm_4x8:
1706 case ir_unop_pack_unorm_2x16:
1707 case ir_unop_pack_unorm_4x8:
1708 case ir_unop_unpack_snorm_2x16:
1709 case ir_unop_unpack_snorm_4x8:
1710 case ir_unop_unpack_unorm_2x16:
1711 case ir_unop_unpack_unorm_4x8:
1712 assert(!"not reached: should be handled by lower_packing_builtins");
1713 break;
1714 case ir_unop_unpack_half_2x16_split_x:
1715 case ir_unop_unpack_half_2x16_split_y:
1716 case ir_binop_pack_half_2x16_split:
1717 assert(!"not reached: should not occur in vertex shader");
1718 break;
1719 case ir_binop_ldexp:
1720 assert(!"not reached: should be handled by ldexp_to_arith()");
1721 break;
1722 }
1723 }
1724
1725
1726 void
1727 vec4_visitor::visit(ir_swizzle *ir)
1728 {
1729 src_reg src;
1730 int i = 0;
1731 int swizzle[4];
1732
1733 /* Note that this is only swizzles in expressions, not those on the left
1734 * hand side of an assignment, which do write masking. See ir_assignment
1735 * for that.
1736 */
1737
1738 ir->val->accept(this);
1739 src = this->result;
1740 assert(src.file != BAD_FILE);
1741
1742 for (i = 0; i < ir->type->vector_elements; i++) {
1743 switch (i) {
1744 case 0:
1745 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1746 break;
1747 case 1:
1748 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1749 break;
1750 case 2:
1751 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1752 break;
1753 case 3:
1754 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1755 break;
1756 }
1757 }
1758 for (; i < 4; i++) {
1759 /* Replicate the last channel out. */
1760 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1761 }
1762
1763 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1764
1765 this->result = src;
1766 }
1767
1768 void
1769 vec4_visitor::visit(ir_dereference_variable *ir)
1770 {
1771 const struct glsl_type *type = ir->type;
1772 dst_reg *reg = variable_storage(ir->var);
1773
1774 if (!reg) {
1775 fail("Failed to find variable storage for %s\n", ir->var->name);
1776 this->result = src_reg(brw_null_reg());
1777 return;
1778 }
1779
1780 this->result = src_reg(*reg);
1781
1782 /* System values get their swizzle from the dst_reg writemask */
1783 if (ir->var->data.mode == ir_var_system_value)
1784 return;
1785
1786 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1787 this->result.swizzle = swizzle_for_size(type->vector_elements);
1788 }
1789
1790
1791 int
1792 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1793 {
1794 /* Under normal circumstances array elements are stored consecutively, so
1795 * the stride is equal to the size of the array element.
1796 */
1797 return type_size(ir->type);
1798 }
1799
1800
1801 void
1802 vec4_visitor::visit(ir_dereference_array *ir)
1803 {
1804 ir_constant *constant_index;
1805 src_reg src;
1806 int array_stride = compute_array_stride(ir);
1807
1808 constant_index = ir->array_index->constant_expression_value();
1809
1810 ir->array->accept(this);
1811 src = this->result;
1812
1813 if (constant_index) {
1814 src.reg_offset += constant_index->value.i[0] * array_stride;
1815 } else {
1816 /* Variable index array dereference. It eats the "vec4" of the
1817 * base of the array and an index that offsets the Mesa register
1818 * index.
1819 */
1820 ir->array_index->accept(this);
1821
1822 src_reg index_reg;
1823
1824 if (array_stride == 1) {
1825 index_reg = this->result;
1826 } else {
1827 index_reg = src_reg(this, glsl_type::int_type);
1828
1829 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1830 }
1831
1832 if (src.reladdr) {
1833 src_reg temp = src_reg(this, glsl_type::int_type);
1834
1835 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1836
1837 index_reg = temp;
1838 }
1839
1840 src.reladdr = ralloc(mem_ctx, src_reg);
1841 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1842 }
1843
1844 /* If the type is smaller than a vec4, replicate the last channel out. */
1845 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1846 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1847 else
1848 src.swizzle = BRW_SWIZZLE_NOOP;
1849 src.type = brw_type_for_base_type(ir->type);
1850
1851 this->result = src;
1852 }
1853
1854 void
1855 vec4_visitor::visit(ir_dereference_record *ir)
1856 {
1857 unsigned int i;
1858 const glsl_type *struct_type = ir->record->type;
1859 int offset = 0;
1860
1861 ir->record->accept(this);
1862
1863 for (i = 0; i < struct_type->length; i++) {
1864 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1865 break;
1866 offset += type_size(struct_type->fields.structure[i].type);
1867 }
1868
1869 /* If the type is smaller than a vec4, replicate the last channel out. */
1870 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1871 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1872 else
1873 this->result.swizzle = BRW_SWIZZLE_NOOP;
1874 this->result.type = brw_type_for_base_type(ir->type);
1875
1876 this->result.reg_offset += offset;
1877 }
1878
1879 /**
1880 * We want to be careful in assignment setup to hit the actual storage
1881 * instead of potentially using a temporary like we might with the
1882 * ir_dereference handler.
1883 */
1884 static dst_reg
1885 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1886 {
1887 /* The LHS must be a dereference. If the LHS is a variable indexed array
1888 * access of a vector, it must be separated into a series conditional moves
1889 * before reaching this point (see ir_vec_index_to_cond_assign).
1890 */
1891 assert(ir->as_dereference());
1892 ir_dereference_array *deref_array = ir->as_dereference_array();
1893 if (deref_array) {
1894 assert(!deref_array->array->type->is_vector());
1895 }
1896
1897 /* Use the rvalue deref handler for the most part. We'll ignore
1898 * swizzles in it and write swizzles using writemask, though.
1899 */
1900 ir->accept(v);
1901 return dst_reg(v->result);
1902 }
1903
1904 void
1905 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1906 const struct glsl_type *type, uint32_t predicate)
1907 {
1908 if (type->base_type == GLSL_TYPE_STRUCT) {
1909 for (unsigned int i = 0; i < type->length; i++) {
1910 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1911 }
1912 return;
1913 }
1914
1915 if (type->is_array()) {
1916 for (unsigned int i = 0; i < type->length; i++) {
1917 emit_block_move(dst, src, type->fields.array, predicate);
1918 }
1919 return;
1920 }
1921
1922 if (type->is_matrix()) {
1923 const struct glsl_type *vec_type;
1924
1925 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1926 type->vector_elements, 1);
1927
1928 for (int i = 0; i < type->matrix_columns; i++) {
1929 emit_block_move(dst, src, vec_type, predicate);
1930 }
1931 return;
1932 }
1933
1934 assert(type->is_scalar() || type->is_vector());
1935
1936 dst->type = brw_type_for_base_type(type);
1937 src->type = dst->type;
1938
1939 dst->writemask = (1 << type->vector_elements) - 1;
1940
1941 src->swizzle = swizzle_for_size(type->vector_elements);
1942
1943 vec4_instruction *inst = emit(MOV(*dst, *src));
1944 inst->predicate = predicate;
1945
1946 dst->reg_offset++;
1947 src->reg_offset++;
1948 }
1949
1950
1951 /* If the RHS processing resulted in an instruction generating a
1952 * temporary value, and it would be easy to rewrite the instruction to
1953 * generate its result right into the LHS instead, do so. This ends
1954 * up reliably removing instructions where it can be tricky to do so
1955 * later without real UD chain information.
1956 */
1957 bool
1958 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1959 dst_reg dst,
1960 src_reg src,
1961 vec4_instruction *pre_rhs_inst,
1962 vec4_instruction *last_rhs_inst)
1963 {
1964 /* This could be supported, but it would take more smarts. */
1965 if (ir->condition)
1966 return false;
1967
1968 if (pre_rhs_inst == last_rhs_inst)
1969 return false; /* No instructions generated to work with. */
1970
1971 /* Make sure the last instruction generated our source reg. */
1972 if (src.file != GRF ||
1973 src.file != last_rhs_inst->dst.file ||
1974 src.reg != last_rhs_inst->dst.reg ||
1975 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1976 src.reladdr ||
1977 src.abs ||
1978 src.negate ||
1979 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1980 return false;
1981
1982    /* Check that the last instruction fully initialized the channels
1983 * we want to use, in the order we want to use them. We could
1984 * potentially reswizzle the operands of many instructions so that
1985 * we could handle out of order channels, but don't yet.
1986 */
1987
1988 for (unsigned i = 0; i < 4; i++) {
1989 if (dst.writemask & (1 << i)) {
1990 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1991 return false;
1992
1993 if (BRW_GET_SWZ(src.swizzle, i) != i)
1994 return false;
1995 }
1996 }
1997
1998 /* Success! Rewrite the instruction. */
1999 last_rhs_inst->dst.file = dst.file;
2000 last_rhs_inst->dst.reg = dst.reg;
2001 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2002 last_rhs_inst->dst.reladdr = dst.reladdr;
2003 last_rhs_inst->dst.writemask &= dst.writemask;
2004
2005 return true;
2006 }
2007
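/**
 * Struct/array/matrix assignments are lowered to block moves; scalar and
 * vector assignments remap the RHS swizzle onto the written channels, try to
 * fold the RHS-producing instruction's destination straight into the LHS via
 * try_rewrite_rhs_to_dst(), and otherwise emit (possibly predicated) MOVs.
 */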
2008 void
2009 vec4_visitor::visit(ir_assignment *ir)
2010 {
2011 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2012 uint32_t predicate = BRW_PREDICATE_NONE;
2013
2014 if (!ir->lhs->type->is_scalar() &&
2015 !ir->lhs->type->is_vector()) {
2016 ir->rhs->accept(this);
2017 src_reg src = this->result;
2018
2019 if (ir->condition) {
2020 emit_bool_to_cond_code(ir->condition, &predicate);
2021 }
2022
2023       /* emit_block_move doesn't account for swizzles in the source register.
2024        * This should be ok, since the source is a structure, array or matrix,
2025        * whose only legal swizzle is the one checked by the assert below.
2026        */
2027 assert(src.swizzle ==
2028 (ir->rhs->type->is_matrix()
2029 ? swizzle_for_size(ir->rhs->type->vector_elements)
2030 : BRW_SWIZZLE_NOOP));
2031
2032 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2033 return;
2034 }
2035
2036 /* Now we're down to just a scalar/vector with writemasks. */
2037 int i;
2038
2039 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2040 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2041
2042 ir->rhs->accept(this);
2043
2044 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2045
2046 src_reg src = this->result;
2047
2048 int swizzles[4];
2049 int first_enabled_chan = 0;
2050 int src_chan = 0;
2051
2052 assert(ir->lhs->type->is_vector() ||
2053 ir->lhs->type->is_scalar());
2054 dst.writemask = ir->write_mask;
2055
2056 for (int i = 0; i < 4; i++) {
2057 if (dst.writemask & (1 << i)) {
2058 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2059 break;
2060 }
2061 }
2062
2063 /* Swizzle a small RHS vector into the channels being written.
2064 *
2065 * glsl ir treats write_mask as dictating how many channels are
2066 * present on the RHS while in our instructions we need to make
2067 * those channels appear in the slots of the vec4 they're written to.
2068 */
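   /* For example (illustrative): for an assignment like "v.yw = u.xy", the
    * vec2 RHS typically arrives with swizzle .xyyy and write_mask YW; the
    * loop below builds .yxyy so dst.y reads src.x and dst.w reads src.y,
    * with the unwritten channels just repeating the first written one.
    */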
2069 for (int i = 0; i < 4; i++) {
2070 if (dst.writemask & (1 << i))
2071 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2072 else
2073 swizzles[i] = first_enabled_chan;
2074 }
2075 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2076 swizzles[2], swizzles[3]);
2077
2078 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2079 return;
2080 }
2081
2082 if (ir->condition) {
2083 emit_bool_to_cond_code(ir->condition, &predicate);
2084 }
2085
2086 for (i = 0; i < type_size(ir->lhs->type); i++) {
2087 vec4_instruction *inst = emit(MOV(dst, src));
2088 inst->predicate = predicate;
2089
2090 dst.reg_offset++;
2091 src.reg_offset++;
2092 }
2093 }
2094
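/**
 * Emit immediate MOVs for an ir_constant, recursing through structs, arrays
 * and matrix columns.  For vectors, components holding the same value are
 * folded into a single writemasked MOV.
 */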
2095 void
2096 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2097 {
2098 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2099 foreach_list(node, &ir->components) {
2100 ir_constant *field_value = (ir_constant *)node;
2101
2102 emit_constant_values(dst, field_value);
2103 }
2104 return;
2105 }
2106
2107 if (ir->type->is_array()) {
2108 for (unsigned int i = 0; i < ir->type->length; i++) {
2109 emit_constant_values(dst, ir->array_elements[i]);
2110 }
2111 return;
2112 }
2113
2114 if (ir->type->is_matrix()) {
2115 for (int i = 0; i < ir->type->matrix_columns; i++) {
2116 float *vec = &ir->value.f[i * ir->type->vector_elements];
2117
2118 for (int j = 0; j < ir->type->vector_elements; j++) {
2119 dst->writemask = 1 << j;
2120 dst->type = BRW_REGISTER_TYPE_F;
2121
2122 emit(MOV(*dst, src_reg(vec[j])));
2123 }
2124 dst->reg_offset++;
2125 }
2126 return;
2127 }
2128
2129 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2130
2131 for (int i = 0; i < ir->type->vector_elements; i++) {
2132 if (!(remaining_writemask & (1 << i)))
2133 continue;
2134
2135 dst->writemask = 1 << i;
2136 dst->type = brw_type_for_base_type(ir->type);
2137
2138 /* Find other components that match the one we're about to
2139 * write. Emits fewer instructions for things like vec4(0.5,
2140 * 1.5, 1.5, 1.5).
2141 */
2142 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2143 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2144 if (ir->value.b[i] == ir->value.b[j])
2145 dst->writemask |= (1 << j);
2146 } else {
2147 /* u, i, and f storage all line up, so no need for a
2148 * switch case for comparing each type.
2149 */
2150 if (ir->value.u[i] == ir->value.u[j])
2151 dst->writemask |= (1 << j);
2152 }
2153 }
2154
2155 switch (ir->type->base_type) {
2156 case GLSL_TYPE_FLOAT:
2157 emit(MOV(*dst, src_reg(ir->value.f[i])));
2158 break;
2159 case GLSL_TYPE_INT:
2160 emit(MOV(*dst, src_reg(ir->value.i[i])));
2161 break;
2162 case GLSL_TYPE_UINT:
2163 emit(MOV(*dst, src_reg(ir->value.u[i])));
2164 break;
2165 case GLSL_TYPE_BOOL:
2166 emit(MOV(*dst, src_reg(ir->value.b[i])));
2167 break;
2168 default:
2169 assert(!"Non-float/uint/int/bool constant");
2170 break;
2171 }
2172
2173 remaining_writemask &= ~dst->writemask;
2174 }
2175 dst->reg_offset++;
2176 }
2177
2178 void
2179 vec4_visitor::visit(ir_constant *ir)
2180 {
2181 dst_reg dst = dst_reg(this, ir->type);
2182 this->result = src_reg(dst);
2183
2184 emit_constant_values(&dst, ir);
2185 }
2186
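/**
 * Lower the atomic counter intrinsics to untyped surface messages: the
 * counter's binding table entry is abo_start plus its buffer index, the
 * surface offset is the (optional) array index times ATOMIC_COUNTER_SIZE plus
 * the counter's offset, and the callee name selects either an untyped surface
 * read or an INC/PREDEC untyped atomic.
 */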
2187 void
2188 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2189 {
2190 ir_dereference *deref = static_cast<ir_dereference *>(
2191 ir->actual_parameters.get_head());
2192 ir_variable *location = deref->variable_referenced();
2193 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2194 location->data.atomic.buffer_index);
2195
2196 /* Calculate the surface offset */
2197 src_reg offset(this, glsl_type::uint_type);
2198 ir_dereference_array *deref_array = deref->as_dereference_array();
2199 if (deref_array) {
2200 deref_array->array_index->accept(this);
2201
2202 src_reg tmp(this, glsl_type::uint_type);
2203 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2204 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2205 } else {
2206 offset = location->data.atomic.offset;
2207 }
2208
2209 /* Emit the appropriate machine instruction */
2210 const char *callee = ir->callee->function_name();
2211 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2212
2213 if (!strcmp("__intrinsic_atomic_read", callee)) {
2214 emit_untyped_surface_read(surf_index, dst, offset);
2215
2216 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2217 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2218 src_reg(), src_reg());
2219
2220 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2221 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2222 src_reg(), src_reg());
2223 }
2224 }
2225
2226 void
2227 vec4_visitor::visit(ir_call *ir)
2228 {
2229 const char *callee = ir->callee->function_name();
2230
2231 if (!strcmp("__intrinsic_atomic_read", callee) ||
2232 !strcmp("__intrinsic_atomic_increment", callee) ||
2233 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2234 visit_atomic_counter_intrinsic(ir);
2235 } else {
2236 assert(!"Unsupported intrinsic.");
2237 }
2238 }
2239
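/**
 * Fetch the MCS (multisample control surface) word for a texel, used by the
 * SHADER_OPCODE_TXF_CMS path on Gen7+ compressed multisample surfaces.
 * Returns a uvec4 temporary holding the SHADER_OPCODE_TXF_MCS result.
 */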
2240 src_reg
2241 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2242 {
2243 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2244 inst->base_mrf = 2;
2245 inst->mlen = 1;
2246 inst->sampler = sampler;
2247 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2248 inst->dst.writemask = WRITEMASK_XYZW;
2249
2250 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2251 int param_base = inst->base_mrf;
2252 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2253 int zero_mask = 0xf & ~coord_mask;
2254
2255 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2256 coordinate));
2257
2258 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2259 src_reg(0)));
2260
2261 emit(inst);
2262 return src_reg(inst->dst);
2263 }
2264
2265 void
2266 vec4_visitor::visit(ir_texture *ir)
2267 {
2268 int sampler =
2269 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2270
2271 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2272 * emitting anything other than setting up the constant result.
2273 */
2274 if (ir->op == ir_tg4) {
2275 ir_constant *chan = ir->lod_info.component->as_constant();
2276 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2277 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2278 dst_reg result(this, ir->type);
2279 this->result = src_reg(result);
2280 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2281 return;
2282 }
2283 }
2284
2285 /* Should be lowered by do_lower_texture_projection */
2286 assert(!ir->projector);
2287
2288 /* Should be lowered */
2289 assert(!ir->offset || !ir->offset->type->is_array());
2290
2291 /* Generate code to compute all the subexpression trees. This has to be
2292 * done before loading any values into MRFs for the sampler message since
2293 * generating these values may involve SEND messages that need the MRFs.
2294 */
2295 src_reg coordinate;
2296 if (ir->coordinate) {
2297 ir->coordinate->accept(this);
2298 coordinate = this->result;
2299 }
2300
2301 src_reg shadow_comparitor;
2302 if (ir->shadow_comparitor) {
2303 ir->shadow_comparitor->accept(this);
2304 shadow_comparitor = this->result;
2305 }
2306
2307 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2308 src_reg offset_value;
2309 if (has_nonconstant_offset) {
2310 ir->offset->accept(this);
2311 offset_value = src_reg(this->result);
2312 }
2313
2314 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2315 src_reg lod, dPdx, dPdy, sample_index, mcs;
2316 switch (ir->op) {
2317 case ir_tex:
2318 lod = src_reg(0.0f);
2319 lod_type = glsl_type::float_type;
2320 break;
2321 case ir_txf:
2322 case ir_txl:
2323 case ir_txs:
2324 ir->lod_info.lod->accept(this);
2325 lod = this->result;
2326 lod_type = ir->lod_info.lod->type;
2327 break;
2328 case ir_query_levels:
2329 lod = src_reg(0);
2330 lod_type = glsl_type::int_type;
2331 break;
2332 case ir_txf_ms:
2333 ir->lod_info.sample_index->accept(this);
2334 sample_index = this->result;
2335 sample_index_type = ir->lod_info.sample_index->type;
2336
2337 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2338 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2339 else
2340 mcs = src_reg(0u);
2341 break;
2342 case ir_txd:
2343 ir->lod_info.grad.dPdx->accept(this);
2344 dPdx = this->result;
2345
2346 ir->lod_info.grad.dPdy->accept(this);
2347 dPdy = this->result;
2348
2349 lod_type = ir->lod_info.grad.dPdx->type;
2350 break;
2351 case ir_txb:
2352 case ir_lod:
2353 case ir_tg4:
2354 break;
2355 }
2356
2357 vec4_instruction *inst = NULL;
2358 switch (ir->op) {
2359 case ir_tex:
2360 case ir_txl:
2361 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2362 break;
2363 case ir_txd:
2364 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2365 break;
2366 case ir_txf:
2367 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2368 break;
2369 case ir_txf_ms:
2370 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2371 break;
2372 case ir_txs:
2373 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2374 break;
2375 case ir_tg4:
2376 if (has_nonconstant_offset)
2377 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2378 else
2379 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2380 break;
2381 case ir_query_levels:
2382 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2383 break;
2384 case ir_txb:
2385 assert(!"TXB is not valid for vertex shaders.");
2386 break;
2387 case ir_lod:
2388 assert(!"LOD is not valid for vertex shaders.");
2389 break;
2390 default:
2391 assert(!"Unrecognized tex op");
2392 }
2393
2394 if (ir->offset != NULL && ir->op != ir_txf)
2395 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2396
2397 /* Stuff the channel select bits in the top of the texture offset */
2398 if (ir->op == ir_tg4)
2399 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2400
2401 /* The message header is necessary for:
2402 * - Gen4 (always)
2403 * - Texel offsets
2404 * - Gather channel selection
2405 * - Sampler indices too large to fit in a 4-bit value.
2406 */
2407 inst->header_present =
2408 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2409 sampler >= 16;
2410 inst->base_mrf = 2;
2411 inst->mlen = inst->header_present + 1; /* always at least one */
2412 inst->sampler = sampler;
2413 inst->dst = dst_reg(this, ir->type);
2414 inst->dst.writemask = WRITEMASK_XYZW;
2415 inst->shadow_compare = ir->shadow_comparitor != NULL;
2416
2417 /* MRF for the first parameter */
2418 int param_base = inst->base_mrf + inst->header_present;
2419
2420 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2421 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2422 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2423 } else {
2424 /* Load the coordinate */
2425 /* FINISHME: gl_clamp_mask and saturate */
2426 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2427 int zero_mask = 0xf & ~coord_mask;
2428
2429 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2430 coordinate));
2431
2432 if (zero_mask != 0) {
2433 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2434 src_reg(0)));
2435 }
2436 /* Load the shadow comparitor */
2437 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2438 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2439 WRITEMASK_X),
2440 shadow_comparitor));
2441 inst->mlen++;
2442 }
2443
2444 /* Load the LOD info */
2445 if (ir->op == ir_tex || ir->op == ir_txl) {
2446 int mrf, writemask;
2447 if (brw->gen >= 5) {
2448 mrf = param_base + 1;
2449 if (ir->shadow_comparitor) {
2450 writemask = WRITEMASK_Y;
2451 /* mlen already incremented */
2452 } else {
2453 writemask = WRITEMASK_X;
2454 inst->mlen++;
2455 }
2456 } else /* brw->gen == 4 */ {
2457 mrf = param_base;
2458 writemask = WRITEMASK_W;
2459 }
2460 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2461 } else if (ir->op == ir_txf) {
2462 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2463 } else if (ir->op == ir_txf_ms) {
2464 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2465 sample_index));
2466             if (brw->gen >= 7) {
2467                /* MCS data is in the first channel of `mcs`, but we need it in the
2468                 * .y channel of the second vec4 of params, so replicate .x across
2469                 * the whole vec4 and then mask off everything except .y. */
2470                mcs.swizzle = BRW_SWIZZLE_XXXX;
2471                emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type,
2472                         WRITEMASK_Y), mcs));
2473             }
2474             inst->mlen++;
2475 } else if (ir->op == ir_txd) {
2476 const glsl_type *type = lod_type;
2477
2478 if (brw->gen >= 5) {
2479 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2480 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2481 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2482 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2483 inst->mlen++;
2484
2485 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2486 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2487 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2488 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2489 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2490 inst->mlen++;
2491
2492 if (ir->shadow_comparitor) {
2493 emit(MOV(dst_reg(MRF, param_base + 2,
2494 ir->shadow_comparitor->type, WRITEMASK_Z),
2495 shadow_comparitor));
2496 }
2497 }
2498 } else /* brw->gen == 4 */ {
2499 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2500 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2501 inst->mlen += 2;
2502 }
2503 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2504 if (ir->shadow_comparitor) {
2505 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2506 shadow_comparitor));
2507 }
2508
2509 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2510 offset_value));
2511 inst->mlen++;
2512 }
2513 }
2514
2515 emit(inst);
2516
2517 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2518 * spec requires layers.
2519 */
2520 if (ir->op == ir_txs) {
2521 glsl_type const *type = ir->sampler->type;
2522 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2523 type->sampler_array) {
2524 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2525 writemask(inst->dst, WRITEMASK_Z),
2526 src_reg(inst->dst), src_reg(6));
2527 }
2528 }
2529
2530 if (brw->gen == 6 && ir->op == ir_tg4) {
2531 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2532 }
2533
2534 swizzle_result(ir, src_reg(inst->dst), sampler);
2535 }
2536
2537 /**
2538 * Apply workarounds for Gen6 gather with UINT/SINT
2539 */
2540 void
2541 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2542 {
2543 if (!wa)
2544 return;
2545
2546 int width = (wa & WA_8BIT) ? 8 : 16;
2547 dst_reg dst_f = dst;
2548 dst_f.type = BRW_REGISTER_TYPE_F;
2549
2550 /* Convert from UNORM to UINT */
2551 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2552 emit(MOV(dst, src_reg(dst_f)));
2553
2554 if (wa & WA_SIGN) {
2555 /* Reinterpret the UINT value as a signed INT value by
2556 * shifting the sign bit into place, then shifting back
2557 * preserving sign.
2558 */
2559 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2560 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2561 }
2562 }
2563
2564 /**
2565 * Set up the gather channel based on the swizzle, for gather4.
2566 */
2567 uint32_t
2568 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2569 {
2570 ir_constant *chan = ir->lod_info.component->as_constant();
2571 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2572 switch (swiz) {
2573 case SWIZZLE_X: return 0;
2574 case SWIZZLE_Y:
2575 /* gather4 sampler is broken for green channel on RG32F --
2576 * we must ask for blue instead.
2577 */
2578 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2579 return 2;
2580 return 1;
2581 case SWIZZLE_Z: return 2;
2582 case SWIZZLE_W: return 3;
2583 default:
2584 assert(!"Not reached"); /* zero, one swizzles handled already */
2585 return 0;
2586 }
2587 }
2588
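/**
 * Apply the texture swizzle from the sampler key to the raw sampler result:
 * ir_query_levels broadcasts the level count from .w, txs/tg4/scalar-float/
 * SWIZZLE_NOOP results are copied as-is, and everything else gets separate
 * MOVs for the copied, zero and one components.
 */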
2589 void
2590 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2591 {
2592 int s = key->tex.swizzles[sampler];
2593
2594 this->result = src_reg(this, ir->type);
2595 dst_reg swizzled_result(this->result);
2596
2597 if (ir->op == ir_query_levels) {
2598 /* # levels is in .w */
2599 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2600 emit(MOV(swizzled_result, orig_val));
2601 return;
2602 }
2603
2604 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2605 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2606 emit(MOV(swizzled_result, orig_val));
2607 return;
2608 }
2609
2610
2611 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2612 int swizzle[4] = {0};
2613
2614 for (int i = 0; i < 4; i++) {
2615 switch (GET_SWZ(s, i)) {
2616 case SWIZZLE_ZERO:
2617 zero_mask |= (1 << i);
2618 break;
2619 case SWIZZLE_ONE:
2620 one_mask |= (1 << i);
2621 break;
2622 default:
2623 copy_mask |= (1 << i);
2624 swizzle[i] = GET_SWZ(s, i);
2625 break;
2626 }
2627 }
2628
2629 if (copy_mask) {
2630 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2631 swizzled_result.writemask = copy_mask;
2632 emit(MOV(swizzled_result, orig_val));
2633 }
2634
2635 if (zero_mask) {
2636 swizzled_result.writemask = zero_mask;
2637 emit(MOV(swizzled_result, src_reg(0.0f)));
2638 }
2639
2640 if (one_mask) {
2641 swizzled_result.writemask = one_mask;
2642 emit(MOV(swizzled_result, src_reg(1.0f)));
2643 }
2644 }
2645
2646 void
2647 vec4_visitor::visit(ir_return *ir)
2648 {
2649 assert(!"not reached");
2650 }
2651
2652 void
2653 vec4_visitor::visit(ir_discard *ir)
2654 {
2655 assert(!"not reached");
2656 }
2657
2658 void
2659 vec4_visitor::visit(ir_if *ir)
2660 {
2661 /* Don't point the annotation at the if statement, because then it plus
2662 * the then and else blocks get printed.
2663 */
2664 this->base_ir = ir->condition;
2665
2666 if (brw->gen == 6) {
2667 emit_if_gen6(ir);
2668 } else {
2669 uint32_t predicate;
2670 emit_bool_to_cond_code(ir->condition, &predicate);
2671 emit(IF(predicate));
2672 }
2673
2674 visit_instructions(&ir->then_instructions);
2675
2676 if (!ir->else_instructions.is_empty()) {
2677 this->base_ir = ir->condition;
2678 emit(BRW_OPCODE_ELSE);
2679
2680 visit_instructions(&ir->else_instructions);
2681 }
2682
2683 this->base_ir = ir->condition;
2684 emit(BRW_OPCODE_ENDIF);
2685 }
2686
2687 void
2688 vec4_visitor::visit(ir_emit_vertex *)
2689 {
2690 assert(!"not reached");
2691 }
2692
2693 void
2694 vec4_visitor::visit(ir_end_primitive *)
2695 {
2696 assert(!"not reached");
2697 }
2698
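/**
 * Emit an untyped atomic message.  The offset and up to two optional operands
 * are staged in the X channel of consecutive MRFs starting at MRF 0, and the
 * atomic opcode and surface index travel as sources of the
 * SHADER_OPCODE_UNTYPED_ATOMIC instruction.
 */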
2699 void
2700 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2701 dst_reg dst, src_reg offset,
2702 src_reg src0, src_reg src1)
2703 {
2704 unsigned mlen = 0;
2705
2706 /* Set the atomic operation offset. */
2707 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2708 mlen++;
2709
2710 /* Set the atomic operation arguments. */
2711 if (src0.file != BAD_FILE) {
2712 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2713 mlen++;
2714 }
2715
2716 if (src1.file != BAD_FILE) {
2717 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2718 mlen++;
2719 }
2720
2721 /* Emit the instruction. Note that this maps to the normal SIMD8
2722 * untyped atomic message on Ivy Bridge, but that's OK because
2723 * unused channels will be masked out.
2724 */
2725 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2726 src_reg(atomic_op), src_reg(surf_index));
2727 inst->base_mrf = 0;
2728 inst->mlen = mlen;
2729 }
2730
2731 void
2732 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2733 src_reg offset)
2734 {
2735 /* Set the surface read offset. */
2736 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2737
2738 /* Emit the instruction. Note that this maps to the normal SIMD8
2739 * untyped surface read message, but that's OK because unused
2740 * channels will be masked out.
2741 */
2742 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2743 dst, src_reg(surf_index));
2744 inst->base_mrf = 0;
2745 inst->mlen = 1;
2746 }
2747
2748 void
2749 vec4_visitor::emit_ndc_computation()
2750 {
2751 /* Get the position */
2752 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2753
2754 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2755 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2756 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2757
2758 current_annotation = "NDC";
2759 dst_reg ndc_w = ndc;
2760 ndc_w.writemask = WRITEMASK_W;
2761 src_reg pos_w = pos;
2762 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2763 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2764
2765 dst_reg ndc_xyz = ndc;
2766 ndc_xyz.writemask = WRITEMASK_XYZ;
2767
2768 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2769 }
2770
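/**
 * Fill the VUE header DWords in @reg.  On Gen4/5 with point size, user
 * clipping or the negative-rhw workaround active, this packs the point width
 * and clip flags into one DWord; otherwise it writes zeros, and on Gen6+ it
 * also moves gl_PointSize, layer and viewport into their dedicated channels.
 */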
2771 void
2772 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2773 {
2774 if (brw->gen < 6 &&
2775 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2776 key->userclip_active || brw->has_negative_rhw_bug)) {
2777 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2778 dst_reg header1_w = header1;
2779 header1_w.writemask = WRITEMASK_W;
2780
2781 emit(MOV(header1, 0u));
2782
2783 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2784 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2785
2786 current_annotation = "Point size";
2787 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2788 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2789 }
2790
2791 if (key->userclip_active) {
2792 current_annotation = "Clipping flags";
2793 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2794 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2795
2796 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2797 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2798 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2799
2800 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2801 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2802 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2803 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2804 }
2805
2806 /* i965 clipping workaround:
2807 * 1) Test for -ve rhw
2808 * 2) If set,
2809 * set ndc = (0,0,0,0)
2810 * set ucp[6] = 1
2811 *
2812 * Later, clipping will detect ucp[6] and ensure the primitive is
2813 * clipped against all fixed planes.
2814 */
2815 if (brw->has_negative_rhw_bug) {
2816 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2817 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2818 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2819 vec4_instruction *inst;
2820 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2821 inst->predicate = BRW_PREDICATE_NORMAL;
2822 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2823 inst->predicate = BRW_PREDICATE_NORMAL;
2824 }
2825
2826 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2827 } else if (brw->gen < 6) {
2828 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2829 } else {
2830 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2831 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2832 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2833 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2834 }
2835 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2836 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2837 src_reg(output_reg[VARYING_SLOT_LAYER])));
2838 }
2839 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2840 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2841 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2842 }
2843 }
2844 }
2845
2846 void
2847 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2848 {
2849 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2850 *
2851 * "If a linked set of shaders forming the vertex stage contains no
2852 * static write to gl_ClipVertex or gl_ClipDistance, but the
2853 * application has requested clipping against user clip planes through
2854 * the API, then the coordinate written to gl_Position is used for
2855 * comparison against the user clip planes."
2856 *
2857 * This function is only called if the shader didn't write to
2858 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2859 * if the user wrote to it; otherwise we use gl_Position.
2860 */
2861 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2862 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2863 clip_vertex = VARYING_SLOT_POS;
2864 }
2865
2866 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2867 ++i) {
2868 reg.writemask = 1 << i;
2869 emit(DP4(reg,
2870 src_reg(output_reg[clip_vertex]),
2871 src_reg(this->userplane[i + offset])));
2872 }
2873 }
2874
2875 void
2876 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2877 {
2878 assert (varying < VARYING_SLOT_MAX);
2879 reg.type = output_reg[varying].type;
2880 current_annotation = output_reg_annotation[varying];
2881 /* Copy the register, saturating if necessary */
2882 vec4_instruction *inst = emit(MOV(reg,
2883 src_reg(output_reg[varying])));
2884 if ((varying == VARYING_SLOT_COL0 ||
2885 varying == VARYING_SLOT_COL1 ||
2886 varying == VARYING_SLOT_BFC0 ||
2887 varying == VARYING_SLOT_BFC1) &&
2888 key->clamp_vertex_color) {
2889 inst->saturate = true;
2890 }
2891 }
2892
2893 void
2894 vec4_visitor::emit_urb_slot(int mrf, int varying)
2895 {
2896 struct brw_reg hw_reg = brw_message_reg(mrf);
2897 dst_reg reg = dst_reg(MRF, mrf);
2898 reg.type = BRW_REGISTER_TYPE_F;
2899
2900 switch (varying) {
2901 case VARYING_SLOT_PSIZ:
2902 /* PSIZ is always in slot 0, and is coupled with other flags. */
2903 current_annotation = "indices, point width, clip flags";
2904 emit_psiz_and_flags(hw_reg);
2905 break;
2906 case BRW_VARYING_SLOT_NDC:
2907 current_annotation = "NDC";
2908 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2909 break;
2910 case VARYING_SLOT_POS:
2911 current_annotation = "gl_Position";
2912 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2913 break;
2914 case VARYING_SLOT_EDGE:
2915 /* This is present when doing unfilled polygons. We're supposed to copy
2916 * the edge flag from the user-provided vertex array
2917 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2918 * of that attribute (starts as 1.0f). This is then used in clipping to
2919 * determine which edges should be drawn as wireframe.
2920 */
2921 current_annotation = "edge flag";
2922 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2923 glsl_type::float_type, WRITEMASK_XYZW))));
2924 break;
2925 case BRW_VARYING_SLOT_PAD:
2926 /* No need to write to this slot */
2927 break;
2928 default:
2929 emit_generic_urb_slot(reg, varying);
2930 break;
2931 }
2932 }
2933
2934 static int
2935 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2936 {
2937 if (brw->gen >= 6) {
2938 /* URB data written (does not include the message header reg) must
2939 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2940 * section 5.4.3.2.2: URB_INTERLEAVED.
2941 *
2942 * URB entries are allocated on a multiple of 1024 bits, so an
2943 * extra 128 bits written here to make the end align to 256 is
2944 * no problem.
2945 */
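      /* mlen counts the message header register too, so the data written is
       * mlen - 1 regs; bumping an even mlen up to odd keeps that count even.
       */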
2946 if ((mlen % 2) != 1)
2947 mlen++;
2948 }
2949
2950 return mlen;
2951 }
2952
2953
2954 /**
2955 * Generates the VUE payload plus the necessary URB write instructions to
2956 * output it.
2957 *
2958 * The VUE layout is documented in Volume 2a.
2959 */
2960 void
2961 vec4_visitor::emit_vertex()
2962 {
2963 /* MRF 0 is reserved for the debugger, so start with message header
2964 * in MRF 1.
2965 */
2966 int base_mrf = 1;
2967 int mrf = base_mrf;
2968 /* In the process of generating our URB write message contents, we
2969 * may need to unspill a register or load from an array. Those
2970 * reads would use MRFs 14-15.
2971 */
2972 int max_usable_mrf = 13;
2973
2974 /* The following assertion verifies that max_usable_mrf causes an
2975 * even-numbered amount of URB write data, which will meet gen6's
2976 * requirements for length alignment.
2977 */
2978 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2979
2980 /* First mrf is the g0-based message header containing URB handles and
2981 * such.
2982 */
2983 emit_urb_write_header(mrf++);
2984
2985 if (brw->gen < 6) {
2986 emit_ndc_computation();
2987 }
2988
2989 /* Lower legacy ff and ClipVertex clipping to clip distances */
2990 if (key->userclip_active && !prog->UsesClipDistanceOut) {
2991 current_annotation = "user clip distances";
2992
2993 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2994 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2995
2996 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2997 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2998 }
2999
3000 /* We may need to split this up into several URB writes, so do them in a
3001 * loop.
3002 */
3003 int slot = 0;
3004 bool complete = false;
3005 do {
3006 /* URB offset is in URB row increments, and each of our MRFs is half of
3007 * one of those, since we're doing interleaved writes.
3008 */
3009 int offset = slot / 2;
3010
3011 mrf = base_mrf + 1;
3012 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3013 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3014
3015 /* If this was max_usable_mrf, we can't fit anything more into this
3016 * URB WRITE.
3017 */
3018 if (mrf > max_usable_mrf) {
3019 slot++;
3020 break;
3021 }
3022 }
3023
3024 complete = slot >= prog_data->vue_map.num_slots;
3025 current_annotation = "URB write";
3026 vec4_instruction *inst = emit_urb_write_opcode(complete);
3027 inst->base_mrf = base_mrf;
3028 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3029 inst->offset += offset;
3030 } while(!complete);
3031 }
3032
3033
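/**
 * Build the offset operand for a scratch read/write of vec4 slot @reg_offset.
 * With a reladdr the offset is computed at run time by instructions emitted
 * before @inst; otherwise an immediate is returned.  Scratch rows hold two
 * interleaved vec4s, and pre-Gen6 the message header wants byte offsets,
 * hence the scaling.
 */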
3034 src_reg
3035 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3036 src_reg *reladdr, int reg_offset)
3037 {
3038 /* Because we store the values to scratch interleaved like our
3039 * vertex data, we need to scale the vec4 index by 2.
3040 */
3041 int message_header_scale = 2;
3042
3043 /* Pre-gen6, the message header uses byte offsets instead of vec4
3044 * (16-byte) offset units.
3045 */
3046 if (brw->gen < 6)
3047 message_header_scale *= 16;
3048
3049 if (reladdr) {
3050 src_reg index = src_reg(this, glsl_type::int_type);
3051
3052 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3053 emit_before(inst, MUL(dst_reg(index),
3054 index, src_reg(message_header_scale)));
3055
3056 return index;
3057 } else {
3058 return src_reg(reg_offset * message_header_scale);
3059 }
3060 }
3061
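/**
 * Like get_scratch_offset(), but for pull constant loads: reladdr offsets are
 * scaled to bytes on pre-Gen6, Gen8+ wants the offset staged in a GRF so the
 * message can be sent from GRF, and everything else uses an immediate.
 */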
3062 src_reg
3063 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3064 src_reg *reladdr, int reg_offset)
3065 {
3066 if (reladdr) {
3067 src_reg index = src_reg(this, glsl_type::int_type);
3068
3069 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3070
3071 /* Pre-gen6, the message header uses byte offsets instead of vec4
3072 * (16-byte) offset units.
3073 */
3074 if (brw->gen < 6) {
3075 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3076 }
3077
3078 return index;
3079 } else if (brw->gen >= 8) {
3080 /* Store the offset in a GRF so we can send-from-GRF. */
3081 src_reg offset = src_reg(this, glsl_type::int_type);
3082 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3083 return offset;
3084 } else {
3085 int message_header_scale = brw->gen < 6 ? 16 : 1;
3086 return src_reg(reg_offset * message_header_scale);
3087 }
3088 }
3089
3090 /**
3091 * Emits an instruction before @inst to load the value named by @orig_src
3092 * from scratch space at @base_offset to @temp.
3093 *
3094 * @base_offset is measured in 32-byte units (the size of a register).
3095 */
3096 void
3097 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3098 dst_reg temp, src_reg orig_src,
3099 int base_offset)
3100 {
3101 int reg_offset = base_offset + orig_src.reg_offset;
3102 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3103
3104 emit_before(inst, SCRATCH_READ(temp, index));
3105 }
3106
3107 /**
3108 * Emits an instruction after @inst to store the value to be written
3109 * to @orig_dst to scratch space at @base_offset, from @temp.
3110 *
3111 * @base_offset is measured in 32-byte units (the size of a register).
3112 */
3113 void
3114 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3115 {
3116 int reg_offset = base_offset + inst->dst.reg_offset;
3117 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3118
3119 /* Create a temporary register to store *inst's result in.
3120 *
3121 * We have to be careful in MOVing from our temporary result register in
3122 * the scratch write. If we swizzle from channels of the temporary that
3123 * weren't initialized, it will confuse live interval analysis, which will
3124 * make spilling fail to make progress.
3125 */
3126 src_reg temp = src_reg(this, glsl_type::vec4_type);
3127 temp.type = inst->dst.type;
3128 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3129 int swizzles[4];
3130 for (int i = 0; i < 4; i++)
3131 if (inst->dst.writemask & (1 << i))
3132 swizzles[i] = i;
3133 else
3134 swizzles[i] = first_writemask_chan;
3135 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3136 swizzles[2], swizzles[3]);
3137
3138 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3139 inst->dst.writemask));
3140 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3141 write->predicate = inst->predicate;
3142 write->ir = inst->ir;
3143 write->annotation = inst->annotation;
3144 inst->insert_after(write);
3145
3146 inst->dst.file = temp.file;
3147 inst->dst.reg = temp.reg;
3148 inst->dst.reg_offset = temp.reg_offset;
3149 inst->dst.reladdr = NULL;
3150 }
3151
3152 /**
3153 * We can't generally support array access in GRF space, because a
3154 * single instruction's destination can only span 2 contiguous
3155 * registers. So, we send all GRF arrays that get variable index
3156 * access to scratch space.
3157 */
3158 void
3159 vec4_visitor::move_grf_array_access_to_scratch()
3160 {
3161 int scratch_loc[this->virtual_grf_count];
3162
3163 for (int i = 0; i < this->virtual_grf_count; i++) {
3164 scratch_loc[i] = -1;
3165 }
3166
3167 /* First, calculate the set of virtual GRFs that need to be punted
3168 * to scratch due to having any array access on them, and where in
3169 * scratch.
3170 */
3171 foreach_list(node, &this->instructions) {
3172 vec4_instruction *inst = (vec4_instruction *)node;
3173
3174 if (inst->dst.file == GRF && inst->dst.reladdr &&
3175 scratch_loc[inst->dst.reg] == -1) {
3176 scratch_loc[inst->dst.reg] = c->last_scratch;
3177 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3178 }
3179
3180 for (int i = 0 ; i < 3; i++) {
3181 src_reg *src = &inst->src[i];
3182
3183 if (src->file == GRF && src->reladdr &&
3184 scratch_loc[src->reg] == -1) {
3185 scratch_loc[src->reg] = c->last_scratch;
3186 c->last_scratch += this->virtual_grf_sizes[src->reg];
3187 }
3188 }
3189 }
3190
3191 /* Now, for anything that will be accessed through scratch, rewrite
3192 * it to load/store. Note that this is a _safe list walk, because
3193 * we may generate a new scratch_write instruction after the one
3194 * we're processing.
3195 */
3196 foreach_list_safe(node, &this->instructions) {
3197 vec4_instruction *inst = (vec4_instruction *)node;
3198
3199 /* Set up the annotation tracking for new generated instructions. */
3200 base_ir = inst->ir;
3201 current_annotation = inst->annotation;
3202
3203 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3204 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3205 }
3206
3207 for (int i = 0 ; i < 3; i++) {
3208 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3209 continue;
3210
3211 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3212
3213 emit_scratch_read(inst, temp, inst->src[i],
3214 scratch_loc[inst->src[i].reg]);
3215
3216 inst->src[i].file = temp.file;
3217 inst->src[i].reg = temp.reg;
3218 inst->src[i].reg_offset = temp.reg_offset;
3219 inst->src[i].reladdr = NULL;
3220 }
3221 }
3222 }
3223
3224 /**
3225 * Emits an instruction before @inst to load the value named by @orig_src
3226 * from the pull constant buffer (surface) at @base_offset to @temp.
3227 */
3228 void
3229 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3230 dst_reg temp, src_reg orig_src,
3231 int base_offset)
3232 {
3233 int reg_offset = base_offset + orig_src.reg_offset;
3234 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3235 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3236 vec4_instruction *load;
3237
3238 if (brw->gen >= 7) {
3239 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3240 grf_offset.type = offset.type;
3241 emit_before(inst, MOV(grf_offset, offset));
3242
3243 load = new(mem_ctx) vec4_instruction(this,
3244 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3245 temp, index, src_reg(grf_offset));
3246 } else {
3247 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3248 temp, index, offset);
3249 load->base_mrf = 14;
3250 load->mlen = 1;
3251 }
3252 emit_before(inst, load);
3253 }
3254
3255 /**
3256 * Implements array access of uniforms by inserting a
3257 * PULL_CONSTANT_LOAD instruction.
3258 *
3259 * Unlike temporary GRF array access (where we don't support it due to
3260 * the difficulty of doing relative addressing on instruction
3261 * destinations), we could potentially do array access of uniforms
3262 * that were loaded in GRF space as push constants. In real-world
3263 * usage we've seen, though, the arrays being used are always larger
3264 * than we could load as push constants, so just always move all
3265 * uniform array access out to a pull constant buffer.
3266 */
3267 void
3268 vec4_visitor::move_uniform_array_access_to_pull_constants()
3269 {
3270 int pull_constant_loc[this->uniforms];
3271
3272 for (int i = 0; i < this->uniforms; i++) {
3273 pull_constant_loc[i] = -1;
3274 }
3275
3276 /* Walk through and find array access of uniforms. Put a copy of that
3277 * uniform in the pull constant buffer.
3278 *
3279 * Note that we don't move constant-indexed accesses to arrays. No
3280 * testing has been done of the performance impact of this choice.
3281 */
3282 foreach_list_safe(node, &this->instructions) {
3283 vec4_instruction *inst = (vec4_instruction *)node;
3284
3285 for (int i = 0 ; i < 3; i++) {
3286 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3287 continue;
3288
3289 int uniform = inst->src[i].reg;
3290
3291 /* If this array isn't already present in the pull constant buffer,
3292 * add it.
3293 */
3294 if (pull_constant_loc[uniform] == -1) {
3295 const float **values = &stage_prog_data->param[uniform * 4];
3296
3297 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3298
3299 assert(uniform < uniform_array_size);
3300 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3301 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3302 = values[j];
3303 }
3304 }
3305
3306 /* Set up the annotation tracking for new generated instructions. */
3307 base_ir = inst->ir;
3308 current_annotation = inst->annotation;
3309
3310 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3311
3312 emit_pull_constant_load(inst, temp, inst->src[i],
3313 pull_constant_loc[uniform]);
3314
3315 inst->src[i].file = temp.file;
3316 inst->src[i].reg = temp.reg;
3317 inst->src[i].reg_offset = temp.reg_offset;
3318 inst->src[i].reladdr = NULL;
3319 }
3320 }
3321
3322 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3323 * no need to track them as larger-than-vec4 objects. This will be
3324 * relied on in cutting out unused uniform vectors from push
3325 * constants.
3326 */
3327 split_uniform_registers();
3328 }
3329
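/**
 * If @reg is a UD-typed source with the negate modifier set, copy it into a
 * fresh uvec4 temporary with a plain MOV and rewrite @reg to use that, so no
 * negated UD operand survives.
 */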
3330 void
3331 vec4_visitor::resolve_ud_negate(src_reg *reg)
3332 {
3333 if (reg->type != BRW_REGISTER_TYPE_UD ||
3334 !reg->negate)
3335 return;
3336
3337 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3338 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3339 *reg = temp;
3340 }
3341
3342 vec4_visitor::vec4_visitor(struct brw_context *brw,
3343 struct brw_vec4_compile *c,
3344 struct gl_program *prog,
3345 const struct brw_vec4_prog_key *key,
3346 struct brw_vec4_prog_data *prog_data,
3347 struct gl_shader_program *shader_prog,
3348 gl_shader_stage stage,
3349 void *mem_ctx,
3350 bool debug_flag,
3351 bool no_spills,
3352 shader_time_shader_type st_base,
3353 shader_time_shader_type st_written,
3354 shader_time_shader_type st_reset)
3355 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3356 c(c),
3357 key(key),
3358 prog_data(prog_data),
3359 sanity_param_count(0),
3360 fail_msg(NULL),
3361 first_non_payload_grf(0),
3362 need_all_constants_in_pull_buffer(false),
3363 debug_flag(debug_flag),
3364 no_spills(no_spills),
3365 st_base(st_base),
3366 st_written(st_written),
3367 st_reset(st_reset)
3368 {
3369 this->mem_ctx = mem_ctx;
3370 this->failed = false;
3371
3372 this->base_ir = NULL;
3373 this->current_annotation = NULL;
3374 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3375
3376 this->variable_ht = hash_table_ctor(0,
3377 hash_table_pointer_hash,
3378 hash_table_pointer_compare);
3379
3380 this->virtual_grf_start = NULL;
3381 this->virtual_grf_end = NULL;
3382 this->virtual_grf_sizes = NULL;
3383 this->virtual_grf_count = 0;
3384 this->virtual_grf_reg_map = NULL;
3385 this->virtual_grf_reg_count = 0;
3386 this->virtual_grf_array_size = 0;
3387 this->live_intervals_valid = false;
3388
3389 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3390
3391 this->uniforms = 0;
3392
3393 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3394 * at least one. See setup_uniforms() in brw_vec4.cpp.
3395 */
3396 this->uniform_array_size = 1;
3397 if (prog_data) {
3398 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3399 }
3400
3401 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3402 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3403 }
3404
3405 vec4_visitor::~vec4_visitor()
3406 {
3407 hash_table_dtor(this->variable_ht);
3408 }
3409
3410
3411 void
3412 vec4_visitor::fail(const char *format, ...)
3413 {
3414 va_list va;
3415 char *msg;
3416
3417 if (failed)
3418 return;
3419
3420 failed = true;
3421
3422 va_start(va, format);
3423 msg = ralloc_vasprintf(mem_ctx, format, va);
3424 va_end(va);
3425 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3426
3427 this->fail_msg = msg;
3428
3429 if (debug_flag) {
3430 fprintf(stderr, "%s", msg);
3431 }
3432 }
3433
3434 } /* namespace brw */