i965/vec4: Pass const references to instruction functions.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->sampler = 0;
49 this->texture_offset = 0;
50 this->target = 0;
51 this->shadow_compare = false;
52 this->ir = v->base_ir;
53 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
54 this->header_present = false;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = v->current_annotation;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 this->instructions.push_tail(inst);
65
66 return inst;
67 }
68
69 vec4_instruction *
70 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
71 {
72 new_inst->ir = inst->ir;
73 new_inst->annotation = inst->annotation;
74
75 inst->insert_before(new_inst);
76
77 return inst;
78 }
79
80 vec4_instruction *
81 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
82 src_reg src0, src_reg src1, src_reg src2)
83 {
84 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
85 src0, src1, src2));
86 }
87
88
89 vec4_instruction *
90 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
91 {
92 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
93 }
94
95 vec4_instruction *
96 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
97 {
98 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
99 }
100
101 vec4_instruction *
102 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
103 {
104 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode)
109 {
110 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
111 }
112
113 #define ALU1(op) \
114 vec4_instruction * \
115 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
116 { \
117 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
118 src0); \
119 }
120
121 #define ALU2(op) \
122 vec4_instruction * \
123 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
124 const src_reg &src1) \
125 { \
126 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
127 src0, src1); \
128 }
129
130 #define ALU2_ACC(op) \
131 vec4_instruction * \
132 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
133 const src_reg &src1) \
134 { \
135 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
136 BRW_OPCODE_##op, dst, src0, src1); \
137 inst->writes_accumulator = true; \
138 return inst; \
139 }
140
141 #define ALU3(op) \
142 vec4_instruction * \
143 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
144 const src_reg &src1, const src_reg &src2) \
145 { \
146 assert(brw->gen >= 6); \
147 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
148 src0, src1, src2); \
149 }
150
151 ALU1(NOT)
152 ALU1(MOV)
153 ALU1(FRC)
154 ALU1(RNDD)
155 ALU1(RNDE)
156 ALU1(RNDZ)
157 ALU1(F32TO16)
158 ALU1(F16TO32)
159 ALU2(ADD)
160 ALU2(MUL)
161 ALU2_ACC(MACH)
162 ALU2(AND)
163 ALU2(OR)
164 ALU2(XOR)
165 ALU2(DP3)
166 ALU2(DP4)
167 ALU2(DPH)
168 ALU2(SHL)
169 ALU2(SHR)
170 ALU2(ASR)
171 ALU3(LRP)
172 ALU1(BFREV)
173 ALU3(BFE)
174 ALU2(BFI1)
175 ALU3(BFI2)
176 ALU1(FBH)
177 ALU1(FBL)
178 ALU1(CBIT)
179 ALU3(MAD)
180 ALU2_ACC(ADDC)
181 ALU2_ACC(SUBB)
182 ALU2(MAC)
183
184 /** Gen4 predicated IF. */
185 vec4_instruction *
186 vec4_visitor::IF(uint32_t predicate)
187 {
188 vec4_instruction *inst;
189
190 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
191 inst->predicate = predicate;
192
193 return inst;
194 }
195
196 /** Gen6 IF with embedded comparison. */
197 vec4_instruction *
198 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
221 {
222 vec4_instruction *inst;
223
224 /* Original gen4 does type conversion to the destination type
225 * before comparison, producing garbage results for floating-point
226 * comparisons.
227 */
228 if (brw->gen == 4) {
229 dst.type = src0.type;
230 if (dst.file == HW_REG)
231 dst.fixed_hw_reg.type = dst.type;
232 }
233
234 resolve_ud_negate(&src0);
235 resolve_ud_negate(&src1);
236
237 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
238 inst->conditional_mod = condition;
239
240 return inst;
241 }
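/* Illustrative usage sketch (not part of the driver): because CMP only
 * guarantees the low bit of each destination channel, callers that need a
 * clean 0/1 boolean follow it with an AND, as the expression visitor below
 * does for ir_binop_less and friends:
 *
 *    emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
 *    emit(AND(result_dst, result_src, src_reg(0x1)));
 *
 * Callers that only need the flag register (e.g. to predicate a later SEL)
 * can write to a null destination such as dst_null_d() instead.
 */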
242
243 vec4_instruction *
244 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
245 {
246 vec4_instruction *inst;
247
248 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
249 dst, index);
250 inst->base_mrf = 14;
251 inst->mlen = 2;
252
253 return inst;
254 }
255
256 vec4_instruction *
257 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
258 const src_reg &index)
259 {
260 vec4_instruction *inst;
261
262 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
263 dst, src, index);
264 inst->base_mrf = 13;
265 inst->mlen = 3;
266
267 return inst;
268 }
269
270 void
271 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
272 {
273 static enum opcode dot_opcodes[] = {
274 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
275 };
276
277 emit(dot_opcodes[elements - 2], dst, src0, src1);
278 }
279
280 src_reg
281 vec4_visitor::fix_3src_operand(src_reg src)
282 {
283 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
284 * able to use a vertical stride of zero to replicate the vec4 uniform, like
285 *
286 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
287 *
288 * But you can't, since vertical stride is always four in three-source
289 * instructions. Instead, insert a MOV instruction to do the replication so
290 * that the three-source instruction can consume it.
291 */
292
293 /* The MOV is only needed if the source is a uniform or immediate. */
294 if (src.file != UNIFORM && src.file != IMM)
295 return src;
296
297 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
298 return src;
299
300 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
301 expanded.type = src.type;
302 emit(MOV(expanded, src));
303 return src_reg(expanded);
304 }
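/* Illustrative sketch (not part of the driver): the three-source emitters in
 * this file run every operand through fix_3src_operand() first, e.g. the
 * ir_triop_bfi case below does:
 *
 *    op[0] = fix_3src_operand(op[0]);
 *    op[1] = fix_3src_operand(op[1]);
 *    op[2] = fix_3src_operand(op[2]);
 *    emit(BFI2(result_dst, op[0], op[1], op[2]));
 *
 * A uniform or immediate operand gets copied into a temporary GRF by the MOV
 * above; anything already in a GRF is passed through untouched.
 */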
305
306 src_reg
307 vec4_visitor::fix_math_operand(src_reg src)
308 {
309 /* The gen6 math instruction ignores the source modifiers --
310 * swizzle, abs, negate, and at least some parts of the register
311 * region description.
312 *
313 * Rather than trying to enumerate all these cases, *always* expand the
314 * operand to a temp GRF for gen6.
315 *
316 * For gen7, keep the operand as-is, except if immediate, which gen7 still
317 * can't use.
318 */
319
320 if (brw->gen == 7 && src.file != IMM)
321 return src;
322
323 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
324 expanded.type = src.type;
325 emit(MOV(expanded, src));
326 return src_reg(expanded);
327 }
328
329 void
330 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
331 {
332 src = fix_math_operand(src);
333
334 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
335 /* The gen6 math instruction must be align1, so we can't do
336 * writemasks.
337 */
338 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
339
340 emit(opcode, temp_dst, src);
341
342 emit(MOV(dst, src_reg(temp_dst)));
343 } else {
344 emit(opcode, dst, src);
345 }
346 }
347
348 void
349 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
350 {
351 vec4_instruction *inst = emit(opcode, dst, src);
352 inst->base_mrf = 1;
353 inst->mlen = 1;
354 }
355
356 void
357 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
358 {
359 switch (opcode) {
360 case SHADER_OPCODE_RCP:
361 case SHADER_OPCODE_RSQ:
362 case SHADER_OPCODE_SQRT:
363 case SHADER_OPCODE_EXP2:
364 case SHADER_OPCODE_LOG2:
365 case SHADER_OPCODE_SIN:
366 case SHADER_OPCODE_COS:
367 break;
368 default:
369 assert(!"not reached: bad math opcode");
370 return;
371 }
372
373 if (brw->gen >= 8) {
374 emit(opcode, dst, src);
375 } else if (brw->gen >= 6) {
376 emit_math1_gen6(opcode, dst, src);
377 } else {
378 emit_math1_gen4(opcode, dst, src);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen6(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 src0 = fix_math_operand(src0);
387 src1 = fix_math_operand(src1);
388
389 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
390 /* The gen6 math instruction must be align1, so we can't do
391 * writemasks.
392 */
393 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
394 temp_dst.type = dst.type;
395
396 emit(opcode, temp_dst, src0, src1);
397
398 emit(MOV(dst, src_reg(temp_dst)));
399 } else {
400 emit(opcode, dst, src0, src1);
401 }
402 }
403
404 void
405 vec4_visitor::emit_math2_gen4(enum opcode opcode,
406 dst_reg dst, src_reg src0, src_reg src1)
407 {
408 vec4_instruction *inst = emit(opcode, dst, src0, src1);
409 inst->base_mrf = 1;
410 inst->mlen = 2;
411 }
412
413 void
414 vec4_visitor::emit_math(enum opcode opcode,
415 dst_reg dst, src_reg src0, src_reg src1)
416 {
417 switch (opcode) {
418 case SHADER_OPCODE_POW:
419 case SHADER_OPCODE_INT_QUOTIENT:
420 case SHADER_OPCODE_INT_REMAINDER:
421 break;
422 default:
423 assert(!"not reached: unsupported binary math opcode");
424 return;
425 }
426
427 if (brw->gen >= 8) {
428 emit(opcode, dst, src0, src1);
429 } else if (brw->gen >= 6) {
430 emit_math2_gen6(opcode, dst, src0, src1);
431 } else {
432 emit_math2_gen4(opcode, dst, src0, src1);
433 }
434 }
435
436 void
437 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
438 {
439 if (brw->gen < 7)
440 assert(!"ir_unop_pack_half_2x16 should be lowered");
441
442 assert(dst.type == BRW_REGISTER_TYPE_UD);
443 assert(src0.type == BRW_REGISTER_TYPE_F);
444
445 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
446 *
447 * Because this instruction does not have a 16-bit floating-point type,
448 * the destination data type must be Word (W).
449 *
450 * The destination must be DWord-aligned and specify a horizontal stride
451 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
452 * each destination channel and the upper word is not modified.
453 *
454 * The above restriction implies that the f32to16 instruction must use
455 * align1 mode, because only in align1 mode is it possible to specify
456 * horizontal stride. We choose here to defy the hardware docs and emit
457 * align16 instructions.
458 *
459 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
460 * instructions. I was partially successful in that the code passed all
461 * tests. However, the code was dubiously correct and fragile, and the
462 * tests were not harsh enough to probe that frailty. Not trusting the
463 * code, I chose instead to remain in align16 mode in defiance of the hw
464 * docs).
465 *
466 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
467 * simulator, emitting a f32to16 in align16 mode with UD as destination
468 * data type is safe. The behavior differs from that specified in the PRM
469 * in that the upper word of each destination channel is cleared to 0.
470 */
471
472 dst_reg tmp_dst(this, glsl_type::uvec2_type);
473 src_reg tmp_src(tmp_dst);
474
475 #if 0
476 /* Verify the undocumented behavior on which the following instructions
477 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
478 * then the result of the bit-or instruction below will be incorrect.
479 *
480 * You should inspect the disasm output in order to verify that the MOV is
481 * not optimized away.
482 */
483 emit(MOV(tmp_dst, src_reg(0x12345678u)));
484 #endif
485
486 /* Give tmp the form below, where "." means untouched.
487 *
488 * w z y x w z y x
489 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
490 *
491 * That the upper word of each write-channel be 0 is required for the
492 * following bit-shift and bit-or instructions to work. Note that this
493 * relies on the undocumented hardware behavior mentioned above.
494 */
495 tmp_dst.writemask = WRITEMASK_XY;
496 emit(F32TO16(tmp_dst, src0));
497
498 /* Give the write-channels of dst the form:
499 * 0xhhhh0000
500 */
501 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
502 emit(SHL(dst, tmp_src, src_reg(16u)));
503
504 /* Finally, give the write-channels of dst the form of packHalf2x16's
505 * output:
506 * 0xhhhhllll
507 */
508 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
509 emit(OR(dst, src_reg(dst), tmp_src));
510 }
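/* Worked example (illustrative only): for src0 = vec2(1.0, 2.0), whose
 * half-float encodings are 0x3C00 and 0x4000, the sequence above produces
 *
 *    F32TO16:  tmp.x = 0x00003C00, tmp.y = 0x00004000   (upper words cleared)
 *    SHL:      dst   = 0x00004000 << 16 = 0x40000000
 *    OR:       dst   = 0x40000000 | 0x00003C00 = 0x40003C00
 *
 * which matches packHalf2x16(): the first component lands in the low 16 bits
 * of each destination channel and the second in the high 16 bits.
 */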
511
512 void
513 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
514 {
515 if (brw->gen < 7)
516 assert(!"ir_unop_unpack_half_2x16 should be lowered");
517
518 assert(dst.type == BRW_REGISTER_TYPE_F);
519 assert(src0.type == BRW_REGISTER_TYPE_UD);
520
521 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
522 *
523 * Because this instruction does not have a 16-bit floating-point type,
524 * the source data type must be Word (W). The destination type must be
525 * F (Float).
526 *
527 * To use W as the source data type, we must adjust horizontal strides,
528 * which is only possible in align1 mode. All my [chadv] attempts at
529 * emitting align1 instructions for unpackHalf2x16 failed to pass the
530 * Piglit tests, so I gave up.
531 *
532 * I've verified that, on gen7 hardware and the simulator, it is safe to
533 * emit f16to32 in align16 mode with UD as source data type.
534 */
535
536 dst_reg tmp_dst(this, glsl_type::uvec2_type);
537 src_reg tmp_src(tmp_dst);
538
539 tmp_dst.writemask = WRITEMASK_X;
540 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
541
542 tmp_dst.writemask = WRITEMASK_Y;
543 emit(SHR(tmp_dst, src0, src_reg(16u)));
544
545 dst.writemask = WRITEMASK_XY;
546 emit(F16TO32(dst, tmp_src));
547 }
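/* Worked example (illustrative only): for src0 = 0x40003C00 (the packed form
 * of vec2(1.0, 2.0) from the example above), the sequence is
 *
 *    AND:      tmp.x = 0x40003C00 & 0xffff = 0x00003C00
 *    SHR:      tmp.y = 0x40003C00 >> 16    = 0x00004000
 *    F16TO32:  dst.x = 1.0f, dst.y = 2.0f
 *
 * i.e. unpackHalf2x16() takes the low 16 bits as the first component and the
 * high 16 bits as the second.
 */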
548
549 void
550 vec4_visitor::visit_instructions(const exec_list *list)
551 {
552 foreach_list(node, list) {
553 ir_instruction *ir = (ir_instruction *)node;
554
555 base_ir = ir;
556 ir->accept(this);
557 }
558 }
559
560
561 static int
562 type_size(const struct glsl_type *type)
563 {
564 unsigned int i;
565 int size;
566
567 switch (type->base_type) {
568 case GLSL_TYPE_UINT:
569 case GLSL_TYPE_INT:
570 case GLSL_TYPE_FLOAT:
571 case GLSL_TYPE_BOOL:
572 if (type->is_matrix()) {
573 return type->matrix_columns;
574 } else {
575 /* Regardless of the size of the vector, it gets a vec4. This is bad
576 * packing for things like floats, but otherwise arrays become a
577 * mess. Hopefully a later pass over the code can pack scalars
578 * down if appropriate.
579 */
580 return 1;
581 }
582 case GLSL_TYPE_ARRAY:
583 assert(type->length > 0);
584 return type_size(type->fields.array) * type->length;
585 case GLSL_TYPE_STRUCT:
586 size = 0;
587 for (i = 0; i < type->length; i++) {
588 size += type_size(type->fields.structure[i].type);
589 }
590 return size;
591 case GLSL_TYPE_SAMPLER:
592 /* Samplers take up one slot in UNIFORMS[], but they're baked in
593 * at link time.
594 */
595 return 1;
596 case GLSL_TYPE_ATOMIC_UINT:
597 return 0;
598 case GLSL_TYPE_IMAGE:
599 case GLSL_TYPE_VOID:
600 case GLSL_TYPE_ERROR:
601 case GLSL_TYPE_INTERFACE:
602 assert(0);
603 break;
604 }
605
606 return 0;
607 }
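/* Illustrative examples of the vec4-slot counting above (not exhaustive):
 *
 *    type_size(float)                    == 1   (every scalar/vector is one vec4)
 *    type_size(vec3)                     == 1
 *    type_size(mat4)                     == 4   (one slot per column)
 *    type_size(vec2[3])                  == 3   (element size times array length)
 *    type_size(struct { vec3; float; })  == 2   (sum of the field sizes)
 */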
608
609 int
610 vec4_visitor::virtual_grf_alloc(int size)
611 {
612 if (virtual_grf_array_size <= virtual_grf_count) {
613 if (virtual_grf_array_size == 0)
614 virtual_grf_array_size = 16;
615 else
616 virtual_grf_array_size *= 2;
617 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
618 virtual_grf_array_size);
619 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
620 virtual_grf_array_size);
621 }
622 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
623 virtual_grf_reg_count += size;
624 virtual_grf_sizes[virtual_grf_count] = size;
625 return virtual_grf_count++;
626 }
627
628 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
629 {
630 init();
631
632 this->file = GRF;
633 this->reg = v->virtual_grf_alloc(type_size(type));
634
635 if (type->is_array() || type->is_record()) {
636 this->swizzle = BRW_SWIZZLE_NOOP;
637 } else {
638 this->swizzle = swizzle_for_size(type->vector_elements);
639 }
640
641 this->type = brw_type_for_base_type(type);
642 }
643
644 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
645 {
646 init();
647
648 this->file = GRF;
649 this->reg = v->virtual_grf_alloc(type_size(type));
650
651 if (type->is_array() || type->is_record()) {
652 this->writemask = WRITEMASK_XYZW;
653 } else {
654 this->writemask = (1 << type->vector_elements) - 1;
655 }
656
657 this->type = brw_type_for_base_type(type);
658 }
659
660 /* Our support for uniforms is piggy-backed on the struct
661 * gl_fragment_program, because that's where the values actually
662 * get stored, rather than in some global gl_shader_program uniform
663 * store.
664 */
665 void
666 vec4_visitor::setup_uniform_values(ir_variable *ir)
667 {
668 int namelen = strlen(ir->name);
669
670 /* The data for our (non-builtin) uniforms is stored in a series of
671 * gl_uniform_driver_storage structs for each subcomponent that
672 * glGetUniformLocation() could name. We know it's been set up in the same
673 * order we'd walk the type, so walk the list of storage and find anything
674 * with our name, or the prefix of a component that starts with our name.
675 */
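/* Illustrative example of the matching below (hypothetical uniform names):
 * for ir->name == "light", storage entries named "light", "light[2]" and
 * "light.position" are accepted, while "lighting" is rejected because the
 * character after the prefix is neither '\0', '.' nor '['.
 */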
676 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
677 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
678
679 if (strncmp(ir->name, storage->name, namelen) != 0 ||
680 (storage->name[namelen] != 0 &&
681 storage->name[namelen] != '.' &&
682 storage->name[namelen] != '[')) {
683 continue;
684 }
685
686 gl_constant_value *components = storage->storage;
687 unsigned vector_count = (MAX2(storage->array_elements, 1) *
688 storage->type->matrix_columns);
689
690 for (unsigned s = 0; s < vector_count; s++) {
691 assert(uniforms < uniform_array_size);
692 uniform_vector_size[uniforms] = storage->type->vector_elements;
693
694 int i;
695 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
696 stage_prog_data->param[uniforms * 4 + i] = &components->f;
697 components++;
698 }
699 for (; i < 4; i++) {
700 static float zero = 0;
701 stage_prog_data->param[uniforms * 4 + i] = &zero;
702 }
703
704 uniforms++;
705 }
706 }
707 }
708
709 void
710 vec4_visitor::setup_uniform_clipplane_values()
711 {
712 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
713
714 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
715 assert(this->uniforms < uniform_array_size);
716 this->uniform_vector_size[this->uniforms] = 4;
717 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
718 this->userplane[i].type = BRW_REGISTER_TYPE_F;
719 for (int j = 0; j < 4; ++j) {
720 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
721 }
722 ++this->uniforms;
723 }
724 }
725
726 /* Our support for builtin uniforms is even scarier than non-builtin.
727 * It sits on top of the PROG_STATE_VAR parameters that are
728 * automatically updated from GL context state.
729 */
730 void
731 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
732 {
733 const ir_state_slot *const slots = ir->state_slots;
734 assert(ir->state_slots != NULL);
735
736 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
737 /* This state reference has already been set up by ir_to_mesa,
738 * but we'll get the same index back here. We can reference
739 * ParameterValues directly, since unlike brw_fs.cpp, we never
740 * add new state references during compile.
741 */
742 int index = _mesa_add_state_reference(this->prog->Parameters,
743 (gl_state_index *)slots[i].tokens);
744 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
745
746 assert(this->uniforms < uniform_array_size);
747 this->uniform_vector_size[this->uniforms] = 0;
748 /* Add each of the unique swizzled channels of the element.
749 * This will end up matching the size of the glsl_type of this field.
750 */
751 int last_swiz = -1;
752 for (unsigned int j = 0; j < 4; j++) {
753 int swiz = GET_SWZ(slots[i].swizzle, j);
754 last_swiz = swiz;
755
756 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
757 assert(this->uniforms < uniform_array_size);
758 if (swiz <= last_swiz)
759 this->uniform_vector_size[this->uniforms]++;
760 }
761 this->uniforms++;
762 }
763 }
764
765 dst_reg *
766 vec4_visitor::variable_storage(ir_variable *var)
767 {
768 return (dst_reg *)hash_table_find(this->variable_ht, var);
769 }
770
771 void
772 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
773 {
774 ir_expression *expr = ir->as_expression();
775
776 *predicate = BRW_PREDICATE_NORMAL;
777
778 if (expr) {
779 src_reg op[2];
780 vec4_instruction *inst;
781
782 assert(expr->get_num_operands() <= 2);
783 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
784 expr->operands[i]->accept(this);
785 op[i] = this->result;
786
787 resolve_ud_negate(&op[i]);
788 }
789
790 switch (expr->operation) {
791 case ir_unop_logic_not:
792 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
793 inst->conditional_mod = BRW_CONDITIONAL_Z;
794 break;
795
796 case ir_binop_logic_xor:
797 inst = emit(XOR(dst_null_d(), op[0], op[1]));
798 inst->conditional_mod = BRW_CONDITIONAL_NZ;
799 break;
800
801 case ir_binop_logic_or:
802 inst = emit(OR(dst_null_d(), op[0], op[1]));
803 inst->conditional_mod = BRW_CONDITIONAL_NZ;
804 break;
805
806 case ir_binop_logic_and:
807 inst = emit(AND(dst_null_d(), op[0], op[1]));
808 inst->conditional_mod = BRW_CONDITIONAL_NZ;
809 break;
810
811 case ir_unop_f2b:
812 if (brw->gen >= 6) {
813 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
814 } else {
815 inst = emit(MOV(dst_null_f(), op[0]));
816 inst->conditional_mod = BRW_CONDITIONAL_NZ;
817 }
818 break;
819
820 case ir_unop_i2b:
821 if (brw->gen >= 6) {
822 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
823 } else {
824 inst = emit(MOV(dst_null_d(), op[0]));
825 inst->conditional_mod = BRW_CONDITIONAL_NZ;
826 }
827 break;
828
829 case ir_binop_all_equal:
830 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
831 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
832 break;
833
834 case ir_binop_any_nequal:
835 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
836 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
837 break;
838
839 case ir_unop_any:
840 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
841 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
842 break;
843
844 case ir_binop_greater:
845 case ir_binop_gequal:
846 case ir_binop_less:
847 case ir_binop_lequal:
848 case ir_binop_equal:
849 case ir_binop_nequal:
850 emit(CMP(dst_null_d(), op[0], op[1],
851 brw_conditional_for_comparison(expr->operation)));
852 break;
853
854 default:
855 assert(!"not reached");
856 break;
857 }
858 return;
859 }
860
861 ir->accept(this);
862
863 resolve_ud_negate(&this->result);
864
865 if (brw->gen >= 6) {
866 vec4_instruction *inst = emit(AND(dst_null_d(),
867 this->result, src_reg(1)));
868 inst->conditional_mod = BRW_CONDITIONAL_NZ;
869 } else {
870 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
871 inst->conditional_mod = BRW_CONDITIONAL_NZ;
872 }
873 }
874
875 /**
876 * Emit a gen6 IF statement with the comparison folded into the IF
877 * instruction.
878 */
879 void
880 vec4_visitor::emit_if_gen6(ir_if *ir)
881 {
882 ir_expression *expr = ir->condition->as_expression();
883
884 if (expr) {
885 src_reg op[2];
886 dst_reg temp;
887
888 assert(expr->get_num_operands() <= 2);
889 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
890 expr->operands[i]->accept(this);
891 op[i] = this->result;
892 }
893
894 switch (expr->operation) {
895 case ir_unop_logic_not:
896 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
897 return;
898
899 case ir_binop_logic_xor:
900 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
901 return;
902
903 case ir_binop_logic_or:
904 temp = dst_reg(this, glsl_type::bool_type);
905 emit(OR(temp, op[0], op[1]));
906 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
907 return;
908
909 case ir_binop_logic_and:
910 temp = dst_reg(this, glsl_type::bool_type);
911 emit(AND(temp, op[0], op[1]));
912 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
913 return;
914
915 case ir_unop_f2b:
916 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
917 return;
918
919 case ir_unop_i2b:
920 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
921 return;
922
923 case ir_binop_greater:
924 case ir_binop_gequal:
925 case ir_binop_less:
926 case ir_binop_lequal:
927 case ir_binop_equal:
928 case ir_binop_nequal:
929 emit(IF(op[0], op[1],
930 brw_conditional_for_comparison(expr->operation)));
931 return;
932
933 case ir_binop_all_equal:
934 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
935 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
936 return;
937
938 case ir_binop_any_nequal:
939 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
940 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
941 return;
942
943 case ir_unop_any:
944 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
945 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
946 return;
947
948 default:
949 assert(!"not reached");
950 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
951 return;
952 }
953 return;
954 }
955
956 ir->condition->accept(this);
957
958 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
959 }
960
961 void
962 vec4_visitor::visit(ir_variable *ir)
963 {
964 dst_reg *reg = NULL;
965
966 if (variable_storage(ir))
967 return;
968
969 switch (ir->data.mode) {
970 case ir_var_shader_in:
971 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
972 break;
973
974 case ir_var_shader_out:
975 reg = new(mem_ctx) dst_reg(this, ir->type);
976
977 for (int i = 0; i < type_size(ir->type); i++) {
978 output_reg[ir->data.location + i] = *reg;
979 output_reg[ir->data.location + i].reg_offset = i;
980 output_reg[ir->data.location + i].type =
981 brw_type_for_base_type(ir->type->get_scalar_type());
982 output_reg_annotation[ir->data.location + i] = ir->name;
983 }
984 break;
985
986 case ir_var_auto:
987 case ir_var_temporary:
988 reg = new(mem_ctx) dst_reg(this, ir->type);
989 break;
990
991 case ir_var_uniform:
992 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
993
994 /* Thanks to the lower_ubo_reference pass, we will see only
995 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
996 * variables, so no need for them to be in variable_ht.
997 *
998 * Atomic counters take no uniform storage, no need to do
999 * anything here.
1000 */
1001 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
1002 return;
1003
1004 /* Track how big the whole uniform variable is, in case we need to put a
1005 * copy of its data into pull constants for array access.
1006 */
1007 assert(this->uniforms < uniform_array_size);
1008 this->uniform_size[this->uniforms] = type_size(ir->type);
1009
1010 if (!strncmp(ir->name, "gl_", 3)) {
1011 setup_builtin_uniform_values(ir);
1012 } else {
1013 setup_uniform_values(ir);
1014 }
1015 break;
1016
1017 case ir_var_system_value:
1018 reg = make_reg_for_system_value(ir);
1019 break;
1020
1021 default:
1022 assert(!"not reached");
1023 }
1024
1025 reg->type = brw_type_for_base_type(ir->type);
1026 hash_table_insert(this->variable_ht, reg, ir);
1027 }
1028
1029 void
1030 vec4_visitor::visit(ir_loop *ir)
1031 {
1032 /* We don't want debugging output to print the whole body of the
1033 * loop as the annotation.
1034 */
1035 this->base_ir = NULL;
1036
1037 emit(BRW_OPCODE_DO);
1038
1039 visit_instructions(&ir->body_instructions);
1040
1041 emit(BRW_OPCODE_WHILE);
1042 }
1043
1044 void
1045 vec4_visitor::visit(ir_loop_jump *ir)
1046 {
1047 switch (ir->mode) {
1048 case ir_loop_jump::jump_break:
1049 emit(BRW_OPCODE_BREAK);
1050 break;
1051 case ir_loop_jump::jump_continue:
1052 emit(BRW_OPCODE_CONTINUE);
1053 break;
1054 }
1055 }
1056
1057
1058 void
1059 vec4_visitor::visit(ir_function_signature *ir)
1060 {
1061 assert(0);
1062 (void)ir;
1063 }
1064
1065 void
1066 vec4_visitor::visit(ir_function *ir)
1067 {
1068 /* Ignore function bodies other than main() -- we shouldn't see calls to
1069 * them since they should all be inlined.
1070 */
1071 if (strcmp(ir->name, "main") == 0) {
1072 const ir_function_signature *sig;
1073 exec_list empty;
1074
1075 sig = ir->matching_signature(NULL, &empty);
1076
1077 assert(sig);
1078
1079 visit_instructions(&sig->body);
1080 }
1081 }
1082
1083 bool
1084 vec4_visitor::try_emit_sat(ir_expression *ir)
1085 {
1086 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1087 if (!sat_src)
1088 return false;
1089
1090 sat_src->accept(this);
1091 src_reg src = this->result;
1092
1093 this->result = src_reg(this, ir->type);
1094 vec4_instruction *inst;
1095 inst = emit(MOV(dst_reg(this->result), src));
1096 inst->saturate = true;
1097
1098 return true;
1099 }
1100
1101 bool
1102 vec4_visitor::try_emit_mad(ir_expression *ir)
1103 {
1104 /* 3-src instructions were introduced in gen6. */
1105 if (brw->gen < 6)
1106 return false;
1107
1108 /* MAD can only handle floating-point data. */
1109 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1110 return false;
1111
1112 ir_rvalue *nonmul = ir->operands[1];
1113 ir_expression *mul = ir->operands[0]->as_expression();
1114
1115 if (!mul || mul->operation != ir_binop_mul) {
1116 nonmul = ir->operands[0];
1117 mul = ir->operands[1]->as_expression();
1118
1119 if (!mul || mul->operation != ir_binop_mul)
1120 return false;
1121 }
1122
1123 nonmul->accept(this);
1124 src_reg src0 = fix_3src_operand(this->result);
1125
1126 mul->operands[0]->accept(this);
1127 src_reg src1 = fix_3src_operand(this->result);
1128
1129 mul->operands[1]->accept(this);
1130 src_reg src2 = fix_3src_operand(this->result);
1131
1132 this->result = src_reg(this, ir->type);
1133 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1134
1135 return true;
1136 }
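/* Illustrative example (not part of the driver): for the IR expression
 * (a * b) + c, the code above picks up c as "nonmul" and a, b as the
 * multiplication operands, so it ends up emitting
 *
 *    MAD dst, c, a, b
 *
 * i.e. the hardware MAD takes the addend as its first source, which is why
 * the argument order looks reversed relative to GLSL's fma() (see the
 * ir_triop_fma case later in this file).
 */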
1137
1138 bool
1139 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1140 {
1141 ir_expression *const cmp = ir->operands[0]->as_expression();
1142
1143 if (cmp == NULL)
1144 return false;
1145
1146 switch (cmp->operation) {
1147 case ir_binop_less:
1148 case ir_binop_greater:
1149 case ir_binop_lequal:
1150 case ir_binop_gequal:
1151 case ir_binop_equal:
1152 case ir_binop_nequal:
1153 break;
1154
1155 default:
1156 return false;
1157 }
1158
1159 cmp->operands[0]->accept(this);
1160 const src_reg cmp_src0 = this->result;
1161
1162 cmp->operands[1]->accept(this);
1163 const src_reg cmp_src1 = this->result;
1164
1165 this->result = src_reg(this, ir->type);
1166
1167 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1168 brw_conditional_for_comparison(cmp->operation)));
1169
1170 /* If the comparison is false, this->result will just happen to be zero.
1171 */
1172 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1173 this->result, src_reg(1.0f));
1174 inst->predicate = BRW_PREDICATE_NORMAL;
1175 inst->predicate_inverse = true;
1176
1177 return true;
1178 }
1179
1180 void
1181 vec4_visitor::emit_bool_comparison(unsigned int op,
1182 dst_reg dst, src_reg src0, src_reg src1)
1183 {
1184 /* original gen4 does destination conversion before comparison. */
1185 if (brw->gen < 5)
1186 dst.type = src0.type;
1187
1188 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1189
1190 dst.type = BRW_REGISTER_TYPE_D;
1191 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1192 }
1193
1194 void
1195 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1196 src_reg src0, src_reg src1)
1197 {
1198 vec4_instruction *inst;
1199
1200 if (brw->gen >= 6) {
1201 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1202 inst->conditional_mod = conditionalmod;
1203 } else {
1204 emit(CMP(dst, src0, src1, conditionalmod));
1205
1206 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1207 inst->predicate = BRW_PREDICATE_NORMAL;
1208 }
1209 }
1210
1211 void
1212 vec4_visitor::emit_lrp(const dst_reg &dst,
1213 const src_reg &x, const src_reg &y, const src_reg &a)
1214 {
1215 if (brw->gen >= 6) {
1216 /* Note that the instruction's argument order is reversed from GLSL
1217 * and the IR.
1218 */
1219 emit(LRP(dst,
1220 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1221 } else {
1222 /* Earlier generations don't support three source operations, so we
1223 * need to emit x*(1-a) + y*a.
1224 */
1225 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1226 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1227 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1228 y_times_a.writemask = dst.writemask;
1229 one_minus_a.writemask = dst.writemask;
1230 x_times_one_minus_a.writemask = dst.writemask;
1231
1232 emit(MUL(y_times_a, y, a));
1233 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1234 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1235 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1236 }
1237 }
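/* Worked example (illustrative only): on the pre-gen6 path above,
 * lrp(x = 2.0, y = 10.0, a = 0.25) expands to
 *
 *    y_times_a           = 10.0 * 0.25  = 2.5
 *    one_minus_a         = -0.25 + 1.0  = 0.75
 *    x_times_one_minus_a = 2.0 * 0.75   = 1.5
 *    dst                 = 1.5 + 2.5    = 4.0
 *
 * which matches the gen6+ LRP result of x * (1 - a) + y * a.
 */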
1238
1239 void
1240 vec4_visitor::visit(ir_expression *ir)
1241 {
1242 unsigned int operand;
1243 src_reg op[Elements(ir->operands)];
1244 src_reg result_src;
1245 dst_reg result_dst;
1246 vec4_instruction *inst;
1247
1248 if (try_emit_sat(ir))
1249 return;
1250
1251 if (ir->operation == ir_binop_add) {
1252 if (try_emit_mad(ir))
1253 return;
1254 }
1255
1256 if (ir->operation == ir_unop_b2f) {
1257 if (try_emit_b2f_of_compare(ir))
1258 return;
1259 }
1260
1261 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1262 this->result.file = BAD_FILE;
1263 ir->operands[operand]->accept(this);
1264 if (this->result.file == BAD_FILE) {
1265 fprintf(stderr, "Failed to get tree for expression operand:\n");
1266 ir->operands[operand]->fprint(stderr);
1267 exit(1);
1268 }
1269 op[operand] = this->result;
1270
1271 /* Matrix expression operands should have been broken down to vector
1272 * operations already.
1273 */
1274 assert(!ir->operands[operand]->type->is_matrix());
1275 }
1276
1277 int vector_elements = ir->operands[0]->type->vector_elements;
1278 if (ir->operands[1]) {
1279 vector_elements = MAX2(vector_elements,
1280 ir->operands[1]->type->vector_elements);
1281 }
1282
1283 this->result.file = BAD_FILE;
1284
1285 /* Storage for our result. Ideally for an assignment we'd be using
1286 * the actual storage for the result here, instead.
1287 */
1288 result_src = src_reg(this, ir->type);
1289 /* convenience for the emit functions below. */
1290 result_dst = dst_reg(result_src);
1291 /* If nothing special happens, this is the result. */
1292 this->result = result_src;
1293 /* Limit writes to the channels that will be used by result_src later.
1294 * This does limit this temp's use as a temporary for multi-instruction
1295 * sequences.
1296 */
1297 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1298
1299 switch (ir->operation) {
1300 case ir_unop_logic_not:
1301 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is the
1302 * one's complement of the whole register, not just bit 0.
1303 */
1304 emit(XOR(result_dst, op[0], src_reg(1)));
1305 break;
1306 case ir_unop_neg:
1307 op[0].negate = !op[0].negate;
1308 emit(MOV(result_dst, op[0]));
1309 break;
1310 case ir_unop_abs:
1311 op[0].abs = true;
1312 op[0].negate = false;
1313 emit(MOV(result_dst, op[0]));
1314 break;
1315
1316 case ir_unop_sign:
1317 if (ir->type->is_float()) {
1318 /* AND(val, 0x80000000) gives the sign bit.
1319 *
1320 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1321 * zero.
1322 */
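/* Worked example (illustrative only):
 *    val = -2.5 (0xC0200000): AND -> 0x80000000, OR -> 0xBF800000 = -1.0
 *    val =  3.0 (0x40400000): AND -> 0x00000000, OR -> 0x3F800000 =  1.0
 *    val =  0.0:              AND -> 0x00000000, OR is skipped, result 0.0
 */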
1323 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1324
1325 op[0].type = BRW_REGISTER_TYPE_UD;
1326 result_dst.type = BRW_REGISTER_TYPE_UD;
1327 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1328
1329 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1330 inst->predicate = BRW_PREDICATE_NORMAL;
1331
1332 this->result.type = BRW_REGISTER_TYPE_F;
1333 } else {
1334 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1335 * -> non-negative val generates 0x00000000.
1336 * Predicated OR sets 1 if val is positive.
1337 */
1338 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1339
1340 emit(ASR(result_dst, op[0], src_reg(31)));
1341
1342 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1343 inst->predicate = BRW_PREDICATE_NORMAL;
1344 }
1345 break;
1346
1347 case ir_unop_rcp:
1348 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1349 break;
1350
1351 case ir_unop_exp2:
1352 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1353 break;
1354 case ir_unop_log2:
1355 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1356 break;
1357 case ir_unop_exp:
1358 case ir_unop_log:
1359 assert(!"not reached: should be handled by ir_explog_to_explog2");
1360 break;
1361 case ir_unop_sin:
1362 case ir_unop_sin_reduced:
1363 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1364 break;
1365 case ir_unop_cos:
1366 case ir_unop_cos_reduced:
1367 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1368 break;
1369
1370 case ir_unop_dFdx:
1371 case ir_unop_dFdy:
1372 assert(!"derivatives not valid in vertex shader");
1373 break;
1374
1375 case ir_unop_bitfield_reverse:
1376 emit(BFREV(result_dst, op[0]));
1377 break;
1378 case ir_unop_bit_count:
1379 emit(CBIT(result_dst, op[0]));
1380 break;
1381 case ir_unop_find_msb: {
1382 src_reg temp = src_reg(this, glsl_type::uint_type);
1383
1384 inst = emit(FBH(dst_reg(temp), op[0]));
1385 inst->dst.writemask = WRITEMASK_XYZW;
1386
1387 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1388 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1389 * subtract the result from 31 to convert the MSB count into an LSB count.
1390 */
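/* Worked example (illustrative only): for op[0] = 0x00000010, FBH reports the
 * set bit as 27 positions down from the MSB; the predicated ADD below then
 * computes 31 - 27 = 4, which is findMSB(0x10). For op[0] = 0, FBH returns
 * 0xFFFFFFFF (-1 as D), the CMP against -1 fails, and the result stays -1.
 */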
1391
1392 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1393 temp.swizzle = BRW_SWIZZLE_NOOP;
1394 emit(MOV(result_dst, temp));
1395
1396 src_reg src_tmp = src_reg(result_dst);
1397 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1398
1399 src_tmp.negate = true;
1400 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1401 inst->predicate = BRW_PREDICATE_NORMAL;
1402 break;
1403 }
1404 case ir_unop_find_lsb:
1405 emit(FBL(result_dst, op[0]));
1406 break;
1407
1408 case ir_unop_noise:
1409 assert(!"not reached: should be handled by lower_noise");
1410 break;
1411
1412 case ir_binop_add:
1413 emit(ADD(result_dst, op[0], op[1]));
1414 break;
1415 case ir_binop_sub:
1416 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1417 break;
1418
1419 case ir_binop_mul:
1420 if (brw->gen < 8 && ir->type->is_integer()) {
1421 /* For integer multiplication, the MUL uses the low 16 bits of one of
1422 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1423 * accumulates the contribution of the upper 16 bits of that
1424 * operand. If we can determine that one of the args is in the low
1425 * 16 bits, though, we can just emit a single MUL.
1426 */
1427 if (ir->operands[0]->is_uint16_constant()) {
1428 if (brw->gen < 7)
1429 emit(MUL(result_dst, op[0], op[1]));
1430 else
1431 emit(MUL(result_dst, op[1], op[0]));
1432 } else if (ir->operands[1]->is_uint16_constant()) {
1433 if (brw->gen < 7)
1434 emit(MUL(result_dst, op[1], op[0]));
1435 else
1436 emit(MUL(result_dst, op[0], op[1]));
1437 } else {
1438 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1439
1440 emit(MUL(acc, op[0], op[1]));
1441 emit(MACH(dst_null_d(), op[0], op[1]));
1442 emit(MOV(result_dst, src_reg(acc)));
1443 }
1444 } else {
1445 emit(MUL(result_dst, op[0], op[1]));
1446 }
1447 break;
1448 case ir_binop_imul_high: {
1449 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1450
1451 emit(MUL(acc, op[0], op[1]));
1452 emit(MACH(result_dst, op[0], op[1]));
1453 break;
1454 }
1455 case ir_binop_div:
1456 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1457 assert(ir->type->is_integer());
1458 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1459 break;
1460 case ir_binop_carry: {
1461 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1462
1463 emit(ADDC(dst_null_ud(), op[0], op[1]));
1464 emit(MOV(result_dst, src_reg(acc)));
1465 break;
1466 }
1467 case ir_binop_borrow: {
1468 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1469
1470 emit(SUBB(dst_null_ud(), op[0], op[1]));
1471 emit(MOV(result_dst, src_reg(acc)));
1472 break;
1473 }
1474 case ir_binop_mod:
1475 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1476 assert(ir->type->is_integer());
1477 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1478 break;
1479
1480 case ir_binop_less:
1481 case ir_binop_greater:
1482 case ir_binop_lequal:
1483 case ir_binop_gequal:
1484 case ir_binop_equal:
1485 case ir_binop_nequal: {
1486 emit(CMP(result_dst, op[0], op[1],
1487 brw_conditional_for_comparison(ir->operation)));
1488 emit(AND(result_dst, result_src, src_reg(0x1)));
1489 break;
1490 }
1491
1492 case ir_binop_all_equal:
1493 /* "==" operator producing a scalar boolean. */
1494 if (ir->operands[0]->type->is_vector() ||
1495 ir->operands[1]->type->is_vector()) {
1496 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1497 emit(MOV(result_dst, src_reg(0)));
1498 inst = emit(MOV(result_dst, src_reg(1)));
1499 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1500 } else {
1501 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1502 emit(AND(result_dst, result_src, src_reg(0x1)));
1503 }
1504 break;
1505 case ir_binop_any_nequal:
1506 /* "!=" operator producing a scalar boolean. */
1507 if (ir->operands[0]->type->is_vector() ||
1508 ir->operands[1]->type->is_vector()) {
1509 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1510
1511 emit(MOV(result_dst, src_reg(0)));
1512 inst = emit(MOV(result_dst, src_reg(1)));
1513 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1514 } else {
1515 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1516 emit(AND(result_dst, result_src, src_reg(0x1)));
1517 }
1518 break;
1519
1520 case ir_unop_any:
1521 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1522 emit(MOV(result_dst, src_reg(0)));
1523
1524 inst = emit(MOV(result_dst, src_reg(1)));
1525 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1526 break;
1527
1528 case ir_binop_logic_xor:
1529 emit(XOR(result_dst, op[0], op[1]));
1530 break;
1531
1532 case ir_binop_logic_or:
1533 emit(OR(result_dst, op[0], op[1]));
1534 break;
1535
1536 case ir_binop_logic_and:
1537 emit(AND(result_dst, op[0], op[1]));
1538 break;
1539
1540 case ir_binop_dot:
1541 assert(ir->operands[0]->type->is_vector());
1542 assert(ir->operands[0]->type == ir->operands[1]->type);
1543 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1544 break;
1545
1546 case ir_unop_sqrt:
1547 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1548 break;
1549 case ir_unop_rsq:
1550 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1551 break;
1552
1553 case ir_unop_bitcast_i2f:
1554 case ir_unop_bitcast_u2f:
1555 this->result = op[0];
1556 this->result.type = BRW_REGISTER_TYPE_F;
1557 break;
1558
1559 case ir_unop_bitcast_f2i:
1560 this->result = op[0];
1561 this->result.type = BRW_REGISTER_TYPE_D;
1562 break;
1563
1564 case ir_unop_bitcast_f2u:
1565 this->result = op[0];
1566 this->result.type = BRW_REGISTER_TYPE_UD;
1567 break;
1568
1569 case ir_unop_i2f:
1570 case ir_unop_i2u:
1571 case ir_unop_u2i:
1572 case ir_unop_u2f:
1573 case ir_unop_b2f:
1574 case ir_unop_b2i:
1575 case ir_unop_f2i:
1576 case ir_unop_f2u:
1577 emit(MOV(result_dst, op[0]));
1578 break;
1579 case ir_unop_f2b:
1580 case ir_unop_i2b: {
1581 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1582 emit(AND(result_dst, result_src, src_reg(1)));
1583 break;
1584 }
1585
1586 case ir_unop_trunc:
1587 emit(RNDZ(result_dst, op[0]));
1588 break;
1589 case ir_unop_ceil:
1590 op[0].negate = !op[0].negate;
1591 inst = emit(RNDD(result_dst, op[0]));
1592 this->result.negate = true;
1593 break;
1594 case ir_unop_floor:
1595 inst = emit(RNDD(result_dst, op[0]));
1596 break;
1597 case ir_unop_fract:
1598 inst = emit(FRC(result_dst, op[0]));
1599 break;
1600 case ir_unop_round_even:
1601 emit(RNDE(result_dst, op[0]));
1602 break;
1603
1604 case ir_binop_min:
1605 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1606 break;
1607 case ir_binop_max:
1608 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1609 break;
1610
1611 case ir_binop_pow:
1612 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1613 break;
1614
1615 case ir_unop_bit_not:
1616 inst = emit(NOT(result_dst, op[0]));
1617 break;
1618 case ir_binop_bit_and:
1619 inst = emit(AND(result_dst, op[0], op[1]));
1620 break;
1621 case ir_binop_bit_xor:
1622 inst = emit(XOR(result_dst, op[0], op[1]));
1623 break;
1624 case ir_binop_bit_or:
1625 inst = emit(OR(result_dst, op[0], op[1]));
1626 break;
1627
1628 case ir_binop_lshift:
1629 inst = emit(SHL(result_dst, op[0], op[1]));
1630 break;
1631
1632 case ir_binop_rshift:
1633 if (ir->type->base_type == GLSL_TYPE_INT)
1634 inst = emit(ASR(result_dst, op[0], op[1]));
1635 else
1636 inst = emit(SHR(result_dst, op[0], op[1]));
1637 break;
1638
1639 case ir_binop_bfm:
1640 emit(BFI1(result_dst, op[0], op[1]));
1641 break;
1642
1643 case ir_binop_ubo_load: {
1644 ir_constant *uniform_block = ir->operands[0]->as_constant();
1645 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1646 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1647 src_reg offset;
1648
1649 /* Now, load the vector from that offset. */
1650 assert(ir->type->is_vector() || ir->type->is_scalar());
1651
1652 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1653 packed_consts.type = result.type;
1654 src_reg surf_index =
1655 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1656 if (const_offset_ir) {
1657 if (brw->gen >= 8) {
1658 /* Store the offset in a GRF so we can send-from-GRF. */
1659 offset = src_reg(this, glsl_type::int_type);
1660 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1661 } else {
1662 /* Immediates are fine on older generations since they'll be moved
1663 * to a (potentially fake) MRF at the generator level.
1664 */
1665 offset = src_reg(const_offset / 16);
1666 }
1667 } else {
1668 offset = src_reg(this, glsl_type::uint_type);
1669 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1670 }
1671
1672 if (brw->gen >= 7) {
1673 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1674 grf_offset.type = offset.type;
1675
1676 emit(MOV(grf_offset, offset));
1677
1678 emit(new(mem_ctx) vec4_instruction(this,
1679 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1680 dst_reg(packed_consts),
1681 surf_index,
1682 src_reg(grf_offset)));
1683 } else {
1684 vec4_instruction *pull =
1685 emit(new(mem_ctx) vec4_instruction(this,
1686 VS_OPCODE_PULL_CONSTANT_LOAD,
1687 dst_reg(packed_consts),
1688 surf_index,
1689 offset));
1690 pull->base_mrf = 14;
1691 pull->mlen = 1;
1692 }
1693
1694 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1695 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1696 const_offset % 16 / 4,
1697 const_offset % 16 / 4,
1698 const_offset % 16 / 4);
1699
1700 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1701 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1702 emit(CMP(result_dst, packed_consts, src_reg(0u),
1703 BRW_CONDITIONAL_NZ));
1704 emit(AND(result_dst, result, src_reg(0x1)));
1705 } else {
1706 emit(MOV(result_dst, packed_consts));
1707 }
1708 break;
1709 }
1710
1711 case ir_binop_vector_extract:
1712 assert(!"should have been lowered by vec_index_to_cond_assign");
1713 break;
1714
1715 case ir_triop_fma:
1716 op[0] = fix_3src_operand(op[0]);
1717 op[1] = fix_3src_operand(op[1]);
1718 op[2] = fix_3src_operand(op[2]);
1719 /* Note that the instruction's argument order is reversed from GLSL
1720 * and the IR.
1721 */
1722 emit(MAD(result_dst, op[2], op[1], op[0]));
1723 break;
1724
1725 case ir_triop_lrp:
1726 emit_lrp(result_dst, op[0], op[1], op[2]);
1727 break;
1728
1729 case ir_triop_csel:
1730 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1731 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1732 inst->predicate = BRW_PREDICATE_NORMAL;
1733 break;
1734
1735 case ir_triop_bfi:
1736 op[0] = fix_3src_operand(op[0]);
1737 op[1] = fix_3src_operand(op[1]);
1738 op[2] = fix_3src_operand(op[2]);
1739 emit(BFI2(result_dst, op[0], op[1], op[2]));
1740 break;
1741
1742 case ir_triop_bitfield_extract:
1743 op[0] = fix_3src_operand(op[0]);
1744 op[1] = fix_3src_operand(op[1]);
1745 op[2] = fix_3src_operand(op[2]);
1746 /* Note that the instruction's argument order is reversed from GLSL
1747 * and the IR.
1748 */
1749 emit(BFE(result_dst, op[2], op[1], op[0]));
1750 break;
1751
1752 case ir_triop_vector_insert:
1753 assert(!"should have been lowered by lower_vector_insert");
1754 break;
1755
1756 case ir_quadop_bitfield_insert:
1757 assert(!"not reached: should be handled by "
1758 "bitfield_insert_to_bfm_bfi\n");
1759 break;
1760
1761 case ir_quadop_vector:
1762 assert(!"not reached: should be handled by lower_quadop_vector");
1763 break;
1764
1765 case ir_unop_pack_half_2x16:
1766 emit_pack_half_2x16(result_dst, op[0]);
1767 break;
1768 case ir_unop_unpack_half_2x16:
1769 emit_unpack_half_2x16(result_dst, op[0]);
1770 break;
1771 case ir_unop_pack_snorm_2x16:
1772 case ir_unop_pack_snorm_4x8:
1773 case ir_unop_pack_unorm_2x16:
1774 case ir_unop_pack_unorm_4x8:
1775 case ir_unop_unpack_snorm_2x16:
1776 case ir_unop_unpack_snorm_4x8:
1777 case ir_unop_unpack_unorm_2x16:
1778 case ir_unop_unpack_unorm_4x8:
1779 assert(!"not reached: should be handled by lower_packing_builtins");
1780 break;
1781 case ir_unop_unpack_half_2x16_split_x:
1782 case ir_unop_unpack_half_2x16_split_y:
1783 case ir_binop_pack_half_2x16_split:
1784 assert(!"not reached: should not occur in vertex shader");
1785 break;
1786 case ir_binop_ldexp:
1787 assert(!"not reached: should be handled by ldexp_to_arith()");
1788 break;
1789 }
1790 }
1791
1792
1793 void
1794 vec4_visitor::visit(ir_swizzle *ir)
1795 {
1796 src_reg src;
1797 int i = 0;
1798 int swizzle[4];
1799
1800 /* Note that this is only swizzles in expressions, not those on the left
1801 * hand side of an assignment, which do write masking. See ir_assignment
1802 * for that.
1803 */
1804
1805 ir->val->accept(this);
1806 src = this->result;
1807 assert(src.file != BAD_FILE);
1808
1809 for (i = 0; i < ir->type->vector_elements; i++) {
1810 switch (i) {
1811 case 0:
1812 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1813 break;
1814 case 1:
1815 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1816 break;
1817 case 2:
1818 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1819 break;
1820 case 3:
1821 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1822 break;
1823 }
1824 }
1825 for (; i < 4; i++) {
1826 /* Replicate the last channel out. */
1827 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1828 }
1829
1830 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1831
1832 this->result = src;
1833 }
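/* Illustrative example (hypothetical values): if the value being swizzled was
 * already read with swizzle .wzyx and this ir_swizzle selects .xy, the loop
 * composes them into (W, Z) and the tail loop replicates the last channel,
 * producing the equivalent of a .wzzz swizzle on the resulting src_reg.
 */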
1834
1835 void
1836 vec4_visitor::visit(ir_dereference_variable *ir)
1837 {
1838 const struct glsl_type *type = ir->type;
1839 dst_reg *reg = variable_storage(ir->var);
1840
1841 if (!reg) {
1842 fail("Failed to find variable storage for %s\n", ir->var->name);
1843 this->result = src_reg(brw_null_reg());
1844 return;
1845 }
1846
1847 this->result = src_reg(*reg);
1848
1849 /* System values get their swizzle from the dst_reg writemask */
1850 if (ir->var->data.mode == ir_var_system_value)
1851 return;
1852
1853 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1854 this->result.swizzle = swizzle_for_size(type->vector_elements);
1855 }
1856
1857
1858 int
1859 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1860 {
1861 /* Under normal circumstances array elements are stored consecutively, so
1862 * the stride is equal to the size of the array element.
1863 */
1864 return type_size(ir->type);
1865 }
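/* Illustrative examples (not exhaustive): dereferencing a float[8] or vec4[8]
 * yields a stride of 1 register per element, while mat4[2] yields a stride of
 * 4, so a variable index into the latter is multiplied by 4 in the MUL below
 * before being used as a relative address.
 */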
1866
1867
1868 void
1869 vec4_visitor::visit(ir_dereference_array *ir)
1870 {
1871 ir_constant *constant_index;
1872 src_reg src;
1873 int array_stride = compute_array_stride(ir);
1874
1875 constant_index = ir->array_index->constant_expression_value();
1876
1877 ir->array->accept(this);
1878 src = this->result;
1879
1880 if (constant_index) {
1881 src.reg_offset += constant_index->value.i[0] * array_stride;
1882 } else {
1883 /* Variable index array dereference. It eats the "vec4" of the
1884 * base of the array and an index that offsets the Mesa register
1885 * index.
1886 */
1887 ir->array_index->accept(this);
1888
1889 src_reg index_reg;
1890
1891 if (array_stride == 1) {
1892 index_reg = this->result;
1893 } else {
1894 index_reg = src_reg(this, glsl_type::int_type);
1895
1896 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1897 }
1898
1899 if (src.reladdr) {
1900 src_reg temp = src_reg(this, glsl_type::int_type);
1901
1902 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1903
1904 index_reg = temp;
1905 }
1906
1907 src.reladdr = ralloc(mem_ctx, src_reg);
1908 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1909 }
1910
1911 /* If the type is smaller than a vec4, replicate the last channel out. */
1912 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1913 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1914 else
1915 src.swizzle = BRW_SWIZZLE_NOOP;
1916 src.type = brw_type_for_base_type(ir->type);
1917
1918 this->result = src;
1919 }
1920
1921 void
1922 vec4_visitor::visit(ir_dereference_record *ir)
1923 {
1924 unsigned int i;
1925 const glsl_type *struct_type = ir->record->type;
1926 int offset = 0;
1927
1928 ir->record->accept(this);
1929
1930 for (i = 0; i < struct_type->length; i++) {
1931 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1932 break;
1933 offset += type_size(struct_type->fields.structure[i].type);
1934 }
1935
1936 /* If the type is smaller than a vec4, replicate the last channel out. */
1937 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1938 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1939 else
1940 this->result.swizzle = BRW_SWIZZLE_NOOP;
1941 this->result.type = brw_type_for_base_type(ir->type);
1942
1943 this->result.reg_offset += offset;
1944 }
1945
1946 /**
1947 * We want to be careful in assignment setup to hit the actual storage
1948 * instead of potentially using a temporary like we might with the
1949 * ir_dereference handler.
1950 */
1951 static dst_reg
1952 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1953 {
1954 /* The LHS must be a dereference. If the LHS is a variable-indexed array
1955 * access of a vector, it must be separated into a series of conditional moves
1956 * before reaching this point (see ir_vec_index_to_cond_assign).
1957 */
1958 assert(ir->as_dereference());
1959 ir_dereference_array *deref_array = ir->as_dereference_array();
1960 if (deref_array) {
1961 assert(!deref_array->array->type->is_vector());
1962 }
1963
1964 /* Use the rvalue deref handler for the most part. We'll ignore
1965 * swizzles in it and write swizzles using writemask, though.
1966 */
1967 ir->accept(v);
1968 return dst_reg(v->result);
1969 }
1970
1971 void
1972 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1973 const struct glsl_type *type, uint32_t predicate)
1974 {
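/* Sketch of the recursion (hypothetical source type): copying a
 * `struct { vec4 a; float b[2]; }` decomposes into three scalar/vector
 * MOVs, one per leaf, each advancing dst->reg_offset and src->reg_offset
 * by one register.
 */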
1975 if (type->base_type == GLSL_TYPE_STRUCT) {
1976 for (unsigned int i = 0; i < type->length; i++) {
1977 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1978 }
1979 return;
1980 }
1981
1982 if (type->is_array()) {
1983 for (unsigned int i = 0; i < type->length; i++) {
1984 emit_block_move(dst, src, type->fields.array, predicate);
1985 }
1986 return;
1987 }
1988
1989 if (type->is_matrix()) {
1990 const struct glsl_type *vec_type;
1991
1992 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1993 type->vector_elements, 1);
1994
1995 for (int i = 0; i < type->matrix_columns; i++) {
1996 emit_block_move(dst, src, vec_type, predicate);
1997 }
1998 return;
1999 }
2000
2001 assert(type->is_scalar() || type->is_vector());
2002
2003 dst->type = brw_type_for_base_type(type);
2004 src->type = dst->type;
2005
2006 dst->writemask = (1 << type->vector_elements) - 1;
2007
2008 src->swizzle = swizzle_for_size(type->vector_elements);
2009
2010 vec4_instruction *inst = emit(MOV(*dst, *src));
2011 inst->predicate = predicate;
2012
2013 dst->reg_offset++;
2014 src->reg_offset++;
2015 }
2016
2017
2018 /* If the RHS processing resulted in an instruction generating a
2019 * temporary value, and it would be easy to rewrite the instruction to
2020 * generate its result right into the LHS instead, do so. This ends
2021 * up reliably removing instructions where it can be tricky to do so
2022 * later without real UD chain information.
2023 */
2024 bool
2025 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2026 dst_reg dst,
2027 src_reg src,
2028 vec4_instruction *pre_rhs_inst,
2029 vec4_instruction *last_rhs_inst)
2030 {
2031 /* This could be supported, but it would take more smarts. */
2032 if (ir->condition)
2033 return false;
2034
2035 if (pre_rhs_inst == last_rhs_inst)
2036 return false; /* No instructions generated to work with. */
2037
2038 /* Make sure the last instruction generated our source reg. */
2039 if (src.file != GRF ||
2040 src.file != last_rhs_inst->dst.file ||
2041 src.reg != last_rhs_inst->dst.reg ||
2042 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2043 src.reladdr ||
2044 src.abs ||
2045 src.negate ||
2046 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2047 return false;
2048
2049 /* Check that the last instruction fully initialized the channels
2050 * we want to use, in the order we want to use them. We could
2051 * potentially reswizzle the operands of many instructions so that
2052 * we could handle out-of-order channels, but don't yet.
2053 */
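/* For example, if the RHS ended in an ADD into a vec2 temporary and we
 * are assigning it to v.xy, both checks pass and the ADD's destination is
 * simply rewritten to v below; if the channels were needed in a different
 * order, the BRW_GET_SWZ(src.swizzle, i) != i test keeps the extra MOV.
 */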
2054
2055 for (unsigned i = 0; i < 4; i++) {
2056 if (dst.writemask & (1 << i)) {
2057 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2058 return false;
2059
2060 if (BRW_GET_SWZ(src.swizzle, i) != i)
2061 return false;
2062 }
2063 }
2064
2065 /* Success! Rewrite the instruction. */
2066 last_rhs_inst->dst.file = dst.file;
2067 last_rhs_inst->dst.reg = dst.reg;
2068 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2069 last_rhs_inst->dst.reladdr = dst.reladdr;
2070 last_rhs_inst->dst.writemask &= dst.writemask;
2071
2072 return true;
2073 }
2074
2075 void
2076 vec4_visitor::visit(ir_assignment *ir)
2077 {
2078 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2079 uint32_t predicate = BRW_PREDICATE_NONE;
2080
2081 if (!ir->lhs->type->is_scalar() &&
2082 !ir->lhs->type->is_vector()) {
2083 ir->rhs->accept(this);
2084 src_reg src = this->result;
2085
2086 if (ir->condition) {
2087 emit_bool_to_cond_code(ir->condition, &predicate);
2088 }
2089
2090 /* emit_block_move doesn't account for swizzles in the source register.
2091 * This should be ok, since the source register is a structure or an
2092 * array, and those can't be swizzled. But double-check to be sure.
2093 */
2094 assert(src.swizzle ==
2095 (ir->rhs->type->is_matrix()
2096 ? swizzle_for_size(ir->rhs->type->vector_elements)
2097 : BRW_SWIZZLE_NOOP));
2098
2099 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2100 return;
2101 }
2102
2103 /* Now we're down to just a scalar/vector with writemasks. */
2104 int i;
2105
2106 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2107 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2108
2109 ir->rhs->accept(this);
2110
2111 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2112
2113 src_reg src = this->result;
2114
2115 int swizzles[4];
2116 int first_enabled_chan = 0;
2117 int src_chan = 0;
2118
2119 assert(ir->lhs->type->is_vector() ||
2120 ir->lhs->type->is_scalar());
2121 dst.writemask = ir->write_mask;
2122
2123 for (int i = 0; i < 4; i++) {
2124 if (dst.writemask & (1 << i)) {
2125 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2126 break;
2127 }
2128 }
2129
2130 /* Swizzle a small RHS vector into the channels being written.
2131 *
2132 * GLSL IR treats write_mask as dictating how many channels are
2133 * present on the RHS, while in our instructions we need to make
2134 * those channels appear in the slots of the vec4 they're written to.
2135 */
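/* Illustrative case: write_mask = .yw with a two-component RHS whose
 * swizzle is .xyyy yields swizzles[] = {y, x, y, y}, so the MOV below
 * reads src.x into dst.y and src.y into dst.w; the unwritten channels
 * simply replicate the first enabled one.
 */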
2136 for (int i = 0; i < 4; i++) {
2137 if (dst.writemask & (1 << i))
2138 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2139 else
2140 swizzles[i] = first_enabled_chan;
2141 }
2142 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2143 swizzles[2], swizzles[3]);
2144
2145 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2146 return;
2147 }
2148
2149 if (ir->condition) {
2150 emit_bool_to_cond_code(ir->condition, &predicate);
2151 }
2152
2153 for (i = 0; i < type_size(ir->lhs->type); i++) {
2154 vec4_instruction *inst = emit(MOV(dst, src));
2155 inst->predicate = predicate;
2156
2157 dst.reg_offset++;
2158 src.reg_offset++;
2159 }
2160 }
2161
2162 void
2163 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2164 {
2165 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2166 foreach_list(node, &ir->components) {
2167 ir_constant *field_value = (ir_constant *)node;
2168
2169 emit_constant_values(dst, field_value);
2170 }
2171 return;
2172 }
2173
2174 if (ir->type->is_array()) {
2175 for (unsigned int i = 0; i < ir->type->length; i++) {
2176 emit_constant_values(dst, ir->array_elements[i]);
2177 }
2178 return;
2179 }
2180
2181 if (ir->type->is_matrix()) {
2182 for (int i = 0; i < ir->type->matrix_columns; i++) {
2183 float *vec = &ir->value.f[i * ir->type->vector_elements];
2184
2185 for (int j = 0; j < ir->type->vector_elements; j++) {
2186 dst->writemask = 1 << j;
2187 dst->type = BRW_REGISTER_TYPE_F;
2188
2189 emit(MOV(*dst, src_reg(vec[j])));
2190 }
2191 dst->reg_offset++;
2192 }
2193 return;
2194 }
2195
2196 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2197
2198 for (int i = 0; i < ir->type->vector_elements; i++) {
2199 if (!(remaining_writemask & (1 << i)))
2200 continue;
2201
2202 dst->writemask = 1 << i;
2203 dst->type = brw_type_for_base_type(ir->type);
2204
2205 /* Find other components that match the one we're about to
2206 * write. Emits fewer instructions for things like vec4(0.5,
2207 * 1.5, 1.5, 1.5).
2208 */
2209 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2210 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2211 if (ir->value.b[i] == ir->value.b[j])
2212 dst->writemask |= (1 << j);
2213 } else {
2214 /* u, i, and f storage all line up, so no need for a
2215 * switch case for comparing each type.
2216 */
2217 if (ir->value.u[i] == ir->value.u[j])
2218 dst->writemask |= (1 << j);
2219 }
2220 }
2221
2222 switch (ir->type->base_type) {
2223 case GLSL_TYPE_FLOAT:
2224 emit(MOV(*dst, src_reg(ir->value.f[i])));
2225 break;
2226 case GLSL_TYPE_INT:
2227 emit(MOV(*dst, src_reg(ir->value.i[i])));
2228 break;
2229 case GLSL_TYPE_UINT:
2230 emit(MOV(*dst, src_reg(ir->value.u[i])));
2231 break;
2232 case GLSL_TYPE_BOOL:
2233 emit(MOV(*dst, src_reg(ir->value.b[i])));
2234 break;
2235 default:
2236 assert(!"Non-float/uint/int/bool constant");
2237 break;
2238 }
2239
2240 remaining_writemask &= ~dst->writemask;
2241 }
2242 dst->reg_offset++;
2243 }
2244
2245 void
2246 vec4_visitor::visit(ir_constant *ir)
2247 {
2248 dst_reg dst = dst_reg(this, ir->type);
2249 this->result = src_reg(dst);
2250
2251 emit_constant_values(&dst, ir);
2252 }
2253
2254 void
2255 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2256 {
2257 ir_dereference *deref = static_cast<ir_dereference *>(
2258 ir->actual_parameters.get_head());
2259 ir_variable *location = deref->variable_referenced();
2260 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2261 location->data.atomic.buffer_index);
2262
2263 /* Calculate the surface offset */
2264 src_reg offset(this, glsl_type::uint_type);
2265 ir_dereference_array *deref_array = deref->as_dereference_array();
2266 if (deref_array) {
2267 deref_array->array_index->accept(this);
2268
2269 src_reg tmp(this, glsl_type::uint_type);
2270 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2271 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2272 } else {
2273 offset = location->data.atomic.offset;
2274 }
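/* For a hypothetical `atomicCounterIncrement(counters[i])`, the offset
 * register now holds i * ATOMIC_COUNTER_SIZE plus the declared offset of
 * the array within its buffer, and surf_index selects that buffer's
 * binding table entry.
 */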
2275
2276 /* Emit the appropriate machine instruction */
2277 const char *callee = ir->callee->function_name();
2278 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2279
2280 if (!strcmp("__intrinsic_atomic_read", callee)) {
2281 emit_untyped_surface_read(surf_index, dst, offset);
2282
2283 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2284 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2285 src_reg(), src_reg());
2286
2287 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2288 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2289 src_reg(), src_reg());
2290 }
2291 }
2292
2293 void
2294 vec4_visitor::visit(ir_call *ir)
2295 {
2296 const char *callee = ir->callee->function_name();
2297
2298 if (!strcmp("__intrinsic_atomic_read", callee) ||
2299 !strcmp("__intrinsic_atomic_increment", callee) ||
2300 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2301 visit_atomic_counter_intrinsic(ir);
2302 } else {
2303 assert(!"Unsupported intrinsic.");
2304 }
2305 }
2306
2307 src_reg
2308 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2309 {
2310 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2311 inst->base_mrf = 2;
2312 inst->mlen = 1;
2313 inst->sampler = sampler;
2314 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2315 inst->dst.writemask = WRITEMASK_XYZW;
2316
2317 /* The parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2318 int param_base = inst->base_mrf;
2319 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2320 int zero_mask = 0xf & ~coord_mask;
2321
2322 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2323 coordinate));
2324
2325 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2326 src_reg(0)));
2327
2328 emit(inst);
2329 return src_reg(inst->dst);
2330 }
2331
2332 void
2333 vec4_visitor::visit(ir_texture *ir)
2334 {
2335 int sampler =
2336 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2337
2338 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2339 * emitting anything other than setting up the constant result.
2340 */
2341 if (ir->op == ir_tg4) {
2342 ir_constant *chan = ir->lod_info.component->as_constant();
2343 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2344 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2345 dst_reg result(this, ir->type);
2346 this->result = src_reg(result);
2347 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2348 return;
2349 }
2350 }
2351
2352 /* Should be lowered by do_lower_texture_projection */
2353 assert(!ir->projector);
2354
2355 /* Should be lowered */
2356 assert(!ir->offset || !ir->offset->type->is_array());
2357
2358 /* Generate code to compute all the subexpression trees. This has to be
2359 * done before loading any values into MRFs for the sampler message since
2360 * generating these values may involve SEND messages that need the MRFs.
2361 */
2362 src_reg coordinate;
2363 if (ir->coordinate) {
2364 ir->coordinate->accept(this);
2365 coordinate = this->result;
2366 }
2367
2368 src_reg shadow_comparitor;
2369 if (ir->shadow_comparitor) {
2370 ir->shadow_comparitor->accept(this);
2371 shadow_comparitor = this->result;
2372 }
2373
2374 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2375 src_reg offset_value;
2376 if (has_nonconstant_offset) {
2377 ir->offset->accept(this);
2378 offset_value = src_reg(this->result);
2379 }
2380
2381 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2382 src_reg lod, dPdx, dPdy, sample_index, mcs;
2383 switch (ir->op) {
2384 case ir_tex:
2385 lod = src_reg(0.0f);
2386 lod_type = glsl_type::float_type;
2387 break;
2388 case ir_txf:
2389 case ir_txl:
2390 case ir_txs:
2391 ir->lod_info.lod->accept(this);
2392 lod = this->result;
2393 lod_type = ir->lod_info.lod->type;
2394 break;
2395 case ir_query_levels:
2396 lod = src_reg(0);
2397 lod_type = glsl_type::int_type;
2398 break;
2399 case ir_txf_ms:
2400 ir->lod_info.sample_index->accept(this);
2401 sample_index = this->result;
2402 sample_index_type = ir->lod_info.sample_index->type;
2403
2404 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2405 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2406 else
2407 mcs = src_reg(0u);
2408 break;
2409 case ir_txd:
2410 ir->lod_info.grad.dPdx->accept(this);
2411 dPdx = this->result;
2412
2413 ir->lod_info.grad.dPdy->accept(this);
2414 dPdy = this->result;
2415
2416 lod_type = ir->lod_info.grad.dPdx->type;
2417 break;
2418 case ir_txb:
2419 case ir_lod:
2420 case ir_tg4:
2421 break;
2422 }
2423
2424 vec4_instruction *inst = NULL;
2425 switch (ir->op) {
2426 case ir_tex:
2427 case ir_txl:
2428 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2429 break;
2430 case ir_txd:
2431 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2432 break;
2433 case ir_txf:
2434 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2435 break;
2436 case ir_txf_ms:
2437 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2438 break;
2439 case ir_txs:
2440 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2441 break;
2442 case ir_tg4:
2443 if (has_nonconstant_offset)
2444 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2445 else
2446 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2447 break;
2448 case ir_query_levels:
2449 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2450 break;
2451 case ir_txb:
2452 assert(!"TXB is not valid for vertex shaders.");
2453 break;
2454 case ir_lod:
2455 assert(!"LOD is not valid for vertex shaders.");
2456 break;
2457 default:
2458 assert(!"Unrecognized tex op");
2459 }
2460
2461 if (ir->offset != NULL && ir->op != ir_txf)
2462 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2463
2464 /* Stuff the channel select bits in the top of the texture offset */
2465 if (ir->op == ir_tg4)
2466 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2467
2468 /* The message header is necessary for:
2469 * - Gen4 (always)
2470 * - Texel offsets
2471 * - Gather channel selection
2472 * - Sampler indices too large to fit in a 4-bit value.
2473 */
2474 inst->header_present =
2475 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2476 sampler >= 16;
2477 inst->base_mrf = 2;
2478 inst->mlen = inst->header_present + 1; /* always at least one */
2479 inst->sampler = sampler;
2480 inst->dst = dst_reg(this, ir->type);
2481 inst->dst.writemask = WRITEMASK_XYZW;
2482 inst->shadow_compare = ir->shadow_comparitor != NULL;
2483
2484 /* MRF for the first parameter */
2485 int param_base = inst->base_mrf + inst->header_present;
2486
2487 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2488 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2489 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2490 } else {
2491 /* Load the coordinate */
2492 /* FINISHME: gl_clamp_mask and saturate */
2493 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2494 int zero_mask = 0xf & ~coord_mask;
2495
2496 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2497 coordinate));
2498
2499 if (zero_mask != 0) {
2500 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2501 src_reg(0)));
2502 }
2503 /* Load the shadow comparitor */
2504 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2505 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2506 WRITEMASK_X),
2507 shadow_comparitor));
2508 inst->mlen++;
2509 }
2510
2511 /* Load the LOD info */
2512 if (ir->op == ir_tex || ir->op == ir_txl) {
2513 int mrf, writemask;
2514 if (brw->gen >= 5) {
2515 mrf = param_base + 1;
2516 if (ir->shadow_comparitor) {
2517 writemask = WRITEMASK_Y;
2518 /* mlen already incremented */
2519 } else {
2520 writemask = WRITEMASK_X;
2521 inst->mlen++;
2522 }
2523 } else /* brw->gen == 4 */ {
2524 mrf = param_base;
2525 writemask = WRITEMASK_W;
2526 }
2527 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2528 } else if (ir->op == ir_txf) {
2529 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2530 } else if (ir->op == ir_txf_ms) {
2531 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2532 sample_index));
2533 if (brw->gen >= 7)
2534 /* MCS data is in the first channel of `mcs`, but we need to get it into
2535 * the .y channel of the second vec4 of params, so replicate .x across
2536 * the whole vec4 and then mask off everything except .y
2537 */
2538 mcs.swizzle = BRW_SWIZZLE_XXXX;
2539 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2540 mcs));
2541 inst->mlen++;
2542 } else if (ir->op == ir_txd) {
2543 const glsl_type *type = lod_type;
2544
2545 if (brw->gen >= 5) {
2546 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2547 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2548 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2549 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2550 inst->mlen++;
2551
2552 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2553 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2554 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2555 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2556 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2557 inst->mlen++;
2558
2559 if (ir->shadow_comparitor) {
2560 emit(MOV(dst_reg(MRF, param_base + 2,
2561 ir->shadow_comparitor->type, WRITEMASK_Z),
2562 shadow_comparitor));
2563 }
2564 }
2565 } else /* brw->gen == 4 */ {
2566 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2567 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2568 inst->mlen += 2;
2569 }
2570 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2571 if (ir->shadow_comparitor) {
2572 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2573 shadow_comparitor));
2574 }
2575
2576 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2577 offset_value));
2578 inst->mlen++;
2579 }
2580 }
2581
2582 emit(inst);
2583
2584 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2585 * faces * layers, but the spec requires just layers.
2586 */
2587 if (ir->op == ir_txs) {
2588 glsl_type const *type = ir->sampler->type;
2589 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2590 type->sampler_array) {
2591 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2592 writemask(inst->dst, WRITEMASK_Z),
2593 src_reg(inst->dst), src_reg(6));
2594 }
2595 }
2596
2597 if (brw->gen == 6 && ir->op == ir_tg4) {
2598 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2599 }
2600
2601 swizzle_result(ir, src_reg(inst->dst), sampler);
2602 }
2603
2604 /**
2605 * Apply workarounds for Gen6 gather with UINT/SINT
2606 */
2607 void
2608 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2609 {
2610 if (!wa)
2611 return;
2612
2613 int width = (wa & WA_8BIT) ? 8 : 16;
2614 dst_reg dst_f = dst;
2615 dst_f.type = BRW_REGISTER_TYPE_F;
2616
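/* Illustrative math, assuming WA_8BIT | WA_SIGN: the sampler returned the
 * stored byte v as the UNORM value v / 255.0f; the MUL/MOV below recover
 * the integer v, and the SHL/ASR pair then sign-extends it from 8 bits.
 */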
2617 /* Convert from UNORM to UINT */
2618 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2619 emit(MOV(dst, src_reg(dst_f)));
2620
2621 if (wa & WA_SIGN) {
2622 /* Reinterpret the UINT value as a signed INT value by
2623 * shifting the sign bit into place, then shifting back
2624 * preserving sign.
2625 */
2626 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2627 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2628 }
2629 }
2630
2631 /**
2632 * Set up the gather channel based on the swizzle, for gather4.
2633 */
2634 uint32_t
2635 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2636 {
2637 ir_constant *chan = ir->lod_info.component->as_constant();
2638 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2639 switch (swiz) {
2640 case SWIZZLE_X: return 0;
2641 case SWIZZLE_Y:
2642 /* gather4 sampler is broken for green channel on RG32F --
2643 * we must ask for blue instead.
2644 */
2645 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2646 return 2;
2647 return 1;
2648 case SWIZZLE_Z: return 2;
2649 case SWIZZLE_W: return 3;
2650 default:
2651 assert(!"Not reached"); /* zero, one swizzles handled already */
2652 return 0;
2653 }
2654 }
2655
2656 void
2657 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2658 {
2659 int s = key->tex.swizzles[sampler];
2660
2661 this->result = src_reg(this, ir->type);
2662 dst_reg swizzled_result(this->result);
2663
2664 if (ir->op == ir_query_levels) {
2665 /* # levels is in .w */
2666 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2667 emit(MOV(swizzled_result, orig_val));
2668 return;
2669 }
2670
2671 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2672 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2673 emit(MOV(swizzled_result, orig_val));
2674 return;
2675 }
2676
2677
2678 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2679 int swizzle[4] = {0};
2680
2681 for (int i = 0; i < 4; i++) {
2682 switch (GET_SWZ(s, i)) {
2683 case SWIZZLE_ZERO:
2684 zero_mask |= (1 << i);
2685 break;
2686 case SWIZZLE_ONE:
2687 one_mask |= (1 << i);
2688 break;
2689 default:
2690 copy_mask |= (1 << i);
2691 swizzle[i] = GET_SWZ(s, i);
2692 break;
2693 }
2694 }
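/* For example, a texture swizzle of (R, G, B, ONE) gives copy_mask = .xyz
 * and one_mask = .w, so we emit one MOV that copies the first three
 * channels and a second MOV that writes 1.0f into .w.
 */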
2695
2696 if (copy_mask) {
2697 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2698 swizzled_result.writemask = copy_mask;
2699 emit(MOV(swizzled_result, orig_val));
2700 }
2701
2702 if (zero_mask) {
2703 swizzled_result.writemask = zero_mask;
2704 emit(MOV(swizzled_result, src_reg(0.0f)));
2705 }
2706
2707 if (one_mask) {
2708 swizzled_result.writemask = one_mask;
2709 emit(MOV(swizzled_result, src_reg(1.0f)));
2710 }
2711 }
2712
2713 void
2714 vec4_visitor::visit(ir_return *)
2715 {
2716 assert(!"not reached");
2717 }
2718
2719 void
2720 vec4_visitor::visit(ir_discard *)
2721 {
2722 assert(!"not reached");
2723 }
2724
2725 void
2726 vec4_visitor::visit(ir_if *ir)
2727 {
2728 /* Don't point the annotation at the if statement itself, because then the
2729 * whole statement, including the then and else blocks, gets printed.
2730 */
2731 this->base_ir = ir->condition;
2732
2733 if (brw->gen == 6) {
2734 emit_if_gen6(ir);
2735 } else {
2736 uint32_t predicate;
2737 emit_bool_to_cond_code(ir->condition, &predicate);
2738 emit(IF(predicate));
2739 }
2740
2741 visit_instructions(&ir->then_instructions);
2742
2743 if (!ir->else_instructions.is_empty()) {
2744 this->base_ir = ir->condition;
2745 emit(BRW_OPCODE_ELSE);
2746
2747 visit_instructions(&ir->else_instructions);
2748 }
2749
2750 this->base_ir = ir->condition;
2751 emit(BRW_OPCODE_ENDIF);
2752 }
2753
2754 void
2755 vec4_visitor::visit(ir_emit_vertex *)
2756 {
2757 assert(!"not reached");
2758 }
2759
2760 void
2761 vec4_visitor::visit(ir_end_primitive *)
2762 {
2763 assert(!"not reached");
2764 }
2765
2766 void
2767 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2768 dst_reg dst, src_reg offset,
2769 src_reg src0, src_reg src1)
2770 {
2771 unsigned mlen = 0;
2772
2773 /* Set the atomic operation offset. */
2774 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2775 mlen++;
2776
2777 /* Set the atomic operation arguments. */
2778 if (src0.file != BAD_FILE) {
2779 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2780 mlen++;
2781 }
2782
2783 if (src1.file != BAD_FILE) {
2784 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2785 mlen++;
2786 }
2787
2788 /* Emit the instruction. Note that this maps to the normal SIMD8
2789 * untyped atomic message on Ivy Bridge, but that's OK because
2790 * unused channels will be masked out.
2791 */
2792 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2793 src_reg(atomic_op), src_reg(surf_index));
2794 inst->base_mrf = 0;
2795 inst->mlen = mlen;
2796 }
2797
2798 void
2799 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2800 src_reg offset)
2801 {
2802 /* Set the surface read offset. */
2803 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2804
2805 /* Emit the instruction. Note that this maps to the normal SIMD8
2806 * untyped surface read message, but that's OK because unused
2807 * channels will be masked out.
2808 */
2809 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2810 dst, src_reg(surf_index));
2811 inst->base_mrf = 0;
2812 inst->mlen = 1;
2813 }
2814
2815 void
2816 vec4_visitor::emit_ndc_computation()
2817 {
2818 /* Get the position */
2819 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2820
2821 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2822 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2823 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2824
2825 current_annotation = "NDC";
2826 dst_reg ndc_w = ndc;
2827 ndc_w.writemask = WRITEMASK_W;
2828 src_reg pos_w = pos;
2829 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2830 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2831
2832 dst_reg ndc_xyz = ndc;
2833 ndc_xyz.writemask = WRITEMASK_XYZ;
2834
2835 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2836 }
2837
2838 void
2839 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2840 {
2841 if (brw->gen < 6 &&
2842 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2843 key->userclip_active || brw->has_negative_rhw_bug)) {
2844 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2845 dst_reg header1_w = header1;
2846 header1_w.writemask = WRITEMASK_W;
2847
2848 emit(MOV(header1, 0u));
2849
2850 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2851 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2852
2853 current_annotation = "Point size";
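/* Illustrative arithmetic: a hypothetical gl_PointSize of 4.0f is scaled
 * to 4.0 * 2048 = 0x2000, and the AND below keeps bits 8..18, i.e. the
 * point width as a fixed-point value with three fractional bits.
 */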
2854 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2855 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2856 }
2857
2858 if (key->userclip_active) {
2859 current_annotation = "Clipping flags";
2860 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2861 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2862
2863 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2864 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2865 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2866
2867 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2868 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2869 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2870 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2871 }
2872
2873 /* i965 clipping workaround:
2874 * 1) Test for -ve rhw
2875 * 2) If set,
2876 * set ndc = (0,0,0,0)
2877 * set ucp[6] = 1
2878 *
2879 * Later, clipping will detect ucp[6] and ensure the primitive is
2880 * clipped against all fixed planes.
2881 */
2882 if (brw->has_negative_rhw_bug) {
2883 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2884 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2885 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2886 vec4_instruction *inst;
2887 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2888 inst->predicate = BRW_PREDICATE_NORMAL;
2889 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2890 inst->predicate = BRW_PREDICATE_NORMAL;
2891 }
2892
2893 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2894 } else if (brw->gen < 6) {
2895 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2896 } else {
2897 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2898 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2899 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2900 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2901 }
2902 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2903 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2904 src_reg(output_reg[VARYING_SLOT_LAYER])));
2905 }
2906 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2907 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2908 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2909 }
2910 }
2911 }
2912
2913 void
2914 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2915 {
2916 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2917 *
2918 * "If a linked set of shaders forming the vertex stage contains no
2919 * static write to gl_ClipVertex or gl_ClipDistance, but the
2920 * application has requested clipping against user clip planes through
2921 * the API, then the coordinate written to gl_Position is used for
2922 * comparison against the user clip planes."
2923 *
2924 * This function is only called if the shader didn't write to
2925 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2926 * if the user wrote to it; otherwise we use gl_Position.
2927 */
2928 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2929 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2930 clip_vertex = VARYING_SLOT_POS;
2931 }
2932
2933 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2934 ++i) {
2935 reg.writemask = 1 << i;
2936 emit(DP4(reg,
2937 src_reg(output_reg[clip_vertex]),
2938 src_reg(this->userplane[i + offset])));
2939 }
2940 }
2941
2942 void
2943 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2944 {
2945 assert (varying < VARYING_SLOT_MAX);
2946 reg.type = output_reg[varying].type;
2947 current_annotation = output_reg_annotation[varying];
2948 /* Copy the register, saturating if necessary */
2949 vec4_instruction *inst = emit(MOV(reg,
2950 src_reg(output_reg[varying])));
2951 if ((varying == VARYING_SLOT_COL0 ||
2952 varying == VARYING_SLOT_COL1 ||
2953 varying == VARYING_SLOT_BFC0 ||
2954 varying == VARYING_SLOT_BFC1) &&
2955 key->clamp_vertex_color) {
2956 inst->saturate = true;
2957 }
2958 }
2959
2960 void
2961 vec4_visitor::emit_urb_slot(int mrf, int varying)
2962 {
2963 struct brw_reg hw_reg = brw_message_reg(mrf);
2964 dst_reg reg = dst_reg(MRF, mrf);
2965 reg.type = BRW_REGISTER_TYPE_F;
2966
2967 switch (varying) {
2968 case VARYING_SLOT_PSIZ:
2969 /* PSIZ is always in slot 0, and is coupled with other flags. */
2970 current_annotation = "indices, point width, clip flags";
2971 emit_psiz_and_flags(hw_reg);
2972 break;
2973 case BRW_VARYING_SLOT_NDC:
2974 current_annotation = "NDC";
2975 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2976 break;
2977 case VARYING_SLOT_POS:
2978 current_annotation = "gl_Position";
2979 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2980 break;
2981 case VARYING_SLOT_EDGE:
2982 /* This is present when doing unfilled polygons. We're supposed to copy
2983 * the edge flag from the user-provided vertex array
2984 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2985 * of that attribute (starts as 1.0f). This is then used in clipping to
2986 * determine which edges should be drawn as wireframe.
2987 */
2988 current_annotation = "edge flag";
2989 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2990 glsl_type::float_type, WRITEMASK_XYZW))));
2991 break;
2992 case BRW_VARYING_SLOT_PAD:
2993 /* No need to write to this slot */
2994 break;
2995 default:
2996 emit_generic_urb_slot(reg, varying);
2997 break;
2998 }
2999 }
3000
3001 static int
3002 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3003 {
3004 if (brw->gen >= 6) {
3005 /* URB data written (does not include the message header reg) must
3006 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3007 * section 5.4.3.2.2: URB_INTERLEAVED.
3008 *
3009 * URB entries are allocated on a multiple of 1024 bits, so an
3010 * extra 128 bits written here to make the end align to 256 is
3011 * no problem.
3012 */
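/* Example: a header plus five data registers arrives as mlen = 6; bumping
 * it to 7 pads the written data to six registers, a multiple of two.
 */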
3013 if ((mlen % 2) != 1)
3014 mlen++;
3015 }
3016
3017 return mlen;
3018 }
3019
3020
3021 /**
3022 * Generates the VUE payload plus the necessary URB write instructions to
3023 * output it.
3024 *
3025 * The VUE layout is documented in Volume 2a.
3026 */
3027 void
3028 vec4_visitor::emit_vertex()
3029 {
3030 /* MRF 0 is reserved for the debugger, so start with message header
3031 * in MRF 1.
3032 */
3033 int base_mrf = 1;
3034 int mrf = base_mrf;
3035 /* In the process of generating our URB write message contents, we
3036 * may need to unspill a register or load from an array. Those
3037 * reads would use MRFs 14-15.
3038 */
3039 int max_usable_mrf = 13;
3040
3041 /* The following assertion verifies that max_usable_mrf causes an
3042 * even-numbered amount of URB write data, which will meet gen6's
3043 * requirements for length alignment.
3044 */
3045 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3046
3047 /* First mrf is the g0-based message header containing URB handles and
3048 * such.
3049 */
3050 emit_urb_write_header(mrf++);
3051
3052 if (brw->gen < 6) {
3053 emit_ndc_computation();
3054 }
3055
3056 /* Lower legacy ff and ClipVertex clipping to clip distances */
3057 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3058 current_annotation = "user clip distances";
3059
3060 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3061 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3062
3063 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3064 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3065 }
3066
3067 /* We may need to split this up into several URB writes, so do them in a
3068 * loop.
3069 */
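/* Sketch of the split, assuming a hypothetical VUE map with 20 slots:
 * the first write covers slots 0..11 in MRFs 2..13 at URB offset 0, the
 * second covers slots 12..19 at URB row offset 12 / 2 = 6.
 */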
3070 int slot = 0;
3071 bool complete = false;
3072 do {
3073 /* URB offset is in URB row increments, and each of our MRFs is half of
3074 * one of those, since we're doing interleaved writes.
3075 */
3076 int offset = slot / 2;
3077
3078 mrf = base_mrf + 1;
3079 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3080 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3081
3082 /* If this was max_usable_mrf, we can't fit anything more into this
3083 * URB WRITE.
3084 */
3085 if (mrf > max_usable_mrf) {
3086 slot++;
3087 break;
3088 }
3089 }
3090
3091 complete = slot >= prog_data->vue_map.num_slots;
3092 current_annotation = "URB write";
3093 vec4_instruction *inst = emit_urb_write_opcode(complete);
3094 inst->base_mrf = base_mrf;
3095 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3096 inst->offset += offset;
3097 } while(!complete);
3098 }
3099
3100
3101 src_reg
3102 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3103 src_reg *reladdr, int reg_offset)
3104 {
3105 /* Because we store the values to scratch interleaved like our
3106 * vertex data, we need to scale the vec4 index by 2.
3107 */
3108 int message_header_scale = 2;
3109
3110 /* Pre-gen6, the message header uses byte offsets instead of vec4
3111 * (16-byte) offset units.
3112 */
3113 if (brw->gen < 6)
3114 message_header_scale *= 16;
3115
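/* For example, a reg_offset of 3 yields an offset of 3 * 2 = 6 on gen6+,
 * or 3 * 2 * 16 = 96 bytes on older generations where the header takes
 * byte offsets.
 */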
3116 if (reladdr) {
3117 src_reg index = src_reg(this, glsl_type::int_type);
3118
3119 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3120 emit_before(inst, MUL(dst_reg(index),
3121 index, src_reg(message_header_scale)));
3122
3123 return index;
3124 } else {
3125 return src_reg(reg_offset * message_header_scale);
3126 }
3127 }
3128
3129 src_reg
3130 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3131 src_reg *reladdr, int reg_offset)
3132 {
3133 if (reladdr) {
3134 src_reg index = src_reg(this, glsl_type::int_type);
3135
3136 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3137
3138 /* Pre-gen6, the message header uses byte offsets instead of vec4
3139 * (16-byte) offset units.
3140 */
3141 if (brw->gen < 6) {
3142 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3143 }
3144
3145 return index;
3146 } else if (brw->gen >= 8) {
3147 /* Store the offset in a GRF so we can send-from-GRF. */
3148 src_reg offset = src_reg(this, glsl_type::int_type);
3149 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3150 return offset;
3151 } else {
3152 int message_header_scale = brw->gen < 6 ? 16 : 1;
3153 return src_reg(reg_offset * message_header_scale);
3154 }
3155 }
3156
3157 /**
3158 * Emits an instruction before @inst to load the value named by @orig_src
3159 * from scratch space at @base_offset to @temp.
3160 *
3161 * @base_offset is measured in 32-byte units (the size of a register).
3162 */
3163 void
3164 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3165 dst_reg temp, src_reg orig_src,
3166 int base_offset)
3167 {
3168 int reg_offset = base_offset + orig_src.reg_offset;
3169 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3170
3171 emit_before(inst, SCRATCH_READ(temp, index));
3172 }
3173
3174 /**
3175 * Emits an instruction after @inst to store the value to be written
3176 * to @orig_dst to scratch space at @base_offset, from @temp.
3177 *
3178 * @base_offset is measured in 32-byte units (the size of a register).
3179 */
3180 void
3181 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3182 {
3183 int reg_offset = base_offset + inst->dst.reg_offset;
3184 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3185
3186 /* Create a temporary register to store *inst's result in.
3187 *
3188 * We have to be careful in MOVing from our temporary result register in
3189 * the scratch write. If we swizzle from channels of the temporary that
3190 * weren't initialized, it will confuse live interval analysis, which will
3191 * make spilling fail to make progress.
3192 */
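/* Illustrative case: for a dst writemask of .xz, first_writemask_chan is
 * x and the swizzle built below is .xxzx, so the scratch write never
 * reads the uninitialized .y/.w channels of the temporary.
 */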
3193 src_reg temp = src_reg(this, glsl_type::vec4_type);
3194 temp.type = inst->dst.type;
3195 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3196 int swizzles[4];
3197 for (int i = 0; i < 4; i++)
3198 if (inst->dst.writemask & (1 << i))
3199 swizzles[i] = i;
3200 else
3201 swizzles[i] = first_writemask_chan;
3202 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3203 swizzles[2], swizzles[3]);
3204
3205 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3206 inst->dst.writemask));
3207 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3208 write->predicate = inst->predicate;
3209 write->ir = inst->ir;
3210 write->annotation = inst->annotation;
3211 inst->insert_after(write);
3212
3213 inst->dst.file = temp.file;
3214 inst->dst.reg = temp.reg;
3215 inst->dst.reg_offset = temp.reg_offset;
3216 inst->dst.reladdr = NULL;
3217 }
3218
3219 /**
3220 * We can't generally support array access in GRF space, because a
3221 * single instruction's destination can only span 2 contiguous
3222 * registers. So, we send all GRF arrays that get variable-index
3223 * access to scratch space.
3224 */
3225 void
3226 vec4_visitor::move_grf_array_access_to_scratch()
3227 {
3228 int scratch_loc[this->virtual_grf_count];
3229
3230 for (int i = 0; i < this->virtual_grf_count; i++) {
3231 scratch_loc[i] = -1;
3232 }
3233
3234 /* First, calculate the set of virtual GRFs that need to be punted
3235 * to scratch due to having any array access on them, and where in
3236 * scratch.
3237 */
3238 foreach_list(node, &this->instructions) {
3239 vec4_instruction *inst = (vec4_instruction *)node;
3240
3241 if (inst->dst.file == GRF && inst->dst.reladdr &&
3242 scratch_loc[inst->dst.reg] == -1) {
3243 scratch_loc[inst->dst.reg] = c->last_scratch;
3244 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3245 }
3246
3247 for (int i = 0 ; i < 3; i++) {
3248 src_reg *src = &inst->src[i];
3249
3250 if (src->file == GRF && src->reladdr &&
3251 scratch_loc[src->reg] == -1) {
3252 scratch_loc[src->reg] = c->last_scratch;
3253 c->last_scratch += this->virtual_grf_sizes[src->reg];
3254 }
3255 }
3256 }
3257
3258 /* Now, for anything that will be accessed through scratch, rewrite
3259 * it to load/store. Note that this is a _safe list walk, because
3260 * we may generate a new scratch_write instruction after the one
3261 * we're processing.
3262 */
3263 foreach_list_safe(node, &this->instructions) {
3264 vec4_instruction *inst = (vec4_instruction *)node;
3265
3266 /* Set up the annotation tracking for new generated instructions. */
3267 base_ir = inst->ir;
3268 current_annotation = inst->annotation;
3269
3270 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3271 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3272 }
3273
3274 for (int i = 0 ; i < 3; i++) {
3275 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3276 continue;
3277
3278 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3279
3280 emit_scratch_read(inst, temp, inst->src[i],
3281 scratch_loc[inst->src[i].reg]);
3282
3283 inst->src[i].file = temp.file;
3284 inst->src[i].reg = temp.reg;
3285 inst->src[i].reg_offset = temp.reg_offset;
3286 inst->src[i].reladdr = NULL;
3287 }
3288 }
3289 }
3290
3291 /**
3292 * Emits an instruction before @inst to load the value named by @orig_src
3293 * from the pull constant buffer (surface) at @base_offset to @temp.
3294 */
3295 void
3296 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3297 dst_reg temp, src_reg orig_src,
3298 int base_offset)
3299 {
3300 int reg_offset = base_offset + orig_src.reg_offset;
3301 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3302 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3303 vec4_instruction *load;
3304
3305 if (brw->gen >= 7) {
3306 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3307 grf_offset.type = offset.type;
3308 emit_before(inst, MOV(grf_offset, offset));
3309
3310 load = new(mem_ctx) vec4_instruction(this,
3311 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3312 temp, index, src_reg(grf_offset));
3313 } else {
3314 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3315 temp, index, offset);
3316 load->base_mrf = 14;
3317 load->mlen = 1;
3318 }
3319 emit_before(inst, load);
3320 }
3321
3322 /**
3323 * Implements array access of uniforms by inserting a
3324 * PULL_CONSTANT_LOAD instruction.
3325 *
3326 * Unlike temporary GRF array access (which we don't support, due to
3327 * the difficulty of doing relative addressing on instruction
3328 * destinations), we could potentially do array access of uniforms
3329 * that were loaded in GRF space as push constants. In real-world
3330 * usage we've seen, though, the arrays being used are always larger
3331 * than we could load as push constants, so just always move all
3332 * uniform array access out to a pull constant buffer.
3333 */
3334 void
3335 vec4_visitor::move_uniform_array_access_to_pull_constants()
3336 {
3337 int pull_constant_loc[this->uniforms];
3338
3339 for (int i = 0; i < this->uniforms; i++) {
3340 pull_constant_loc[i] = -1;
3341 }
3342
3343 /* Walk through and find array access of uniforms. Put a copy of that
3344 * uniform in the pull constant buffer.
3345 *
3346 * Note that we don't move constant-indexed accesses to arrays. No
3347 * testing has been done of the performance impact of this choice.
3348 */
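/* For a hypothetical `uniform vec4 arr[8]` accessed as arr[i], every
 * constant of the array is copied into pull_param and the reladdr source
 * is rewritten below into a temporary filled by emit_pull_constant_load().
 */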
3349 foreach_list_safe(node, &this->instructions) {
3350 vec4_instruction *inst = (vec4_instruction *)node;
3351
3352 for (int i = 0 ; i < 3; i++) {
3353 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3354 continue;
3355
3356 int uniform = inst->src[i].reg;
3357
3358 /* If this array isn't already present in the pull constant buffer,
3359 * add it.
3360 */
3361 if (pull_constant_loc[uniform] == -1) {
3362 const float **values = &stage_prog_data->param[uniform * 4];
3363
3364 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3365
3366 assert(uniform < uniform_array_size);
3367 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3368 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3369 = values[j];
3370 }
3371 }
3372
3373 /* Set up the annotation tracking for new generated instructions. */
3374 base_ir = inst->ir;
3375 current_annotation = inst->annotation;
3376
3377 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3378
3379 emit_pull_constant_load(inst, temp, inst->src[i],
3380 pull_constant_loc[uniform]);
3381
3382 inst->src[i].file = temp.file;
3383 inst->src[i].reg = temp.reg;
3384 inst->src[i].reg_offset = temp.reg_offset;
3385 inst->src[i].reladdr = NULL;
3386 }
3387 }
3388
3389 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3390 * no need to track them as larger-than-vec4 objects. This will be
3391 * relied on in cutting out unused uniform vectors from push
3392 * constants.
3393 */
3394 split_uniform_registers();
3395 }
3396
3397 void
3398 vec4_visitor::resolve_ud_negate(src_reg *reg)
3399 {
3400 if (reg->type != BRW_REGISTER_TYPE_UD ||
3401 !reg->negate)
3402 return;
3403
3404 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3405 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3406 *reg = temp;
3407 }
3408
3409 vec4_visitor::vec4_visitor(struct brw_context *brw,
3410 struct brw_vec4_compile *c,
3411 struct gl_program *prog,
3412 const struct brw_vec4_prog_key *key,
3413 struct brw_vec4_prog_data *prog_data,
3414 struct gl_shader_program *shader_prog,
3415 gl_shader_stage stage,
3416 void *mem_ctx,
3417 bool debug_flag,
3418 bool no_spills,
3419 shader_time_shader_type st_base,
3420 shader_time_shader_type st_written,
3421 shader_time_shader_type st_reset)
3422 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3423 c(c),
3424 key(key),
3425 prog_data(prog_data),
3426 sanity_param_count(0),
3427 fail_msg(NULL),
3428 first_non_payload_grf(0),
3429 need_all_constants_in_pull_buffer(false),
3430 debug_flag(debug_flag),
3431 no_spills(no_spills),
3432 st_base(st_base),
3433 st_written(st_written),
3434 st_reset(st_reset)
3435 {
3436 this->mem_ctx = mem_ctx;
3437 this->failed = false;
3438
3439 this->base_ir = NULL;
3440 this->current_annotation = NULL;
3441 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3442
3443 this->variable_ht = hash_table_ctor(0,
3444 hash_table_pointer_hash,
3445 hash_table_pointer_compare);
3446
3447 this->virtual_grf_start = NULL;
3448 this->virtual_grf_end = NULL;
3449 this->virtual_grf_sizes = NULL;
3450 this->virtual_grf_count = 0;
3451 this->virtual_grf_reg_map = NULL;
3452 this->virtual_grf_reg_count = 0;
3453 this->virtual_grf_array_size = 0;
3454 this->live_intervals_valid = false;
3455
3456 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3457
3458 this->uniforms = 0;
3459
3460 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3461 * at least one. See setup_uniforms() in brw_vec4.cpp.
3462 */
3463 this->uniform_array_size = 1;
3464 if (prog_data) {
3465 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3466 }
3467
3468 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3469 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3470 }
3471
3472 vec4_visitor::~vec4_visitor()
3473 {
3474 hash_table_dtor(this->variable_ht);
3475 }
3476
3477
3478 void
3479 vec4_visitor::fail(const char *format, ...)
3480 {
3481 va_list va;
3482 char *msg;
3483
3484 if (failed)
3485 return;
3486
3487 failed = true;
3488
3489 va_start(va, format);
3490 msg = ralloc_vasprintf(mem_ctx, format, va);
3491 va_end(va);
3492 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3493
3494 this->fail_msg = msg;
3495
3496 if (debug_flag) {
3497 fprintf(stderr, "%s", msg);
3498 }
3499 }
3500
3501 } /* namespace brw */