i965: Skip allocating UNIFORM file storage for uniforms of size 0.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
123 const src_reg &src1) \
124 { \
125 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
126 src0, src1); \
127 }
128
129 #define ALU2_ACC(op) \
130 vec4_instruction * \
131 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
132 const src_reg &src1) \
133 { \
134 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
135 BRW_OPCODE_##op, dst, src0, src1); \
136 inst->writes_accumulator = true; \
137 return inst; \
138 }
139
140 #define ALU3(op) \
141 vec4_instruction * \
142 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
143 const src_reg &src1, const src_reg &src2) \
144 { \
145 assert(brw->gen >= 6); \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1, src2); \
148 }
149
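/* Convenience constructors for the common ALU opcodes.  These build the
 * vec4_instruction but do not add it to the instruction stream; callers
 * still pass the result to emit().
 */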
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU1(F32TO16)
157 ALU1(F16TO32)
158 ALU2(ADD)
159 ALU2(MUL)
160 ALU2_ACC(MACH)
161 ALU2(AND)
162 ALU2(OR)
163 ALU2(XOR)
164 ALU2(DP3)
165 ALU2(DP4)
166 ALU2(DPH)
167 ALU2(SHL)
168 ALU2(SHR)
169 ALU2(ASR)
170 ALU3(LRP)
171 ALU1(BFREV)
172 ALU3(BFE)
173 ALU2(BFI1)
174 ALU3(BFI2)
175 ALU1(FBH)
176 ALU1(FBL)
177 ALU1(CBIT)
178 ALU3(MAD)
179 ALU2_ACC(ADDC)
180 ALU2_ACC(SUBB)
181 ALU2(MAC)
182
183 /** Gen4 predicated IF. */
184 vec4_instruction *
185 vec4_visitor::IF(enum brw_predicate predicate)
186 {
187 vec4_instruction *inst;
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
190 inst->predicate = predicate;
191
192 return inst;
193 }
194
195 /** Gen6 IF with embedded comparison. */
196 vec4_instruction *
197 vec4_visitor::IF(src_reg src0, src_reg src1,
198 enum brw_conditional_mod condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
221 enum brw_conditional_mod condition)
222 {
223 vec4_instruction *inst;
224
225 /* original gen4 does type conversion to the destination type
226 * before comparison, producing garbage results for floating
227 * point comparisons.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
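/* Scratch (spill space) access on gen4-style hardware goes through a
 * message whose payload is assembled in fixed MRFs starting at base_mrf.
 */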
244 vec4_instruction *
245 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
246 {
247 vec4_instruction *inst;
248
249 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
250 dst, index);
251 inst->base_mrf = 14;
252 inst->mlen = 2;
253
254 return inst;
255 }
256
257 vec4_instruction *
258 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
259 const src_reg &index)
260 {
261 vec4_instruction *inst;
262
263 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
264 dst, src, index);
265 inst->base_mrf = 13;
266 inst->mlen = 3;
267
268 return inst;
269 }
270
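/* Emit a dot product of 2, 3, or 4 components, mapping directly to the
 * DP2/DP3/DP4 opcodes.
 */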
271 void
272 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
273 {
274 static enum opcode dot_opcodes[] = {
275 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
276 };
277
278 emit(dot_opcodes[elements - 2], dst, src0, src1);
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(src_reg src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(MOV(expanded, src));
304 return src_reg(expanded);
305 }
306
307 src_reg
308 vec4_visitor::fix_math_operand(src_reg src)
309 {
310 /* The gen6 math instruction ignores the source modifiers --
311 * swizzle, abs, negate, and at least some parts of the register
312 * region description.
313 *
314 * Rather than trying to enumerate all these cases, *always* expand the
315 * operand to a temp GRF for gen6.
316 *
317 * For gen7, keep the operand as-is, except if immediate, which gen7 still
318 * can't use.
319 */
320
321 if (brw->gen == 7 && src.file != IMM)
322 return src;
323
324 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
325 expanded.type = src.type;
326 emit(MOV(expanded, src));
327 return src_reg(expanded);
328 }
329
330 void
331 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
332 {
333 src = fix_math_operand(src);
334
335 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
336 /* The gen6 math instruction must be align1, so we can't do
337 * writemasks.
338 */
339 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
340
341 emit(opcode, temp_dst, src);
342
343 emit(MOV(dst, src_reg(temp_dst)));
344 } else {
345 emit(opcode, dst, src);
346 }
347 }
348
349 void
350 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
351 {
352 vec4_instruction *inst = emit(opcode, dst, src);
353 inst->base_mrf = 1;
354 inst->mlen = 1;
355 }
356
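/* Emit a single-operand math instruction, dispatching to the
 * per-generation helper: gen8+ takes the operand directly, gen6/7 go
 * through emit_math1_gen6() and its operand fixups, and gen4/5 use the
 * message-based math unit via emit_math1_gen4().
 */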
357 void
358 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
359 {
360 switch (opcode) {
361 case SHADER_OPCODE_RCP:
362 case SHADER_OPCODE_RSQ:
363 case SHADER_OPCODE_SQRT:
364 case SHADER_OPCODE_EXP2:
365 case SHADER_OPCODE_LOG2:
366 case SHADER_OPCODE_SIN:
367 case SHADER_OPCODE_COS:
368 break;
369 default:
370 unreachable("not reached: bad math opcode");
371 }
372
373 if (brw->gen >= 8) {
374 emit(opcode, dst, src);
375 } else if (brw->gen >= 6) {
376 emit_math1_gen6(opcode, dst, src);
377 } else {
378 emit_math1_gen4(opcode, dst, src);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen6(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 src0 = fix_math_operand(src0);
387 src1 = fix_math_operand(src1);
388
389 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
390 /* The gen6 math instruction must be align1, so we can't do
391 * writemasks.
392 */
393 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
394 temp_dst.type = dst.type;
395
396 emit(opcode, temp_dst, src0, src1);
397
398 emit(MOV(dst, src_reg(temp_dst)));
399 } else {
400 emit(opcode, dst, src0, src1);
401 }
402 }
403
404 void
405 vec4_visitor::emit_math2_gen4(enum opcode opcode,
406 dst_reg dst, src_reg src0, src_reg src1)
407 {
408 vec4_instruction *inst = emit(opcode, dst, src0, src1);
409 inst->base_mrf = 1;
410 inst->mlen = 2;
411 }
412
413 void
414 vec4_visitor::emit_math(enum opcode opcode,
415 dst_reg dst, src_reg src0, src_reg src1)
416 {
417 switch (opcode) {
418 case SHADER_OPCODE_POW:
419 case SHADER_OPCODE_INT_QUOTIENT:
420 case SHADER_OPCODE_INT_REMAINDER:
421 break;
422 default:
423 unreachable("not reached: unsupported binary math opcode");
424 }
425
426 if (brw->gen >= 8) {
427 emit(opcode, dst, src0, src1);
428 } else if (brw->gen >= 6) {
429 emit_math2_gen6(opcode, dst, src0, src1);
430 } else {
431 emit_math2_gen4(opcode, dst, src0, src1);
432 }
433 }
434
435 void
436 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
437 {
438 if (brw->gen < 7) {
439 unreachable("ir_unop_pack_half_2x16 should be lowered");
440 }
441
442 assert(dst.type == BRW_REGISTER_TYPE_UD);
443 assert(src0.type == BRW_REGISTER_TYPE_F);
444
445 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
446 *
447 * Because this instruction does not have a 16-bit floating-point type,
448 * the destination data type must be Word (W).
449 *
450 * The destination must be DWord-aligned and specify a horizontal stride
451 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
452 * each destination channel and the upper word is not modified.
453 *
454 * The above restriction implies that the f32to16 instruction must use
455 * align1 mode, because only in align1 mode is it possible to specify
456 * horizontal stride. We choose here to defy the hardware docs and emit
457 * align16 instructions.
458 *
459 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
460 * instructions. I was partially successful in that the code passed all
461 * tests. However, the code was dubiously correct and fragile, and the
462 * tests were not harsh enough to probe that frailty. Not trusting the
463 * code, I chose instead to remain in align16 mode in defiance of the hw
464 * docs).
465 *
466 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
467 * simulator, emitting a f32to16 in align16 mode with UD as destination
468 * data type is safe. The behavior differs from that specified in the PRM
469 * in that the upper word of each destination channel is cleared to 0.
470 */
471
472 dst_reg tmp_dst(this, glsl_type::uvec2_type);
473 src_reg tmp_src(tmp_dst);
474
475 #if 0
476 /* Verify the undocumented behavior on which the following instructions
477 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
478 * then the result of the bit-or instruction below will be incorrect.
479 *
480 * You should inspect the disasm output in order to verify that the MOV is
481 * not optimized away.
482 */
483 emit(MOV(tmp_dst, src_reg(0x12345678u)));
484 #endif
485
486 /* Give tmp the form below, where "." means untouched.
487 *
488 * w z y x w z y x
489 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
490 *
491 * That the upper word of each write-channel be 0 is required for the
492 * following bit-shift and bit-or instructions to work. Note that this
493 * relies on the undocumented hardware behavior mentioned above.
494 */
495 tmp_dst.writemask = WRITEMASK_XY;
496 emit(F32TO16(tmp_dst, src0));
497
498 /* Give the write-channels of dst the form:
499 * 0xhhhh0000
500 */
501 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
502 emit(SHL(dst, tmp_src, src_reg(16u)));
503
504 /* Finally, give the write-channels of dst the form of packHalf2x16's
505 * output:
506 * 0xhhhhllll
507 */
508 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
509 emit(OR(dst, src_reg(dst), tmp_src));
510 }
511
512 void
513 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
514 {
515 if (brw->gen < 7) {
516 unreachable("ir_unop_unpack_half_2x16 should be lowered");
517 }
518
519 assert(dst.type == BRW_REGISTER_TYPE_F);
520 assert(src0.type == BRW_REGISTER_TYPE_UD);
521
522 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
523 *
524 * Because this instruction does not have a 16-bit floating-point type,
525 * the source data type must be Word (W). The destination type must be
526 * F (Float).
527 *
528 * To use W as the source data type, we must adjust horizontal strides,
529 * which is only possible in align1 mode. All my [chadv] attempts at
530 * emitting align1 instructions for unpackHalf2x16 failed to pass the
531 * Piglit tests, so I gave up.
532 *
533 * I've verified that, on gen7 hardware and the simulator, it is safe to
534 * emit f16to32 in align16 mode with UD as source data type.
535 */
536
537 dst_reg tmp_dst(this, glsl_type::uvec2_type);
538 src_reg tmp_src(tmp_dst);
539
540 tmp_dst.writemask = WRITEMASK_X;
541 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
542
543 tmp_dst.writemask = WRITEMASK_Y;
544 emit(SHR(tmp_dst, src0, src_reg(16u)));
545
546 dst.writemask = WRITEMASK_XY;
547 emit(F16TO32(dst, tmp_src));
548 }
549
550 void
551 vec4_visitor::visit_instructions(const exec_list *list)
552 {
553 foreach_in_list(ir_instruction, ir, list) {
554 base_ir = ir;
555 ir->accept(this);
556 }
557 }
558
559
560 static int
561 type_size(const struct glsl_type *type)
562 {
563 unsigned int i;
564 int size;
565
566 switch (type->base_type) {
567 case GLSL_TYPE_UINT:
568 case GLSL_TYPE_INT:
569 case GLSL_TYPE_FLOAT:
570 case GLSL_TYPE_BOOL:
571 if (type->is_matrix()) {
572 return type->matrix_columns;
573 } else {
574 /* Regardless of size of vector, it gets a vec4. This is bad
575 * packing for things like floats, but otherwise arrays become a
576 * mess. Hopefully a later pass over the code can pack scalars
577 * down if appropriate.
578 */
579 return 1;
580 }
581 case GLSL_TYPE_ARRAY:
582 assert(type->length > 0);
583 return type_size(type->fields.array) * type->length;
584 case GLSL_TYPE_STRUCT:
585 size = 0;
586 for (i = 0; i < type->length; i++) {
587 size += type_size(type->fields.structure[i].type);
588 }
589 return size;
590 case GLSL_TYPE_SAMPLER:
591 /* Samplers take up one slot in UNIFORMS[], but they're baked in
592 * at link time.
593 */
594 return 1;
595 case GLSL_TYPE_ATOMIC_UINT:
596 return 0;
597 case GLSL_TYPE_IMAGE:
598 case GLSL_TYPE_VOID:
599 case GLSL_TYPE_ERROR:
600 case GLSL_TYPE_INTERFACE:
601 unreachable("not reached");
602 }
603
604 return 0;
605 }
606
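/* Allocate a new virtual GRF of the given size (in vec4 registers),
 * growing the size and reg-map arrays geometrically as needed, and
 * return its index.
 */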
607 int
608 vec4_visitor::virtual_grf_alloc(int size)
609 {
610 if (virtual_grf_array_size <= virtual_grf_count) {
611 if (virtual_grf_array_size == 0)
612 virtual_grf_array_size = 16;
613 else
614 virtual_grf_array_size *= 2;
615 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
616 virtual_grf_array_size);
617 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
618 virtual_grf_array_size);
619 }
620 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
621 virtual_grf_reg_count += size;
622 virtual_grf_sizes[virtual_grf_count] = size;
623 return virtual_grf_count++;
624 }
625
626 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
627 {
628 init();
629
630 this->file = GRF;
631 this->reg = v->virtual_grf_alloc(type_size(type));
632
633 if (type->is_array() || type->is_record()) {
634 this->swizzle = BRW_SWIZZLE_NOOP;
635 } else {
636 this->swizzle = swizzle_for_size(type->vector_elements);
637 }
638
639 this->type = brw_type_for_base_type(type);
640 }
641
642 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
643 {
644 init();
645
646 this->file = GRF;
647 this->reg = v->virtual_grf_alloc(type_size(type));
648
649 if (type->is_array() || type->is_record()) {
650 this->writemask = WRITEMASK_XYZW;
651 } else {
652 this->writemask = (1 << type->vector_elements) - 1;
653 }
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 /* Our support for uniforms is piggy-backed on the struct
659 * gl_fragment_program, because that's where the values actually
660 * get stored, rather than in some global gl_shader_program uniform
661 * store.
662 */
663 void
664 vec4_visitor::setup_uniform_values(ir_variable *ir)
665 {
666 int namelen = strlen(ir->name);
667
668 /* The data for our (non-builtin) uniforms is stored in a series of
669 * gl_uniform_driver_storage structs for each subcomponent that
670 * glGetUniformLocation() could name. We know it's been set up in the same
671 * order we'd walk the type, so walk the list of storage and find anything
672 * with our name, or the prefix of a component that starts with our name.
673 */
674 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
675 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
676
677 if (strncmp(ir->name, storage->name, namelen) != 0 ||
678 (storage->name[namelen] != 0 &&
679 storage->name[namelen] != '.' &&
680 storage->name[namelen] != '[')) {
681 continue;
682 }
683
684 gl_constant_value *components = storage->storage;
685 unsigned vector_count = (MAX2(storage->array_elements, 1) *
686 storage->type->matrix_columns);
687
688 for (unsigned s = 0; s < vector_count; s++) {
689 assert(uniforms < uniform_array_size);
690 uniform_vector_size[uniforms] = storage->type->vector_elements;
691
692 int i;
693 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
694 stage_prog_data->param[uniforms * 4 + i] = components;
695 components++;
696 }
697 for (; i < 4; i++) {
698 static gl_constant_value zero = { 0.0 };
699 stage_prog_data->param[uniforms * 4 + i] = &zero;
700 }
701
702 uniforms++;
703 }
704 }
705 }
706
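/* Set up one vec4 uniform per enabled user clip plane, pointing the
 * param[] entries at the clip plane values selected for this program.
 */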
707 void
708 vec4_visitor::setup_uniform_clipplane_values()
709 {
710 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
711
712 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
713 assert(this->uniforms < uniform_array_size);
714 this->uniform_vector_size[this->uniforms] = 4;
715 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
716 this->userplane[i].type = BRW_REGISTER_TYPE_F;
717 for (int j = 0; j < 4; ++j) {
718 stage_prog_data->param[this->uniforms * 4 + j] =
719 (gl_constant_value *) &clip_planes[i][j];
720 }
721 ++this->uniforms;
722 }
723 }
724
725 /* Our support for builtin uniforms is even scarier than non-builtin.
726 * It sits on top of the PROG_STATE_VAR parameters that are
727 * automatically updated from GL context state.
728 */
729 void
730 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
731 {
732 const ir_state_slot *const slots = ir->state_slots;
733 assert(ir->state_slots != NULL);
734
735 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
736 /* This state reference has already been set up by ir_to_mesa,
737 * but we'll get the same index back here. We can reference
738 * ParameterValues directly, since unlike brw_fs.cpp, we never
739 * add new state references during compile.
740 */
741 int index = _mesa_add_state_reference(this->prog->Parameters,
742 (gl_state_index *)slots[i].tokens);
743 gl_constant_value *values =
744 &this->prog->Parameters->ParameterValues[index][0];
745
746 assert(this->uniforms < uniform_array_size);
747 this->uniform_vector_size[this->uniforms] = 0;
748 /* Add each of the unique swizzled channels of the element.
749 * This will end up matching the size of the glsl_type of this field.
750 */
751 int last_swiz = -1;
752 for (unsigned int j = 0; j < 4; j++) {
753 int swiz = GET_SWZ(slots[i].swizzle, j);
754 if (swiz == last_swiz)
755 break;
756 last_swiz = swiz;
757 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
758 assert(this->uniforms < uniform_array_size);
759 this->uniform_vector_size[this->uniforms]++;
760 }
761 this->uniforms++;
762 }
763 }
764
765 dst_reg *
766 vec4_visitor::variable_storage(ir_variable *var)
767 {
768 return (dst_reg *)hash_table_find(this->variable_ht, var);
769 }
770
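/* Evaluate a boolean rvalue and leave the result in the flag register,
 * folding comparison expressions directly into CMP where possible.  The
 * predicate a caller should use to consume the flag is returned in
 * *predicate.
 */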
771 void
772 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
773 enum brw_predicate *predicate)
774 {
775 ir_expression *expr = ir->as_expression();
776
777 *predicate = BRW_PREDICATE_NORMAL;
778
779 if (expr && expr->operation != ir_binop_ubo_load) {
780 src_reg op[3];
781 vec4_instruction *inst;
782
783 assert(expr->get_num_operands() <= 3);
784 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
785 expr->operands[i]->accept(this);
786 op[i] = this->result;
787
788 resolve_ud_negate(&op[i]);
789 }
790
791 switch (expr->operation) {
792 case ir_unop_logic_not:
793 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
794 inst->conditional_mod = BRW_CONDITIONAL_Z;
795 break;
796
797 case ir_binop_logic_xor:
798 inst = emit(XOR(dst_null_d(), op[0], op[1]));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 break;
801
802 case ir_binop_logic_or:
803 inst = emit(OR(dst_null_d(), op[0], op[1]));
804 inst->conditional_mod = BRW_CONDITIONAL_NZ;
805 break;
806
807 case ir_binop_logic_and:
808 inst = emit(AND(dst_null_d(), op[0], op[1]));
809 inst->conditional_mod = BRW_CONDITIONAL_NZ;
810 break;
811
812 case ir_unop_f2b:
813 if (brw->gen >= 6) {
814 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
815 } else {
816 inst = emit(MOV(dst_null_f(), op[0]));
817 inst->conditional_mod = BRW_CONDITIONAL_NZ;
818 }
819 break;
820
821 case ir_unop_i2b:
822 if (brw->gen >= 6) {
823 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
824 } else {
825 inst = emit(MOV(dst_null_d(), op[0]));
826 inst->conditional_mod = BRW_CONDITIONAL_NZ;
827 }
828 break;
829
830 case ir_binop_all_equal:
831 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
832 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
833 break;
834
835 case ir_binop_any_nequal:
836 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
837 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
838 break;
839
840 case ir_unop_any:
841 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
842 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
843 break;
844
845 case ir_binop_greater:
846 case ir_binop_gequal:
847 case ir_binop_less:
848 case ir_binop_lequal:
849 case ir_binop_equal:
850 case ir_binop_nequal:
851 emit(CMP(dst_null_d(), op[0], op[1],
852 brw_conditional_for_comparison(expr->operation)));
853 break;
854
855 case ir_triop_csel: {
856 /* Expand the boolean condition into the flag register. */
857 inst = emit(MOV(dst_null_d(), op[0]));
858 inst->conditional_mod = BRW_CONDITIONAL_NZ;
859
860 /* Select which boolean to return. */
861 dst_reg temp(this, expr->operands[1]->type);
862 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
863 inst->predicate = BRW_PREDICATE_NORMAL;
864
865 /* Expand the result to a condition code. */
866 inst = emit(MOV(dst_null_d(), src_reg(temp)));
867 inst->conditional_mod = BRW_CONDITIONAL_NZ;
868 break;
869 }
870
871 default:
872 unreachable("not reached");
873 }
874 return;
875 }
876
877 ir->accept(this);
878
879 resolve_ud_negate(&this->result);
880
881 if (brw->gen >= 6) {
882 vec4_instruction *inst = emit(AND(dst_null_d(),
883 this->result, src_reg(1)));
884 inst->conditional_mod = BRW_CONDITIONAL_NZ;
885 } else {
886 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
887 inst->conditional_mod = BRW_CONDITIONAL_NZ;
888 }
889 }
890
891 /**
892 * Emit a gen6 IF statement with the comparison folded into the IF
893 * instruction.
894 */
895 void
896 vec4_visitor::emit_if_gen6(ir_if *ir)
897 {
898 ir_expression *expr = ir->condition->as_expression();
899
900 if (expr && expr->operation != ir_binop_ubo_load) {
901 src_reg op[3];
902 dst_reg temp;
903
904 assert(expr->get_num_operands() <= 3);
905 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
906 expr->operands[i]->accept(this);
907 op[i] = this->result;
908 }
909
910 switch (expr->operation) {
911 case ir_unop_logic_not:
912 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
913 return;
914
915 case ir_binop_logic_xor:
916 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
917 return;
918
919 case ir_binop_logic_or:
920 temp = dst_reg(this, glsl_type::bool_type);
921 emit(OR(temp, op[0], op[1]));
922 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
923 return;
924
925 case ir_binop_logic_and:
926 temp = dst_reg(this, glsl_type::bool_type);
927 emit(AND(temp, op[0], op[1]));
928 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
929 return;
930
931 case ir_unop_f2b:
932 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
933 return;
934
935 case ir_unop_i2b:
936 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
937 return;
938
939 case ir_binop_greater:
940 case ir_binop_gequal:
941 case ir_binop_less:
942 case ir_binop_lequal:
943 case ir_binop_equal:
944 case ir_binop_nequal:
945 emit(IF(op[0], op[1],
946 brw_conditional_for_comparison(expr->operation)));
947 return;
948
949 case ir_binop_all_equal:
950 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
951 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
952 return;
953
954 case ir_binop_any_nequal:
955 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
956 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
957 return;
958
959 case ir_unop_any:
960 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
961 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
962 return;
963
964 case ir_triop_csel: {
965 /* Expand the boolean condition into the flag register. */
966 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
967 inst->conditional_mod = BRW_CONDITIONAL_NZ;
968
969 /* Select which boolean to return. */
970 dst_reg temp(this, expr->operands[1]->type);
971 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
972 inst->predicate = BRW_PREDICATE_NORMAL;
973
974 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
975 return;
976 }
977
978 default:
979 unreachable("not reached");
980 }
981 return;
982 }
983
984 ir->condition->accept(this);
985
986 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
987 }
988
989 void
990 vec4_visitor::visit(ir_variable *ir)
991 {
992 dst_reg *reg = NULL;
993
994 if (variable_storage(ir))
995 return;
996
997 switch (ir->data.mode) {
998 case ir_var_shader_in:
999 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1000 break;
1001
1002 case ir_var_shader_out:
1003 reg = new(mem_ctx) dst_reg(this, ir->type);
1004
1005 for (int i = 0; i < type_size(ir->type); i++) {
1006 output_reg[ir->data.location + i] = *reg;
1007 output_reg[ir->data.location + i].reg_offset = i;
1008 output_reg[ir->data.location + i].type =
1009 brw_type_for_base_type(ir->type->get_scalar_type());
1010 output_reg_annotation[ir->data.location + i] = ir->name;
1011 }
1012 break;
1013
1014 case ir_var_auto:
1015 case ir_var_temporary:
1016 reg = new(mem_ctx) dst_reg(this, ir->type);
1017 break;
1018
1019 case ir_var_uniform:
1020 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1021
1022 /* Thanks to the lower_ubo_reference pass, we will see only
1023 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1024 * variables, so no need for them to be in variable_ht.
1025 *
1026 * Some uniforms, such as samplers and atomic counters, have no actual
1027 * storage, so we should ignore them.
1028 */
1029 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1030 return;
1031
1032 /* Track how big the whole uniform variable is, in case we need to put a
1033 * copy of its data into pull constants for array access.
1034 */
1035 assert(this->uniforms < uniform_array_size);
1036 this->uniform_size[this->uniforms] = type_size(ir->type);
1037
1038 if (!strncmp(ir->name, "gl_", 3)) {
1039 setup_builtin_uniform_values(ir);
1040 } else {
1041 setup_uniform_values(ir);
1042 }
1043 break;
1044
1045 case ir_var_system_value:
1046 reg = make_reg_for_system_value(ir);
1047 break;
1048
1049 default:
1050 unreachable("not reached");
1051 }
1052
1053 reg->type = brw_type_for_base_type(ir->type);
1054 hash_table_insert(this->variable_ht, reg, ir);
1055 }
1056
1057 void
1058 vec4_visitor::visit(ir_loop *ir)
1059 {
1060 /* We don't want debugging output to print the whole body of the
1061 * loop as the annotation.
1062 */
1063 this->base_ir = NULL;
1064
1065 emit(BRW_OPCODE_DO);
1066
1067 visit_instructions(&ir->body_instructions);
1068
1069 emit(BRW_OPCODE_WHILE);
1070 }
1071
1072 void
1073 vec4_visitor::visit(ir_loop_jump *ir)
1074 {
1075 switch (ir->mode) {
1076 case ir_loop_jump::jump_break:
1077 emit(BRW_OPCODE_BREAK);
1078 break;
1079 case ir_loop_jump::jump_continue:
1080 emit(BRW_OPCODE_CONTINUE);
1081 break;
1082 }
1083 }
1084
1085
1086 void
1087 vec4_visitor::visit(ir_function_signature *)
1088 {
1089 unreachable("not reached");
1090 }
1091
1092 void
1093 vec4_visitor::visit(ir_function *ir)
1094 {
1095 /* Ignore function bodies other than main() -- we shouldn't see calls to
1096 * them since they should all be inlined.
1097 */
1098 if (strcmp(ir->name, "main") == 0) {
1099 const ir_function_signature *sig;
1100 exec_list empty;
1101
1102 sig = ir->matching_signature(NULL, &empty, false);
1103
1104 assert(sig);
1105
1106 visit_instructions(&sig->body);
1107 }
1108 }
1109
1110 bool
1111 vec4_visitor::try_emit_mad(ir_expression *ir)
1112 {
1113 /* 3-src instructions were introduced in gen6. */
1114 if (brw->gen < 6)
1115 return false;
1116
1117 /* MAD can only handle floating-point data. */
1118 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1119 return false;
1120
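/* Look for a multiply in either operand of the add; the other operand
 * becomes the addend of the MAD.
 */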
1121 ir_rvalue *nonmul = ir->operands[1];
1122 ir_expression *mul = ir->operands[0]->as_expression();
1123
1124 if (!mul || mul->operation != ir_binop_mul) {
1125 nonmul = ir->operands[0];
1126 mul = ir->operands[1]->as_expression();
1127
1128 if (!mul || mul->operation != ir_binop_mul)
1129 return false;
1130 }
1131
1132 nonmul->accept(this);
1133 src_reg src0 = fix_3src_operand(this->result);
1134
1135 mul->operands[0]->accept(this);
1136 src_reg src1 = fix_3src_operand(this->result);
1137
1138 mul->operands[1]->accept(this);
1139 src_reg src2 = fix_3src_operand(this->result);
1140
1141 this->result = src_reg(this, ir->type);
1142 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1143
1144 return true;
1145 }
1146
1147 bool
1148 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1149 {
1150 /* This optimization relies on CMP setting the destination to 0 when
1151 * false. Early hardware only sets the least significant bit, and
1152 * leaves the other bits undefined. So we can't use it.
1153 */
1154 if (brw->gen < 6)
1155 return false;
1156
1157 ir_expression *const cmp = ir->operands[0]->as_expression();
1158
1159 if (cmp == NULL)
1160 return false;
1161
1162 switch (cmp->operation) {
1163 case ir_binop_less:
1164 case ir_binop_greater:
1165 case ir_binop_lequal:
1166 case ir_binop_gequal:
1167 case ir_binop_equal:
1168 case ir_binop_nequal:
1169 break;
1170
1171 default:
1172 return false;
1173 }
1174
1175 cmp->operands[0]->accept(this);
1176 const src_reg cmp_src0 = this->result;
1177
1178 cmp->operands[1]->accept(this);
1179 const src_reg cmp_src1 = this->result;
1180
1181 this->result = src_reg(this, ir->type);
1182
1183 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1184 brw_conditional_for_comparison(cmp->operation)));
1185
1186 /* If the comparison is false, this->result will just happen to be zero.
1187 */
1188 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1189 this->result, src_reg(1.0f));
1190 inst->predicate = BRW_PREDICATE_NORMAL;
1191 inst->predicate_inverse = true;
1192
1193 return true;
1194 }
1195
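/* Emit a MIN/MAX: on gen6+ this is a single conditional-mod SEL, while
 * older hardware needs a CMP followed by a predicated SEL.
 */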
1196 void
1197 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1198 src_reg src0, src_reg src1)
1199 {
1200 vec4_instruction *inst;
1201
1202 if (brw->gen >= 6) {
1203 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1204 inst->conditional_mod = conditionalmod;
1205 } else {
1206 emit(CMP(dst, src0, src1, conditionalmod));
1207
1208 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1209 inst->predicate = BRW_PREDICATE_NORMAL;
1210 }
1211 }
1212
1213 void
1214 vec4_visitor::emit_lrp(const dst_reg &dst,
1215 const src_reg &x, const src_reg &y, const src_reg &a)
1216 {
1217 if (brw->gen >= 6) {
1218 /* Note that the instruction's argument order is reversed from GLSL
1219 * and the IR.
1220 */
1221 emit(LRP(dst,
1222 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1223 } else {
1224 /* Earlier generations don't support three source operations, so we
1225 * need to emit x*(1-a) + y*a.
1226 */
1227 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1228 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1229 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1230 y_times_a.writemask = dst.writemask;
1231 one_minus_a.writemask = dst.writemask;
1232 x_times_one_minus_a.writemask = dst.writemask;
1233
1234 emit(MUL(y_times_a, y, a));
1235 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1236 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1237 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1238 }
1239 }
1240
1241 void
1242 vec4_visitor::visit(ir_expression *ir)
1243 {
1244 unsigned int operand;
1245 src_reg op[Elements(ir->operands)];
1246 src_reg result_src;
1247 dst_reg result_dst;
1248 vec4_instruction *inst;
1249
1250 if (ir->operation == ir_binop_add) {
1251 if (try_emit_mad(ir))
1252 return;
1253 }
1254
1255 if (ir->operation == ir_unop_b2f) {
1256 if (try_emit_b2f_of_compare(ir))
1257 return;
1258 }
1259
1260 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1261 this->result.file = BAD_FILE;
1262 ir->operands[operand]->accept(this);
1263 if (this->result.file == BAD_FILE) {
1264 fprintf(stderr, "Failed to get tree for expression operand:\n");
1265 ir->operands[operand]->fprint(stderr);
1266 exit(1);
1267 }
1268 op[operand] = this->result;
1269
1270 /* Matrix expression operands should have been broken down to vector
1271 * operations already.
1272 */
1273 assert(!ir->operands[operand]->type->is_matrix());
1274 }
1275
1276 int vector_elements = ir->operands[0]->type->vector_elements;
1277 if (ir->operands[1]) {
1278 vector_elements = MAX2(vector_elements,
1279 ir->operands[1]->type->vector_elements);
1280 }
1281
1282 this->result.file = BAD_FILE;
1283
1284 /* Storage for our result. Ideally for an assignment we'd be using
1285 * the actual storage for the result here, instead.
1286 */
1287 result_src = src_reg(this, ir->type);
1288 /* convenience for the emit functions below. */
1289 result_dst = dst_reg(result_src);
1290 /* If nothing special happens, this is the result. */
1291 this->result = result_src;
1292 /* Limit writes to the channels that will be used by result_src later.
1293 * This does limit this temp's use as a temporary for multi-instruction
1294 * sequences.
1295 */
1296 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1297
1298 switch (ir->operation) {
1299 case ir_unop_logic_not:
1300 if (ctx->Const.UniformBooleanTrue != 1) {
1301 emit(NOT(result_dst, op[0]));
1302 } else {
1303 emit(XOR(result_dst, op[0], src_reg(1)));
1304 }
1305 break;
1306 case ir_unop_neg:
1307 op[0].negate = !op[0].negate;
1308 emit(MOV(result_dst, op[0]));
1309 break;
1310 case ir_unop_abs:
1311 op[0].abs = true;
1312 op[0].negate = false;
1313 emit(MOV(result_dst, op[0]));
1314 break;
1315
1316 case ir_unop_sign:
1317 if (ir->type->is_float()) {
1318 /* AND(val, 0x80000000) gives the sign bit.
1319 *
1320 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1321 * zero.
1322 */
1323 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1324
1325 op[0].type = BRW_REGISTER_TYPE_UD;
1326 result_dst.type = BRW_REGISTER_TYPE_UD;
1327 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1328
1329 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1330 inst->predicate = BRW_PREDICATE_NORMAL;
1331
1332 this->result.type = BRW_REGISTER_TYPE_F;
1333 } else {
1334 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1335 * -> non-negative val generates 0x00000000.
1336 * Predicated OR sets 1 if val is positive.
1337 */
1338 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1339
1340 emit(ASR(result_dst, op[0], src_reg(31)));
1341
1342 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1343 inst->predicate = BRW_PREDICATE_NORMAL;
1344 }
1345 break;
1346
1347 case ir_unop_rcp:
1348 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1349 break;
1350
1351 case ir_unop_exp2:
1352 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1353 break;
1354 case ir_unop_log2:
1355 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1356 break;
1357 case ir_unop_exp:
1358 case ir_unop_log:
1359 unreachable("not reached: should be handled by ir_explog_to_explog2");
1360 case ir_unop_sin:
1361 case ir_unop_sin_reduced:
1362 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1363 break;
1364 case ir_unop_cos:
1365 case ir_unop_cos_reduced:
1366 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1367 break;
1368
1369 case ir_unop_dFdx:
1370 case ir_unop_dFdx_coarse:
1371 case ir_unop_dFdx_fine:
1372 case ir_unop_dFdy:
1373 case ir_unop_dFdy_coarse:
1374 case ir_unop_dFdy_fine:
1375 unreachable("derivatives not valid in vertex shader");
1376
1377 case ir_unop_bitfield_reverse:
1378 emit(BFREV(result_dst, op[0]));
1379 break;
1380 case ir_unop_bit_count:
1381 emit(CBIT(result_dst, op[0]));
1382 break;
1383 case ir_unop_find_msb: {
1384 src_reg temp = src_reg(this, glsl_type::uint_type);
1385
1386 inst = emit(FBH(dst_reg(temp), op[0]));
1387 inst->dst.writemask = WRITEMASK_XYZW;
1388
1389 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1390 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1391 * subtract the result from 31 to convert the MSB count into an LSB count.
1392 */
1393
1394 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1395 temp.swizzle = BRW_SWIZZLE_NOOP;
1396 emit(MOV(result_dst, temp));
1397
1398 src_reg src_tmp = src_reg(result_dst);
1399 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1400
1401 src_tmp.negate = true;
1402 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1403 inst->predicate = BRW_PREDICATE_NORMAL;
1404 break;
1405 }
1406 case ir_unop_find_lsb:
1407 emit(FBL(result_dst, op[0]));
1408 break;
1409 case ir_unop_saturate:
1410 inst = emit(MOV(result_dst, op[0]));
1411 inst->saturate = true;
1412 break;
1413
1414 case ir_unop_noise:
1415 unreachable("not reached: should be handled by lower_noise");
1416
1417 case ir_binop_add:
1418 emit(ADD(result_dst, op[0], op[1]));
1419 break;
1420 case ir_binop_sub:
1421 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1422
1423 case ir_binop_mul:
1424 if (brw->gen < 8 && ir->type->is_integer()) {
1425 /* For integer multiplication, the MUL uses the low 16 bits of one of
1426 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1427 * accumulates in the contribution of the upper 16 bits of that
1428 * operand. If we can determine that one of the args is in the low
1429 * 16 bits, though, we can just emit a single MUL.
1430 */
1431 if (ir->operands[0]->is_uint16_constant()) {
1432 if (brw->gen < 7)
1433 emit(MUL(result_dst, op[0], op[1]));
1434 else
1435 emit(MUL(result_dst, op[1], op[0]));
1436 } else if (ir->operands[1]->is_uint16_constant()) {
1437 if (brw->gen < 7)
1438 emit(MUL(result_dst, op[1], op[0]));
1439 else
1440 emit(MUL(result_dst, op[0], op[1]));
1441 } else {
1442 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1443
1444 emit(MUL(acc, op[0], op[1]));
1445 emit(MACH(dst_null_d(), op[0], op[1]));
1446 emit(MOV(result_dst, src_reg(acc)));
1447 }
1448 } else {
1449 emit(MUL(result_dst, op[0], op[1]));
1450 }
1451 break;
1452 case ir_binop_imul_high: {
1453 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1454
1455 emit(MUL(acc, op[0], op[1]));
1456 emit(MACH(result_dst, op[0], op[1]));
1457 break;
1458 }
1459 case ir_binop_div:
1460 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1461 assert(ir->type->is_integer());
1462 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1463 break;
1464 case ir_binop_carry: {
1465 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1466
1467 emit(ADDC(dst_null_ud(), op[0], op[1]));
1468 emit(MOV(result_dst, src_reg(acc)));
1469 break;
1470 }
1471 case ir_binop_borrow: {
1472 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1473
1474 emit(SUBB(dst_null_ud(), op[0], op[1]));
1475 emit(MOV(result_dst, src_reg(acc)));
1476 break;
1477 }
1478 case ir_binop_mod:
1479 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1480 assert(ir->type->is_integer());
1481 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1482 break;
1483
1484 case ir_binop_less:
1485 case ir_binop_greater:
1486 case ir_binop_lequal:
1487 case ir_binop_gequal:
1488 case ir_binop_equal:
1489 case ir_binop_nequal: {
1490 emit(CMP(result_dst, op[0], op[1],
1491 brw_conditional_for_comparison(ir->operation)));
1492 if (ctx->Const.UniformBooleanTrue == 1) {
1493 emit(AND(result_dst, result_src, src_reg(1)));
1494 }
1495 break;
1496 }
1497
1498 case ir_binop_all_equal:
1499 /* "==" operator producing a scalar boolean. */
1500 if (ir->operands[0]->type->is_vector() ||
1501 ir->operands[1]->type->is_vector()) {
1502 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1503 emit(MOV(result_dst, src_reg(0)));
1504 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1505 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1506 } else {
1507 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1508 if (ctx->Const.UniformBooleanTrue == 1) {
1509 emit(AND(result_dst, result_src, src_reg(1)));
1510 }
1511 }
1512 break;
1513 case ir_binop_any_nequal:
1514 /* "!=" operator producing a scalar boolean. */
1515 if (ir->operands[0]->type->is_vector() ||
1516 ir->operands[1]->type->is_vector()) {
1517 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1518
1519 emit(MOV(result_dst, src_reg(0)));
1520 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1521 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1522 } else {
1523 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1524 if (ctx->Const.UniformBooleanTrue == 1) {
1525 emit(AND(result_dst, result_src, src_reg(1)));
1526 }
1527 }
1528 break;
1529
1530 case ir_unop_any:
1531 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1532 emit(MOV(result_dst, src_reg(0)));
1533
1534 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1535 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1536 break;
1537
1538 case ir_binop_logic_xor:
1539 emit(XOR(result_dst, op[0], op[1]));
1540 break;
1541
1542 case ir_binop_logic_or:
1543 emit(OR(result_dst, op[0], op[1]));
1544 break;
1545
1546 case ir_binop_logic_and:
1547 emit(AND(result_dst, op[0], op[1]));
1548 break;
1549
1550 case ir_binop_dot:
1551 assert(ir->operands[0]->type->is_vector());
1552 assert(ir->operands[0]->type == ir->operands[1]->type);
1553 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1554 break;
1555
1556 case ir_unop_sqrt:
1557 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1558 break;
1559 case ir_unop_rsq:
1560 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1561 break;
1562
1563 case ir_unop_bitcast_i2f:
1564 case ir_unop_bitcast_u2f:
1565 this->result = op[0];
1566 this->result.type = BRW_REGISTER_TYPE_F;
1567 break;
1568
1569 case ir_unop_bitcast_f2i:
1570 this->result = op[0];
1571 this->result.type = BRW_REGISTER_TYPE_D;
1572 break;
1573
1574 case ir_unop_bitcast_f2u:
1575 this->result = op[0];
1576 this->result.type = BRW_REGISTER_TYPE_UD;
1577 break;
1578
1579 case ir_unop_i2f:
1580 case ir_unop_i2u:
1581 case ir_unop_u2i:
1582 case ir_unop_u2f:
1583 case ir_unop_f2i:
1584 case ir_unop_f2u:
1585 emit(MOV(result_dst, op[0]));
1586 break;
1587 case ir_unop_b2i:
1588 if (ctx->Const.UniformBooleanTrue != 1) {
1589 emit(AND(result_dst, op[0], src_reg(1)));
1590 } else {
1591 emit(MOV(result_dst, op[0]));
1592 }
1593 break;
1594 case ir_unop_b2f:
1595 if (ctx->Const.UniformBooleanTrue != 1) {
1596 op[0].type = BRW_REGISTER_TYPE_UD;
1597 result_dst.type = BRW_REGISTER_TYPE_UD;
1598 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1599 result_dst.type = BRW_REGISTER_TYPE_F;
1600 } else {
1601 emit(MOV(result_dst, op[0]));
1602 }
1603 break;
1604 case ir_unop_f2b:
1605 case ir_unop_i2b:
1606 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1607 if (ctx->Const.UniformBooleanTrue == 1) {
1608 emit(AND(result_dst, result_src, src_reg(1)));
1609 }
1610 break;
1611
1612 case ir_unop_trunc:
1613 emit(RNDZ(result_dst, op[0]));
1614 break;
1615 case ir_unop_ceil:
1616 op[0].negate = !op[0].negate;
1617 inst = emit(RNDD(result_dst, op[0]));
1618 this->result.negate = true;
1619 break;
1620 case ir_unop_floor:
1621 inst = emit(RNDD(result_dst, op[0]));
1622 break;
1623 case ir_unop_fract:
1624 inst = emit(FRC(result_dst, op[0]));
1625 break;
1626 case ir_unop_round_even:
1627 emit(RNDE(result_dst, op[0]));
1628 break;
1629
1630 case ir_binop_min:
1631 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1632 break;
1633 case ir_binop_max:
1634 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1635 break;
1636
1637 case ir_binop_pow:
1638 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1639 break;
1640
1641 case ir_unop_bit_not:
1642 inst = emit(NOT(result_dst, op[0]));
1643 break;
1644 case ir_binop_bit_and:
1645 inst = emit(AND(result_dst, op[0], op[1]));
1646 break;
1647 case ir_binop_bit_xor:
1648 inst = emit(XOR(result_dst, op[0], op[1]));
1649 break;
1650 case ir_binop_bit_or:
1651 inst = emit(OR(result_dst, op[0], op[1]));
1652 break;
1653
1654 case ir_binop_lshift:
1655 inst = emit(SHL(result_dst, op[0], op[1]));
1656 break;
1657
1658 case ir_binop_rshift:
1659 if (ir->type->base_type == GLSL_TYPE_INT)
1660 inst = emit(ASR(result_dst, op[0], op[1]));
1661 else
1662 inst = emit(SHR(result_dst, op[0], op[1]));
1663 break;
1664
1665 case ir_binop_bfm:
1666 emit(BFI1(result_dst, op[0], op[1]));
1667 break;
1668
1669 case ir_binop_ubo_load: {
1670 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1671 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1672 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1673 src_reg offset;
1674
1675 /* Now, load the vector from that offset. */
1676 assert(ir->type->is_vector() || ir->type->is_scalar());
1677
1678 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1679 packed_consts.type = result.type;
1680 src_reg surf_index;
1681
1682 if (const_uniform_block) {
1683 /* The block index is a constant, so just emit the binding table entry
1684 * as an immediate.
1685 */
1686 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1687 const_uniform_block->value.u[0]);
1688 } else {
1689 /* The block index is not a constant. Evaluate the index expression
1690 * per-channel and add the base UBO index; the generator will select
1691 * a value from any live channel.
1692 */
1693 surf_index = src_reg(this, glsl_type::uint_type);
1694 emit(ADD(dst_reg(surf_index), op[0],
1695 src_reg(prog_data->base.binding_table.ubo_start)));
1696
1697 /* Assume this may touch any UBO. It would be nice to provide
1698 * a tighter bound, but the array information is already lowered away.
1699 */
1700 brw_mark_surface_used(&prog_data->base,
1701 prog_data->base.binding_table.ubo_start +
1702 shader_prog->NumUniformBlocks - 1);
1703 }
1704
1705 if (const_offset_ir) {
1706 if (brw->gen >= 8) {
1707 /* Store the offset in a GRF so we can send-from-GRF. */
1708 offset = src_reg(this, glsl_type::int_type);
1709 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1710 } else {
1711 /* Immediates are fine on older generations since they'll be moved
1712 * to a (potentially fake) MRF at the generator level.
1713 */
1714 offset = src_reg(const_offset / 16);
1715 }
1716 } else {
1717 offset = src_reg(this, glsl_type::uint_type);
1718 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1719 }
1720
1721 if (brw->gen >= 7) {
1722 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1723 grf_offset.type = offset.type;
1724
1725 emit(MOV(grf_offset, offset));
1726
1727 emit(new(mem_ctx) vec4_instruction(this,
1728 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1729 dst_reg(packed_consts),
1730 surf_index,
1731 src_reg(grf_offset)));
1732 } else {
1733 vec4_instruction *pull =
1734 emit(new(mem_ctx) vec4_instruction(this,
1735 VS_OPCODE_PULL_CONSTANT_LOAD,
1736 dst_reg(packed_consts),
1737 surf_index,
1738 offset));
1739 pull->base_mrf = 14;
1740 pull->mlen = 1;
1741 }
1742
1743 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1744 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1745 const_offset % 16 / 4,
1746 const_offset % 16 / 4,
1747 const_offset % 16 / 4);
1748
1749 /* UBO bools are any nonzero int. We need to convert them to use the
1750 * value of true stored in ctx->Const.UniformBooleanTrue.
1751 */
1752 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1753 emit(CMP(result_dst, packed_consts, src_reg(0u),
1754 BRW_CONDITIONAL_NZ));
1755 if (ctx->Const.UniformBooleanTrue == 1) {
1756 emit(AND(result_dst, result, src_reg(1)));
1757 }
1758 } else {
1759 emit(MOV(result_dst, packed_consts));
1760 }
1761 break;
1762 }
1763
1764 case ir_binop_vector_extract:
1765 unreachable("should have been lowered by vec_index_to_cond_assign");
1766
1767 case ir_triop_fma:
1768 op[0] = fix_3src_operand(op[0]);
1769 op[1] = fix_3src_operand(op[1]);
1770 op[2] = fix_3src_operand(op[2]);
1771 /* Note that the instruction's argument order is reversed from GLSL
1772 * and the IR.
1773 */
1774 emit(MAD(result_dst, op[2], op[1], op[0]));
1775 break;
1776
1777 case ir_triop_lrp:
1778 emit_lrp(result_dst, op[0], op[1], op[2]);
1779 break;
1780
1781 case ir_triop_csel:
1782 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1783 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1784 inst->predicate = BRW_PREDICATE_NORMAL;
1785 break;
1786
1787 case ir_triop_bfi:
1788 op[0] = fix_3src_operand(op[0]);
1789 op[1] = fix_3src_operand(op[1]);
1790 op[2] = fix_3src_operand(op[2]);
1791 emit(BFI2(result_dst, op[0], op[1], op[2]));
1792 break;
1793
1794 case ir_triop_bitfield_extract:
1795 op[0] = fix_3src_operand(op[0]);
1796 op[1] = fix_3src_operand(op[1]);
1797 op[2] = fix_3src_operand(op[2]);
1798 /* Note that the instruction's argument order is reversed from GLSL
1799 * and the IR.
1800 */
1801 emit(BFE(result_dst, op[2], op[1], op[0]));
1802 break;
1803
1804 case ir_triop_vector_insert:
1805 unreachable("should have been lowered by lower_vector_insert");
1806
1807 case ir_quadop_bitfield_insert:
1808 unreachable("not reached: should be handled by "
1809 "bitfield_insert_to_bfm_bfi\n");
1810
1811 case ir_quadop_vector:
1812 unreachable("not reached: should be handled by lower_quadop_vector");
1813
1814 case ir_unop_pack_half_2x16:
1815 emit_pack_half_2x16(result_dst, op[0]);
1816 break;
1817 case ir_unop_unpack_half_2x16:
1818 emit_unpack_half_2x16(result_dst, op[0]);
1819 break;
1820 case ir_unop_pack_snorm_2x16:
1821 case ir_unop_pack_snorm_4x8:
1822 case ir_unop_pack_unorm_2x16:
1823 case ir_unop_pack_unorm_4x8:
1824 case ir_unop_unpack_snorm_2x16:
1825 case ir_unop_unpack_snorm_4x8:
1826 case ir_unop_unpack_unorm_2x16:
1827 case ir_unop_unpack_unorm_4x8:
1828 unreachable("not reached: should be handled by lower_packing_builtins");
1829 case ir_unop_unpack_half_2x16_split_x:
1830 case ir_unop_unpack_half_2x16_split_y:
1831 case ir_binop_pack_half_2x16_split:
1832 case ir_unop_interpolate_at_centroid:
1833 case ir_binop_interpolate_at_sample:
1834 case ir_binop_interpolate_at_offset:
1835 unreachable("not reached: should not occur in vertex shader");
1836 case ir_binop_ldexp:
1837 unreachable("not reached: should be handled by ldexp_to_arith()");
1838 }
1839 }
1840
1841
1842 void
1843 vec4_visitor::visit(ir_swizzle *ir)
1844 {
1845 src_reg src;
1846 int i = 0;
1847 int swizzle[4];
1848
1849 /* Note that this is only swizzles in expressions, not those on the left
1850 * hand side of an assignment, which do write masking. See ir_assignment
1851 * for that.
1852 */
1853
1854 ir->val->accept(this);
1855 src = this->result;
1856 assert(src.file != BAD_FILE);
1857
1858 for (i = 0; i < ir->type->vector_elements; i++) {
1859 switch (i) {
1860 case 0:
1861 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1862 break;
1863 case 1:
1864 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1865 break;
1866 case 2:
1867 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1868 break;
1869 case 3:
1870 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1871 break;
1872 }
1873 }
1874 for (; i < 4; i++) {
1875 /* Replicate the last channel out. */
1876 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1877 }
1878
1879 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1880
1881 this->result = src;
1882 }
1883
1884 void
1885 vec4_visitor::visit(ir_dereference_variable *ir)
1886 {
1887 const struct glsl_type *type = ir->type;
1888 dst_reg *reg = variable_storage(ir->var);
1889
1890 if (!reg) {
1891 fail("Failed to find variable storage for %s\n", ir->var->name);
1892 this->result = src_reg(brw_null_reg());
1893 return;
1894 }
1895
1896 this->result = src_reg(*reg);
1897
1898 /* System values get their swizzle from the dst_reg writemask */
1899 if (ir->var->data.mode == ir_var_system_value)
1900 return;
1901
1902 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1903 this->result.swizzle = swizzle_for_size(type->vector_elements);
1904 }
1905
1906
1907 int
1908 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1909 {
1910 /* Under normal circumstances array elements are stored consecutively, so
1911 * the stride is equal to the size of the array element.
1912 */
1913 return type_size(ir->type);
1914 }
1915
1916
1917 void
1918 vec4_visitor::visit(ir_dereference_array *ir)
1919 {
1920 ir_constant *constant_index;
1921 src_reg src;
1922 int array_stride = compute_array_stride(ir);
1923
1924 constant_index = ir->array_index->constant_expression_value();
1925
1926 ir->array->accept(this);
1927 src = this->result;
1928
1929 if (constant_index) {
1930 src.reg_offset += constant_index->value.i[0] * array_stride;
1931 } else {
1932 /* Variable index array dereference. It eats the "vec4" of the
1933 * base of the array and an index that offsets the Mesa register
1934 * index.
1935 */
1936 ir->array_index->accept(this);
1937
1938 src_reg index_reg;
1939
1940 if (array_stride == 1) {
1941 index_reg = this->result;
1942 } else {
1943 index_reg = src_reg(this, glsl_type::int_type);
1944
1945 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1946 }
1947
1948 if (src.reladdr) {
1949 src_reg temp = src_reg(this, glsl_type::int_type);
1950
1951 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1952
1953 index_reg = temp;
1954 }
1955
1956 src.reladdr = ralloc(mem_ctx, src_reg);
1957 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1958 }
1959
1960 /* If the type is smaller than a vec4, replicate the last channel out. */
1961 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1962 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1963 else
1964 src.swizzle = BRW_SWIZZLE_NOOP;
1965 src.type = brw_type_for_base_type(ir->type);
1966
1967 this->result = src;
1968 }
1969
1970 void
1971 vec4_visitor::visit(ir_dereference_record *ir)
1972 {
1973 unsigned int i;
1974 const glsl_type *struct_type = ir->record->type;
1975 int offset = 0;
1976
1977 ir->record->accept(this);
1978
1979 for (i = 0; i < struct_type->length; i++) {
1980 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1981 break;
1982 offset += type_size(struct_type->fields.structure[i].type);
1983 }
1984
1985 /* If the type is smaller than a vec4, replicate the last channel out. */
1986 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1987 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1988 else
1989 this->result.swizzle = BRW_SWIZZLE_NOOP;
1990 this->result.type = brw_type_for_base_type(ir->type);
1991
1992 this->result.reg_offset += offset;
1993 }
1994
1995 /**
1996 * We want to be careful in assignment setup to hit the actual storage
1997 * instead of potentially using a temporary like we might with the
1998 * ir_dereference handler.
1999 */
2000 static dst_reg
2001 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2002 {
2003 /* The LHS must be a dereference. If the LHS is a variable indexed array
2004 * access of a vector, it must be separated into a series of conditional moves
2005 * before reaching this point (see ir_vec_index_to_cond_assign).
2006 */
2007 assert(ir->as_dereference());
2008 ir_dereference_array *deref_array = ir->as_dereference_array();
2009 if (deref_array) {
2010 assert(!deref_array->array->type->is_vector());
2011 }
2012
2013 /* Use the rvalue deref handler for the most part. We'll ignore
2014 * swizzles in it and write swizzles using writemask, though.
2015 */
2016 ir->accept(v);
2017 return dst_reg(v->result);
2018 }
2019
2020 void
2021 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2022 const struct glsl_type *type,
2023 enum brw_predicate predicate)
2024 {
2025 if (type->base_type == GLSL_TYPE_STRUCT) {
2026 for (unsigned int i = 0; i < type->length; i++) {
2027 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2028 }
2029 return;
2030 }
2031
2032 if (type->is_array()) {
2033 for (unsigned int i = 0; i < type->length; i++) {
2034 emit_block_move(dst, src, type->fields.array, predicate);
2035 }
2036 return;
2037 }
2038
2039 if (type->is_matrix()) {
2040 const struct glsl_type *vec_type;
2041
2042 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2043 type->vector_elements, 1);
2044
2045 for (int i = 0; i < type->matrix_columns; i++) {
2046 emit_block_move(dst, src, vec_type, predicate);
2047 }
2048 return;
2049 }
2050
2051 assert(type->is_scalar() || type->is_vector());
2052
2053 dst->type = brw_type_for_base_type(type);
2054 src->type = dst->type;
2055
2056 dst->writemask = (1 << type->vector_elements) - 1;
2057
2058 src->swizzle = swizzle_for_size(type->vector_elements);
2059
2060 vec4_instruction *inst = emit(MOV(*dst, *src));
2061 inst->predicate = predicate;
2062
2063 dst->reg_offset++;
2064 src->reg_offset++;
2065 }
2066
2067
2068 /* If the RHS processing resulted in an instruction generating a
2069 * temporary value, and it would be easy to rewrite the instruction to
2070 * generate its result right into the LHS instead, do so. This ends
2071 * up reliably removing instructions where it can be tricky to do so
2072 * later without real UD chain information.
2073 */
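/* Added illustration: this is what lets a sequence like
 * "ADD tmp, a, b; MOV dst, tmp" collapse into "ADD dst, a, b" without
 * needing later copy propagation.
 */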
2074 bool
2075 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2076 dst_reg dst,
2077 src_reg src,
2078 vec4_instruction *pre_rhs_inst,
2079 vec4_instruction *last_rhs_inst)
2080 {
2081 /* This could be supported, but it would take more smarts. */
2082 if (ir->condition)
2083 return false;
2084
2085 if (pre_rhs_inst == last_rhs_inst)
2086 return false; /* No instructions generated to work with. */
2087
2088 /* Make sure the last instruction generated our source reg. */
2089 if (src.file != GRF ||
2090 src.file != last_rhs_inst->dst.file ||
2091 src.reg != last_rhs_inst->dst.reg ||
2092 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2093 src.reladdr ||
2094 src.abs ||
2095 src.negate ||
2096 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2097 return false;
2098
2099 /* Check that that last instruction fully initialized the channels
2100 * we want to use, in the order we want to use them. We could
2101 * potentially reswizzle the operands of many instructions so that
2102 * we could handle out of order channels, but don't yet.
2103 */
2104
2105 for (unsigned i = 0; i < 4; i++) {
2106 if (dst.writemask & (1 << i)) {
2107 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2108 return false;
2109
2110 if (BRW_GET_SWZ(src.swizzle, i) != i)
2111 return false;
2112 }
2113 }
2114
2115 /* Success! Rewrite the instruction. */
2116 last_rhs_inst->dst.file = dst.file;
2117 last_rhs_inst->dst.reg = dst.reg;
2118 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2119 last_rhs_inst->dst.reladdr = dst.reladdr;
2120 last_rhs_inst->dst.writemask &= dst.writemask;
2121
2122 return true;
2123 }
2124
2125 void
2126 vec4_visitor::visit(ir_assignment *ir)
2127 {
2128 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2129 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2130
2131 if (!ir->lhs->type->is_scalar() &&
2132 !ir->lhs->type->is_vector()) {
2133 ir->rhs->accept(this);
2134 src_reg src = this->result;
2135
2136 if (ir->condition) {
2137 emit_bool_to_cond_code(ir->condition, &predicate);
2138 }
2139
2140 /* emit_block_move doesn't account for swizzles in the source register.
2141 * This should be ok, since the source register is a structure or an
2142 * array, and those can't be swizzled. But double-check to be sure.
2143 */
2144 assert(src.swizzle ==
2145 (ir->rhs->type->is_matrix()
2146 ? swizzle_for_size(ir->rhs->type->vector_elements)
2147 : BRW_SWIZZLE_NOOP));
2148
2149 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2150 return;
2151 }
2152
2153 /* Now we're down to just a scalar/vector with writemasks. */
2154 int i;
2155
2156 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2157 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2158
2159 ir->rhs->accept(this);
2160
2161 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2162
2163 src_reg src = this->result;
2164
2165 int swizzles[4];
2166 int first_enabled_chan = 0;
2167 int src_chan = 0;
2168
2169 assert(ir->lhs->type->is_vector() ||
2170 ir->lhs->type->is_scalar());
2171 dst.writemask = ir->write_mask;
2172
2173 for (int i = 0; i < 4; i++) {
2174 if (dst.writemask & (1 << i)) {
2175 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2176 break;
2177 }
2178 }
2179
2180 /* Swizzle a small RHS vector into the channels being written.
2181 *
2182 * glsl ir treats write_mask as dictating how many channels are
2183 * present on the RHS while in our instructions we need to make
2184 * those channels appear in the slots of the vec4 they're written to.
2185 */
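/* Added worked example: for "v.zw = u" with u a vec2 arriving with the
 * usual XYYY small-vector swizzle, the loop below builds {Y, Y, X, Y}, so
 * the RHS's X and Y land in the destination's Z and W slots and the
 * unwritten channels just repeat an initialized component.
 */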
2186 for (int i = 0; i < 4; i++) {
2187 if (dst.writemask & (1 << i))
2188 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2189 else
2190 swizzles[i] = first_enabled_chan;
2191 }
2192 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2193 swizzles[2], swizzles[3]);
2194
2195 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2196 return;
2197 }
2198
2199 if (ir->condition) {
2200 emit_bool_to_cond_code(ir->condition, &predicate);
2201 }
2202
2203 for (i = 0; i < type_size(ir->lhs->type); i++) {
2204 vec4_instruction *inst = emit(MOV(dst, src));
2205 inst->predicate = predicate;
2206
2207 dst.reg_offset++;
2208 src.reg_offset++;
2209 }
2210 }
2211
2212 void
2213 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2214 {
2215 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2216 foreach_in_list(ir_constant, field_value, &ir->components) {
2217 emit_constant_values(dst, field_value);
2218 }
2219 return;
2220 }
2221
2222 if (ir->type->is_array()) {
2223 for (unsigned int i = 0; i < ir->type->length; i++) {
2224 emit_constant_values(dst, ir->array_elements[i]);
2225 }
2226 return;
2227 }
2228
2229 if (ir->type->is_matrix()) {
2230 for (int i = 0; i < ir->type->matrix_columns; i++) {
2231 float *vec = &ir->value.f[i * ir->type->vector_elements];
2232
2233 for (int j = 0; j < ir->type->vector_elements; j++) {
2234 dst->writemask = 1 << j;
2235 dst->type = BRW_REGISTER_TYPE_F;
2236
2237 emit(MOV(*dst, src_reg(vec[j])));
2238 }
2239 dst->reg_offset++;
2240 }
2241 return;
2242 }
2243
2244 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2245
2246 for (int i = 0; i < ir->type->vector_elements; i++) {
2247 if (!(remaining_writemask & (1 << i)))
2248 continue;
2249
2250 dst->writemask = 1 << i;
2251 dst->type = brw_type_for_base_type(ir->type);
2252
2253 /* Find other components that match the one we're about to
2254 * write. Emits fewer instructions for things like vec4(0.5,
2255 * 1.5, 1.5, 1.5).
2256 */
2257 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2258 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2259 if (ir->value.b[i] == ir->value.b[j])
2260 dst->writemask |= (1 << j);
2261 } else {
2262 /* u, i, and f storage all line up, so no need for a
2263 * switch case for comparing each type.
2264 */
2265 if (ir->value.u[i] == ir->value.u[j])
2266 dst->writemask |= (1 << j);
2267 }
2268 }
2269
2270 switch (ir->type->base_type) {
2271 case GLSL_TYPE_FLOAT:
2272 emit(MOV(*dst, src_reg(ir->value.f[i])));
2273 break;
2274 case GLSL_TYPE_INT:
2275 emit(MOV(*dst, src_reg(ir->value.i[i])));
2276 break;
2277 case GLSL_TYPE_UINT:
2278 emit(MOV(*dst, src_reg(ir->value.u[i])));
2279 break;
2280 case GLSL_TYPE_BOOL:
2281 emit(MOV(*dst,
2282 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2283 : 0)));
2284 break;
2285 default:
2286 unreachable("Non-float/uint/int/bool constant");
2287 }
2288
2289 remaining_writemask &= ~dst->writemask;
2290 }
2291 dst->reg_offset++;
2292 }
2293
2294 void
2295 vec4_visitor::visit(ir_constant *ir)
2296 {
2297 dst_reg dst = dst_reg(this, ir->type);
2298 this->result = src_reg(dst);
2299
2300 emit_constant_values(&dst, ir);
2301 }
2302
2303 void
2304 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2305 {
2306 ir_dereference *deref = static_cast<ir_dereference *>(
2307 ir->actual_parameters.get_head());
2308 ir_variable *location = deref->variable_referenced();
2309 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2310 location->data.binding);
2311
2312 /* Calculate the surface offset */
2313 src_reg offset(this, glsl_type::uint_type);
2314 ir_dereference_array *deref_array = deref->as_dereference_array();
2315 if (deref_array) {
2316 deref_array->array_index->accept(this);
2317
2318 src_reg tmp(this, glsl_type::uint_type);
2319 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2320 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2321 } else {
2322 offset = location->data.atomic.offset;
2323 }
2324
2325 /* Emit the appropriate machine instruction */
2326 const char *callee = ir->callee->function_name();
2327 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2328
2329 if (!strcmp("__intrinsic_atomic_read", callee)) {
2330 emit_untyped_surface_read(surf_index, dst, offset);
2331
2332 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2333 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2334 src_reg(), src_reg());
2335
2336 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2337 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2338 src_reg(), src_reg());
2339 }
2340 }
2341
2342 void
2343 vec4_visitor::visit(ir_call *ir)
2344 {
2345 const char *callee = ir->callee->function_name();
2346
2347 if (!strcmp("__intrinsic_atomic_read", callee) ||
2348 !strcmp("__intrinsic_atomic_increment", callee) ||
2349 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2350 visit_atomic_counter_intrinsic(ir);
2351 } else {
2352 unreachable("Unsupported intrinsic.");
2353 }
2354 }
2355
2356 src_reg
2357 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2358 {
2359 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2360 inst->base_mrf = 2;
2361 inst->mlen = 1;
2362 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2363 inst->dst.writemask = WRITEMASK_XYZW;
2364
2365 inst->src[1] = sampler;
2366
2367 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2368 int param_base = inst->base_mrf;
2369 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2370 int zero_mask = 0xf & ~coord_mask;
2371
2372 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2373 coordinate));
2374
2375 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2376 src_reg(0)));
2377
2378 emit(inst);
2379 return src_reg(inst->dst);
2380 }
2381
2382 static bool
2383 is_high_sampler(struct brw_context *brw, src_reg sampler)
2384 {
2385 if (brw->gen < 8 && !brw->is_haswell)
2386 return false;
2387
2388 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2389 }
2390
2391 void
2392 vec4_visitor::visit(ir_texture *ir)
2393 {
2394 uint32_t sampler =
2395 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2396
2397 ir_rvalue *nonconst_sampler_index =
2398 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2399
2400 /* Handle non-constant sampler array indexing */
2401 src_reg sampler_reg;
2402 if (nonconst_sampler_index) {
2403 /* The highest sampler which may be used by this operation is
2404 * the last element of the array. Mark it here, because the generator
2405 * doesn't have enough information to determine the bound.
2406 */
2407 uint32_t array_size = ir->sampler->as_dereference_array()
2408 ->array->type->array_size();
2409
2410 uint32_t max_used = sampler + array_size - 1;
2411 if (ir->op == ir_tg4 && brw->gen < 8) {
2412 max_used += prog_data->base.binding_table.gather_texture_start;
2413 } else {
2414 max_used += prog_data->base.binding_table.texture_start;
2415 }
2416
2417 brw_mark_surface_used(&prog_data->base, max_used);
2418
2419 /* Emit code to evaluate the actual indexing expression */
2420 nonconst_sampler_index->accept(this);
2421 dst_reg temp(this, glsl_type::uint_type);
2422 emit(ADD(temp, this->result, src_reg(sampler)))
2423 ->force_writemask_all = true;
2424 sampler_reg = src_reg(temp);
2425 } else {
2426 /* Single sampler, or constant array index; the indexing expression
2427 * is just an immediate.
2428 */
2429 sampler_reg = src_reg(sampler);
2430 }
2431
2432 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2433 * emitting anything other than setting up the constant result.
2434 */
2435 if (ir->op == ir_tg4) {
2436 ir_constant *chan = ir->lod_info.component->as_constant();
2437 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2438 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2439 dst_reg result(this, ir->type);
2440 this->result = src_reg(result);
2441 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2442 return;
2443 }
2444 }
2445
2446 /* Should be lowered by do_lower_texture_projection */
2447 assert(!ir->projector);
2448
2449 /* Should be lowered */
2450 assert(!ir->offset || !ir->offset->type->is_array());
2451
2452 /* Generate code to compute all the subexpression trees. This has to be
2453 * done before loading any values into MRFs for the sampler message since
2454 * generating these values may involve SEND messages that need the MRFs.
2455 */
2456 src_reg coordinate;
2457 if (ir->coordinate) {
2458 ir->coordinate->accept(this);
2459 coordinate = this->result;
2460 }
2461
2462 src_reg shadow_comparitor;
2463 if (ir->shadow_comparitor) {
2464 ir->shadow_comparitor->accept(this);
2465 shadow_comparitor = this->result;
2466 }
2467
2468 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2469 src_reg offset_value;
2470 if (has_nonconstant_offset) {
2471 ir->offset->accept(this);
2472 offset_value = src_reg(this->result);
2473 }
2474
2475 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2476 src_reg lod, dPdx, dPdy, sample_index, mcs;
2477 switch (ir->op) {
2478 case ir_tex:
2479 lod = src_reg(0.0f);
2480 lod_type = glsl_type::float_type;
2481 break;
2482 case ir_txf:
2483 case ir_txl:
2484 case ir_txs:
2485 ir->lod_info.lod->accept(this);
2486 lod = this->result;
2487 lod_type = ir->lod_info.lod->type;
2488 break;
2489 case ir_query_levels:
2490 lod = src_reg(0);
2491 lod_type = glsl_type::int_type;
2492 break;
2493 case ir_txf_ms:
2494 ir->lod_info.sample_index->accept(this);
2495 sample_index = this->result;
2496 sample_index_type = ir->lod_info.sample_index->type;
2497
2498 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2499 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2500 else
2501 mcs = src_reg(0u);
2502 break;
2503 case ir_txd:
2504 ir->lod_info.grad.dPdx->accept(this);
2505 dPdx = this->result;
2506
2507 ir->lod_info.grad.dPdy->accept(this);
2508 dPdy = this->result;
2509
2510 lod_type = ir->lod_info.grad.dPdx->type;
2511 break;
2512 case ir_txb:
2513 case ir_lod:
2514 case ir_tg4:
2515 break;
2516 }
2517
2518 enum opcode opcode;
2519 switch (ir->op) {
2520 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2521 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2522 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2523 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2524 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2525 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2526 case ir_tg4: opcode = has_nonconstant_offset
2527 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2528 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2529 case ir_txb:
2530 unreachable("TXB is not valid for vertex shaders.");
2531 case ir_lod:
2532 unreachable("LOD is not valid for vertex shaders.");
2533 default:
2534 unreachable("Unrecognized tex op");
2535 }
2536
2537 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2538
2539 if (ir->offset != NULL && ir->op != ir_txf)
2540 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2541
2542 /* Stuff the channel select bits in the top of the texture offset */
2543 if (ir->op == ir_tg4)
2544 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2545
2546 /* The message header is necessary for:
2547 * - Gen4 (always)
2548 * - Texel offsets
2549 * - Gather channel selection
2550 * - Sampler indices too large to fit in a 4-bit value.
2551 */
2552 inst->header_present =
2553 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2554 is_high_sampler(brw, sampler_reg);
2555 inst->base_mrf = 2;
2556 inst->mlen = inst->header_present + 1; /* always at least one */
2557 inst->dst = dst_reg(this, ir->type);
2558 inst->dst.writemask = WRITEMASK_XYZW;
2559 inst->shadow_compare = ir->shadow_comparitor != NULL;
2560
2561 inst->src[1] = sampler_reg;
2562
2563 /* MRF for the first parameter */
2564 int param_base = inst->base_mrf + inst->header_present;
2565
2566 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2567 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2568 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2569 } else {
2570 /* Load the coordinate */
2571 /* FINISHME: gl_clamp_mask and saturate */
2572 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2573 int zero_mask = 0xf & ~coord_mask;
2574
2575 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2576 coordinate));
2577
2578 if (zero_mask != 0) {
2579 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2580 src_reg(0)));
2581 }
2582 /* Load the shadow comparitor */
2583 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2584 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2585 WRITEMASK_X),
2586 shadow_comparitor));
2587 inst->mlen++;
2588 }
2589
2590 /* Load the LOD info */
2591 if (ir->op == ir_tex || ir->op == ir_txl) {
2592 int mrf, writemask;
2593 if (brw->gen >= 5) {
2594 mrf = param_base + 1;
2595 if (ir->shadow_comparitor) {
2596 writemask = WRITEMASK_Y;
2597 /* mlen already incremented */
2598 } else {
2599 writemask = WRITEMASK_X;
2600 inst->mlen++;
2601 }
2602 } else /* brw->gen == 4 */ {
2603 mrf = param_base;
2604 writemask = WRITEMASK_W;
2605 }
2606 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2607 } else if (ir->op == ir_txf) {
2608 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2609 } else if (ir->op == ir_txf_ms) {
2610 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2611 sample_index));
2612 if (brw->gen >= 7) {
2613 /* MCS data is in the first channel of `mcs`, but we need to get it into
2614 * the .y channel of the second vec4 of params, so replicate .x across
2615 * the whole vec4 and then mask off everything except .y
2616 */
2617 mcs.swizzle = BRW_SWIZZLE_XXXX;
2618 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2619 mcs));
2620 }
2621 inst->mlen++;
2622 } else if (ir->op == ir_txd) {
2623 const glsl_type *type = lod_type;
2624
2625 if (brw->gen >= 5) {
2626 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2627 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2628 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2629 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2630 inst->mlen++;
2631
2632 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2633 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2634 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2635 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2636 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2637 inst->mlen++;
2638
2639 if (ir->shadow_comparitor) {
2640 emit(MOV(dst_reg(MRF, param_base + 2,
2641 ir->shadow_comparitor->type, WRITEMASK_Z),
2642 shadow_comparitor));
2643 }
2644 }
2645 } else /* brw->gen == 4 */ {
2646 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2647 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2648 inst->mlen += 2;
2649 }
2650 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2651 if (ir->shadow_comparitor) {
2652 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2653 shadow_comparitor));
2654 }
2655
2656 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2657 offset_value));
2658 inst->mlen++;
2659 }
2660 }
2661
2662 emit(inst);
2663
2664 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2665 * spec requires layers.
2666 */
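/* Added example: a cube map array with 4 layers makes the hardware report
 * 24 (6 faces * 4 layers) in .z; the INT_QUOTIENT by 6 below restores 4.
 */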
2667 if (ir->op == ir_txs) {
2668 glsl_type const *type = ir->sampler->type;
2669 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2670 type->sampler_array) {
2671 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2672 writemask(inst->dst, WRITEMASK_Z),
2673 src_reg(inst->dst), src_reg(6));
2674 }
2675 }
2676
2677 if (brw->gen == 6 && ir->op == ir_tg4) {
2678 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2679 }
2680
2681 swizzle_result(ir, src_reg(inst->dst), sampler);
2682 }
2683
2684 /**
2685 * Apply workarounds for Gen6 gather with UINT/SINT
2686 */
2687 void
2688 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2689 {
2690 if (!wa)
2691 return;
2692
2693 int width = (wa & WA_8BIT) ? 8 : 16;
2694 dst_reg dst_f = dst;
2695 dst_f.type = BRW_REGISTER_TYPE_F;
2696
2697 /* Convert from UNORM to UINT */
2698 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2699 emit(MOV(dst, src_reg(dst_f)));
2700
2701 if (wa & WA_SIGN) {
2702 /* Reinterpret the UINT value as a signed INT value by
2703 * shifting the sign bit into place, then shifting back
2704 * preserving sign.
2705 */
2706 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2707 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
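/* Added example: an 8-bit SINT texel of 0xff comes back as UNORM 1.0, is
 * scaled to 255 above, and the SHL/ASR by 24 re-sign-extends it to -1.
 */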
2708 }
2709 }
2710
2711 /**
2712 * Set up the gather channel based on the swizzle, for gather4.
2713 */
2714 uint32_t
2715 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2716 {
2717 ir_constant *chan = ir->lod_info.component->as_constant();
2718 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2719 switch (swiz) {
2720 case SWIZZLE_X: return 0;
2721 case SWIZZLE_Y:
2722 /* gather4 sampler is broken for green channel on RG32F --
2723 * we must ask for blue instead.
2724 */
2725 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2726 return 2;
2727 return 1;
2728 case SWIZZLE_Z: return 2;
2729 case SWIZZLE_W: return 3;
2730 default:
2731 unreachable("Not reached"); /* zero, one swizzles handled already */
2732 }
2733 }
2734
2735 void
2736 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2737 {
2738 int s = key->tex.swizzles[sampler];
2739
2740 this->result = src_reg(this, ir->type);
2741 dst_reg swizzled_result(this->result);
2742
2743 if (ir->op == ir_query_levels) {
2744 /* # levels is in .w */
2745 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2746 emit(MOV(swizzled_result, orig_val));
2747 return;
2748 }
2749
2750 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2751 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2752 emit(MOV(swizzled_result, orig_val));
2753 return;
2754 }
2755
2756
2757 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2758 int swizzle[4] = {0};
2759
2760 for (int i = 0; i < 4; i++) {
2761 switch (GET_SWZ(s, i)) {
2762 case SWIZZLE_ZERO:
2763 zero_mask |= (1 << i);
2764 break;
2765 case SWIZZLE_ONE:
2766 one_mask |= (1 << i);
2767 break;
2768 default:
2769 copy_mask |= (1 << i);
2770 swizzle[i] = GET_SWZ(s, i);
2771 break;
2772 }
2773 }
2774
2775 if (copy_mask) {
2776 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2777 swizzled_result.writemask = copy_mask;
2778 emit(MOV(swizzled_result, orig_val));
2779 }
2780
2781 if (zero_mask) {
2782 swizzled_result.writemask = zero_mask;
2783 emit(MOV(swizzled_result, src_reg(0.0f)));
2784 }
2785
2786 if (one_mask) {
2787 swizzled_result.writemask = one_mask;
2788 emit(MOV(swizzled_result, src_reg(1.0f)));
2789 }
2790 }
2791
2792 void
2793 vec4_visitor::visit(ir_return *)
2794 {
2795 unreachable("not reached");
2796 }
2797
2798 void
2799 vec4_visitor::visit(ir_discard *)
2800 {
2801 unreachable("not reached");
2802 }
2803
2804 void
2805 vec4_visitor::visit(ir_if *ir)
2806 {
2807 /* Don't point the annotation at the if statement, because then it plus
2808 * the then and else blocks get printed.
2809 */
2810 this->base_ir = ir->condition;
2811
2812 if (brw->gen == 6) {
2813 emit_if_gen6(ir);
2814 } else {
2815 enum brw_predicate predicate;
2816 emit_bool_to_cond_code(ir->condition, &predicate);
2817 emit(IF(predicate));
2818 }
2819
2820 visit_instructions(&ir->then_instructions);
2821
2822 if (!ir->else_instructions.is_empty()) {
2823 this->base_ir = ir->condition;
2824 emit(BRW_OPCODE_ELSE);
2825
2826 visit_instructions(&ir->else_instructions);
2827 }
2828
2829 this->base_ir = ir->condition;
2830 emit(BRW_OPCODE_ENDIF);
2831 }
2832
2833 void
2834 vec4_visitor::visit(ir_emit_vertex *)
2835 {
2836 unreachable("not reached");
2837 }
2838
2839 void
2840 vec4_visitor::visit(ir_end_primitive *)
2841 {
2842 unreachable("not reached");
2843 }
2844
2845 void
2846 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2847 dst_reg dst, src_reg offset,
2848 src_reg src0, src_reg src1)
2849 {
2850 unsigned mlen = 0;
2851
2852 /* Set the atomic operation offset. */
2853 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2854 mlen++;
2855
2856 /* Set the atomic operation arguments. */
2857 if (src0.file != BAD_FILE) {
2858 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2859 mlen++;
2860 }
2861
2862 if (src1.file != BAD_FILE) {
2863 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2864 mlen++;
2865 }
2866
2867 /* Emit the instruction. Note that this maps to the normal SIMD8
2868 * untyped atomic message on Ivy Bridge, but that's OK because
2869 * unused channels will be masked out.
2870 */
2871 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2872 src_reg(atomic_op), src_reg(surf_index));
2873 inst->base_mrf = 0;
2874 inst->mlen = mlen;
2875 }
2876
2877 void
2878 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2879 src_reg offset)
2880 {
2881 /* Set the surface read offset. */
2882 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2883
2884 /* Emit the instruction. Note that this maps to the normal SIMD8
2885 * untyped surface read message, but that's OK because unused
2886 * channels will be masked out.
2887 */
2888 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2889 dst, src_reg(surf_index));
2890 inst->base_mrf = 0;
2891 inst->mlen = 1;
2892 }
2893
2894 void
2895 vec4_visitor::emit_ndc_computation()
2896 {
2897 /* Get the position */
2898 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2899
2900 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2901 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2902 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2903
2904 current_annotation = "NDC";
2905 dst_reg ndc_w = ndc;
2906 ndc_w.writemask = WRITEMASK_W;
2907 src_reg pos_w = pos;
2908 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2909 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2910
2911 dst_reg ndc_xyz = ndc;
2912 ndc_xyz.writemask = WRITEMASK_XYZ;
2913
2914 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2915 }
2916
2917 void
2918 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2919 {
2920 if (brw->gen < 6 &&
2921 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2922 key->userclip_active || brw->has_negative_rhw_bug)) {
2923 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2924 dst_reg header1_w = header1;
2925 header1_w.writemask = WRITEMASK_W;
2926
2927 emit(MOV(header1, 0u));
2928
2929 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2930 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2931
2932 current_annotation = "Point size";
2933 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2934 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2935 }
2936
2937 if (key->userclip_active) {
2938 current_annotation = "Clipping flags";
2939 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2940 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2941
2942 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2943 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2944 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2945
2946 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2947 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2948 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2949 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2950 }
2951
2952 /* i965 clipping workaround:
2953 * 1) Test for -ve rhw
2954 * 2) If set,
2955 * set ndc = (0,0,0,0)
2956 * set ucp[6] = 1
2957 *
2958 * Later, clipping will detect ucp[6] and ensure the primitive is
2959 * clipped against all fixed planes.
2960 */
2961 if (brw->has_negative_rhw_bug) {
2962 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2963 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2964 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2965 vec4_instruction *inst;
2966 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2967 inst->predicate = BRW_PREDICATE_NORMAL;
2968 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2969 inst->predicate = BRW_PREDICATE_NORMAL;
2970 }
2971
2972 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2973 } else if (brw->gen < 6) {
2974 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2975 } else {
2976 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2977 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2978 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2979 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2980 }
2981 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2982 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2983 src_reg(output_reg[VARYING_SLOT_LAYER])));
2984 }
2985 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2986 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2987 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2988 }
2989 }
2990 }
2991
2992 void
2993 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2994 {
2995 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2996 *
2997 * "If a linked set of shaders forming the vertex stage contains no
2998 * static write to gl_ClipVertex or gl_ClipDistance, but the
2999 * application has requested clipping against user clip planes through
3000 * the API, then the coordinate written to gl_Position is used for
3001 * comparison against the user clip planes."
3002 *
3003 * This function is only called if the shader didn't write to
3004 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3005 * if the user wrote to it; otherwise we use gl_Position.
3006 */
3007 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3008 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3009 clip_vertex = VARYING_SLOT_POS;
3010 }
3011
3012 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3013 ++i) {
3014 reg.writemask = 1 << i;
3015 emit(DP4(reg,
3016 src_reg(output_reg[clip_vertex]),
3017 src_reg(this->userplane[i + offset])));
3018 }
3019 }
3020
3021 void
3022 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3023 {
3024 assert (varying < VARYING_SLOT_MAX);
3025 reg.type = output_reg[varying].type;
3026 current_annotation = output_reg_annotation[varying];
3027 /* Copy the register, saturating if necessary */
3028 vec4_instruction *inst = emit(MOV(reg,
3029 src_reg(output_reg[varying])));
3030 if ((varying == VARYING_SLOT_COL0 ||
3031 varying == VARYING_SLOT_COL1 ||
3032 varying == VARYING_SLOT_BFC0 ||
3033 varying == VARYING_SLOT_BFC1) &&
3034 key->clamp_vertex_color) {
3035 inst->saturate = true;
3036 }
3037 }
3038
3039 void
3040 vec4_visitor::emit_urb_slot(int mrf, int varying)
3041 {
3042 struct brw_reg hw_reg = brw_message_reg(mrf);
3043 dst_reg reg = dst_reg(MRF, mrf);
3044 reg.type = BRW_REGISTER_TYPE_F;
3045
3046 switch (varying) {
3047 case VARYING_SLOT_PSIZ:
3048 /* PSIZ is always in slot 0, and is coupled with other flags. */
3049 current_annotation = "indices, point width, clip flags";
3050 emit_psiz_and_flags(hw_reg);
3051 break;
3052 case BRW_VARYING_SLOT_NDC:
3053 current_annotation = "NDC";
3054 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3055 break;
3056 case VARYING_SLOT_POS:
3057 current_annotation = "gl_Position";
3058 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3059 break;
3060 case VARYING_SLOT_EDGE:
3061 /* This is present when doing unfilled polygons. We're supposed to copy
3062 * the edge flag from the user-provided vertex array
3063 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3064 * of that attribute (starts as 1.0f). This is then used in clipping to
3065 * determine which edges should be drawn as wireframe.
3066 */
3067 current_annotation = "edge flag";
3068 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3069 glsl_type::float_type, WRITEMASK_XYZW))));
3070 break;
3071 case BRW_VARYING_SLOT_PAD:
3072 /* No need to write to this slot */
3073 break;
3074 default:
3075 emit_generic_urb_slot(reg, varying);
3076 break;
3077 }
3078 }
3079
3080 static int
3081 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3082 {
3083 if (brw->gen >= 6) {
3084 /* URB data written (does not include the message header reg) must
3085 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3086 * section 5.4.3.2.2: URB_INTERLEAVED.
3087 *
3088 * URB entries are allocated on a multiple of 1024 bits, so an
3089 * extra 128 bits written here to make the end align to 256 is
3090 * no problem.
3091 */
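/* Added note: mlen still includes the message header register here, so the
 * data payload is mlen - 1 regs; forcing mlen odd makes that payload even.
 * E.g. a header plus 3 slot registers (mlen 4) is padded to mlen 5.
 */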
3092 if ((mlen % 2) != 1)
3093 mlen++;
3094 }
3095
3096 return mlen;
3097 }
3098
3099
3100 /**
3101 * Generates the VUE payload plus the necessary URB write instructions to
3102 * output it.
3103 *
3104 * The VUE layout is documented in Volume 2a.
3105 */
3106 void
3107 vec4_visitor::emit_vertex()
3108 {
3109 /* MRF 0 is reserved for the debugger, so start with message header
3110 * in MRF 1.
3111 */
3112 int base_mrf = 1;
3113 int mrf = base_mrf;
3114 /* In the process of generating our URB write message contents, we
3115 * may need to unspill a register or load from an array. Those
3116 * reads would use MRFs 14-15.
3117 */
3118 int max_usable_mrf = 13;
3119
3120 /* The following assertion verifies that max_usable_mrf causes an
3121 * even-numbered amount of URB write data, which will meet gen6's
3122 * requirements for length alignment.
3123 */
3124 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3125
3126 /* First mrf is the g0-based message header containing URB handles and
3127 * such.
3128 */
3129 emit_urb_write_header(mrf++);
3130
3131 if (brw->gen < 6) {
3132 emit_ndc_computation();
3133 }
3134
3135 /* Lower legacy ff and ClipVertex clipping to clip distances */
3136 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3137 current_annotation = "user clip distances";
3138
3139 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3140 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3141
3142 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3143 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3144 }
3145
3146 /* We may need to split this up into several URB writes, so do them in a
3147 * loop.
3148 */
3149 int slot = 0;
3150 bool complete = false;
3151 do {
3152 /* URB offset is in URB row increments, and each of our MRFs is half of
3153 * one of those, since we're doing interleaved writes.
3154 */
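/* Added example: if the first write filled MRFs 2..13 with 12 slots, the
 * next pass resumes at slot 12, i.e. URB row offset 6, since two slots
 * share a row when interleaved.
 */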
3155 int offset = slot / 2;
3156
3157 mrf = base_mrf + 1;
3158 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3159 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3160
3161 /* If this was max_usable_mrf, we can't fit anything more into this
3162 * URB WRITE.
3163 */
3164 if (mrf > max_usable_mrf) {
3165 slot++;
3166 break;
3167 }
3168 }
3169
3170 complete = slot >= prog_data->vue_map.num_slots;
3171 current_annotation = "URB write";
3172 vec4_instruction *inst = emit_urb_write_opcode(complete);
3173 inst->base_mrf = base_mrf;
3174 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3175 inst->offset += offset;
3176 } while(!complete);
3177 }
3178
3179
3180 src_reg
3181 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3182 src_reg *reladdr, int reg_offset)
3183 {
3184 /* Because we store the values to scratch interleaved like our
3185 * vertex data, we need to scale the vec4 index by 2.
3186 */
3187 int message_header_scale = 2;
3188
3189 /* Pre-gen6, the message header uses byte offsets instead of vec4
3190 * (16-byte) offset units.
3191 */
3192 if (brw->gen < 6)
3193 message_header_scale *= 16;
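/* Added example: reg_offset 3 therefore becomes 6 (pairs of vec4s) on Gen6+,
 * or 96 bytes on earlier generations.
 */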
3194
3195 if (reladdr) {
3196 src_reg index = src_reg(this, glsl_type::int_type);
3197
3198 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3199 emit_before(inst, MUL(dst_reg(index),
3200 index, src_reg(message_header_scale)));
3201
3202 return index;
3203 } else {
3204 return src_reg(reg_offset * message_header_scale);
3205 }
3206 }
3207
3208 src_reg
3209 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3210 src_reg *reladdr, int reg_offset)
3211 {
3212 if (reladdr) {
3213 src_reg index = src_reg(this, glsl_type::int_type);
3214
3215 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3216
3217 /* Pre-gen6, the message header uses byte offsets instead of vec4
3218 * (16-byte) offset units.
3219 */
3220 if (brw->gen < 6) {
3221 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3222 }
3223
3224 return index;
3225 } else if (brw->gen >= 8) {
3226 /* Store the offset in a GRF so we can send-from-GRF. */
3227 src_reg offset = src_reg(this, glsl_type::int_type);
3228 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3229 return offset;
3230 } else {
3231 int message_header_scale = brw->gen < 6 ? 16 : 1;
3232 return src_reg(reg_offset * message_header_scale);
3233 }
3234 }
3235
3236 /**
3237 * Emits an instruction before @inst to load the value named by @orig_src
3238 * from scratch space at @base_offset to @temp.
3239 *
3240 * @base_offset is measured in 32-byte units (the size of a register).
3241 */
3242 void
3243 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3244 dst_reg temp, src_reg orig_src,
3245 int base_offset)
3246 {
3247 int reg_offset = base_offset + orig_src.reg_offset;
3248 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3249
3250 emit_before(inst, SCRATCH_READ(temp, index));
3251 }
3252
3253 /**
3254 * Emits an instruction after @inst to store the value to be written
3255 * to @orig_dst to scratch space at @base_offset, from @temp.
3256 *
3257 * @base_offset is measured in 32-byte units (the size of a register).
3258 */
3259 void
3260 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3261 {
3262 int reg_offset = base_offset + inst->dst.reg_offset;
3263 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3264
3265 /* Create a temporary register to store *inst's result in.
3266 *
3267 * We have to be careful in MOVing from our temporary result register in
3268 * the scratch write. If we swizzle from channels of the temporary that
3269 * weren't initialized, it will confuse live interval analysis, which will
3270 * make spilling fail to make progress.
3271 */
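/* Added example: if only .z of the destination was written, the swizzle
 * built below reads the temporary as ZZZZ, so no uninitialized channel is
 * referenced.
 */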
3272 src_reg temp = src_reg(this, glsl_type::vec4_type);
3273 temp.type = inst->dst.type;
3274 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3275 int swizzles[4];
3276 for (int i = 0; i < 4; i++)
3277 if (inst->dst.writemask & (1 << i))
3278 swizzles[i] = i;
3279 else
3280 swizzles[i] = first_writemask_chan;
3281 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3282 swizzles[2], swizzles[3]);
3283
3284 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3285 inst->dst.writemask));
3286 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3287 write->predicate = inst->predicate;
3288 write->ir = inst->ir;
3289 write->annotation = inst->annotation;
3290 inst->insert_after(write);
3291
3292 inst->dst.file = temp.file;
3293 inst->dst.reg = temp.reg;
3294 inst->dst.reg_offset = temp.reg_offset;
3295 inst->dst.reladdr = NULL;
3296 }
3297
3298 /**
3299 * We can't generally support array access in GRF space, because a
3300 * single instruction's destination can only span 2 contiguous
3301 * registers. So, we send all GRF arrays that get variable index
3302 * access to scratch space.
3303 */
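/* Added illustration: a local array such as a hypothetical "vec4 arr[8]"
 * indexed by a loop counter ends up here; every store becomes a scratch
 * write and every load a scratch read, trading message latency for
 * correctness.
 */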
3304 void
3305 vec4_visitor::move_grf_array_access_to_scratch()
3306 {
3307 int scratch_loc[this->virtual_grf_count];
3308
3309 for (int i = 0; i < this->virtual_grf_count; i++) {
3310 scratch_loc[i] = -1;
3311 }
3312
3313 /* First, calculate the set of virtual GRFs that need to be punted
3314 * to scratch due to having any array access on them, and where in
3315 * scratch.
3316 */
3317 foreach_in_list(vec4_instruction, inst, &instructions) {
3318 if (inst->dst.file == GRF && inst->dst.reladdr &&
3319 scratch_loc[inst->dst.reg] == -1) {
3320 scratch_loc[inst->dst.reg] = c->last_scratch;
3321 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3322 }
3323
3324 for (int i = 0 ; i < 3; i++) {
3325 src_reg *src = &inst->src[i];
3326
3327 if (src->file == GRF && src->reladdr &&
3328 scratch_loc[src->reg] == -1) {
3329 scratch_loc[src->reg] = c->last_scratch;
3330 c->last_scratch += this->virtual_grf_sizes[src->reg];
3331 }
3332 }
3333 }
3334
3335 /* Now, for anything that will be accessed through scratch, rewrite
3336 * it to load/store. Note that this is a _safe list walk, because
3337 * we may generate a new scratch_write instruction after the one
3338 * we're processing.
3339 */
3340 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3341 /* Set up the annotation tracking for newly generated instructions. */
3342 base_ir = inst->ir;
3343 current_annotation = inst->annotation;
3344
3345 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3346 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3347 }
3348
3349 for (int i = 0 ; i < 3; i++) {
3350 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3351 continue;
3352
3353 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3354
3355 emit_scratch_read(inst, temp, inst->src[i],
3356 scratch_loc[inst->src[i].reg]);
3357
3358 inst->src[i].file = temp.file;
3359 inst->src[i].reg = temp.reg;
3360 inst->src[i].reg_offset = temp.reg_offset;
3361 inst->src[i].reladdr = NULL;
3362 }
3363 }
3364 }
3365
3366 /**
3367 * Emits an instruction before @inst to load the value named by @orig_src
3368 * from the pull constant buffer (surface) at @base_offset to @temp.
3369 */
3370 void
3371 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3372 dst_reg temp, src_reg orig_src,
3373 int base_offset)
3374 {
3375 int reg_offset = base_offset + orig_src.reg_offset;
3376 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3377 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3378 vec4_instruction *load;
3379
3380 if (brw->gen >= 7) {
3381 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3382 grf_offset.type = offset.type;
3383 emit_before(inst, MOV(grf_offset, offset));
3384
3385 load = new(mem_ctx) vec4_instruction(this,
3386 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3387 temp, index, src_reg(grf_offset));
3388 } else {
3389 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3390 temp, index, offset);
3391 load->base_mrf = 14;
3392 load->mlen = 1;
3393 }
3394 emit_before(inst, load);
3395 }
3396
3397 /**
3398 * Implements array access of uniforms by inserting a
3399 * PULL_CONSTANT_LOAD instruction.
3400 *
3401 * Unlike temporary GRF array access (where we don't support it due to
3402 * the difficulty of doing relative addressing on instruction
3403 * destinations), we could potentially do array access of uniforms
3404 * that were loaded in GRF space as push constants. In real-world
3405 * usage we've seen, though, the arrays being used are always larger
3406 * than we could load as push constants, so just always move all
3407 * uniform array access out to a pull constant buffer.
3408 */
3409 void
3410 vec4_visitor::move_uniform_array_access_to_pull_constants()
3411 {
3412 int pull_constant_loc[this->uniforms];
3413
3414 for (int i = 0; i < this->uniforms; i++) {
3415 pull_constant_loc[i] = -1;
3416 }
3417
3418 /* Walk through and find array access of uniforms. Put a copy of that
3419 * uniform in the pull constant buffer.
3420 *
3421 * Note that we don't move constant-indexed accesses to arrays. No
3422 * testing has been done of the performance impact of this choice.
3423 */
3424 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3425 for (int i = 0 ; i < 3; i++) {
3426 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3427 continue;
3428
3429 int uniform = inst->src[i].reg;
3430
3431 /* If this array isn't already present in the pull constant buffer,
3432 * add it.
3433 */
3434 if (pull_constant_loc[uniform] == -1) {
3435 const gl_constant_value **values =
3436 &stage_prog_data->param[uniform * 4];
3437
3438 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3439
3440 assert(uniform < uniform_array_size);
3441 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3442 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3443 = values[j];
3444 }
3445 }
3446
3447 /* Set up the annotation tracking for newly generated instructions. */
3448 base_ir = inst->ir;
3449 current_annotation = inst->annotation;
3450
3451 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3452
3453 emit_pull_constant_load(inst, temp, inst->src[i],
3454 pull_constant_loc[uniform]);
3455
3456 inst->src[i].file = temp.file;
3457 inst->src[i].reg = temp.reg;
3458 inst->src[i].reg_offset = temp.reg_offset;
3459 inst->src[i].reladdr = NULL;
3460 }
3461 }
3462
3463 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3464 * no need to track them as larger-than-vec4 objects. This will be
3465 * relied on in cutting out unused uniform vectors from push
3466 * constants.
3467 */
3468 split_uniform_registers();
3469 }
3470
3471 void
3472 vec4_visitor::resolve_ud_negate(src_reg *reg)
3473 {
3474 if (reg->type != BRW_REGISTER_TYPE_UD ||
3475 !reg->negate)
3476 return;
3477
3478 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3479 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3480 *reg = temp;
3481 }
3482
3483 vec4_visitor::vec4_visitor(struct brw_context *brw,
3484 struct brw_vec4_compile *c,
3485 struct gl_program *prog,
3486 const struct brw_vec4_prog_key *key,
3487 struct brw_vec4_prog_data *prog_data,
3488 struct gl_shader_program *shader_prog,
3489 gl_shader_stage stage,
3490 void *mem_ctx,
3491 bool debug_flag,
3492 bool no_spills,
3493 shader_time_shader_type st_base,
3494 shader_time_shader_type st_written,
3495 shader_time_shader_type st_reset)
3496 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3497 c(c),
3498 key(key),
3499 prog_data(prog_data),
3500 sanity_param_count(0),
3501 fail_msg(NULL),
3502 first_non_payload_grf(0),
3503 need_all_constants_in_pull_buffer(false),
3504 debug_flag(debug_flag),
3505 no_spills(no_spills),
3506 st_base(st_base),
3507 st_written(st_written),
3508 st_reset(st_reset)
3509 {
3510 this->mem_ctx = mem_ctx;
3511 this->failed = false;
3512
3513 this->base_ir = NULL;
3514 this->current_annotation = NULL;
3515 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3516
3517 this->variable_ht = hash_table_ctor(0,
3518 hash_table_pointer_hash,
3519 hash_table_pointer_compare);
3520
3521 this->virtual_grf_start = NULL;
3522 this->virtual_grf_end = NULL;
3523 this->virtual_grf_sizes = NULL;
3524 this->virtual_grf_count = 0;
3525 this->virtual_grf_reg_map = NULL;
3526 this->virtual_grf_reg_count = 0;
3527 this->virtual_grf_array_size = 0;
3528 this->live_intervals_valid = false;
3529
3530 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3531
3532 this->uniforms = 0;
3533
3534 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3535 * at least one. See setup_uniforms() in brw_vec4.cpp.
3536 */
3537 this->uniform_array_size = 1;
3538 if (prog_data) {
3539 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3540 }
3541
3542 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3543 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3544 }
3545
3546 vec4_visitor::~vec4_visitor()
3547 {
3548 hash_table_dtor(this->variable_ht);
3549 }
3550
3551
3552 void
3553 vec4_visitor::fail(const char *format, ...)
3554 {
3555 va_list va;
3556 char *msg;
3557
3558 if (failed)
3559 return;
3560
3561 failed = true;
3562
3563 va_start(va, format);
3564 msg = ralloc_vasprintf(mem_ctx, format, va);
3565 va_end(va);
3566 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3567
3568 this->fail_msg = msg;
3569
3570 if (debug_flag) {
3571 fprintf(stderr, "%s", msg);
3572 }
3573 }
3574
3575 } /* namespace brw */