i965: Store uniform constant values in a gl_constant_value instead of float
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
123 const src_reg &src1) \
124 { \
125 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
126 src0, src1); \
127 }
128
129 #define ALU2_ACC(op) \
130 vec4_instruction * \
131 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
132 const src_reg &src1) \
133 { \
134 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
135 BRW_OPCODE_##op, dst, src0, src1); \
136 inst->writes_accumulator = true; \
137 return inst; \
138 }
139
140 #define ALU3(op) \
141 vec4_instruction * \
142 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
143 const src_reg &src1, const src_reg &src2) \
144 { \
145 assert(brw->gen >= 6); \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1, src2); \
148 }
149
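/* For reference, each invocation below simply stamps out an emit helper.
 * ALU2(ADD), for instance, expands to (whitespace aside):
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, dst,
 *                                            src0, src1);
 *    }
 */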
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU1(F32TO16)
157 ALU1(F16TO32)
158 ALU2(ADD)
159 ALU2(MUL)
160 ALU2_ACC(MACH)
161 ALU2(AND)
162 ALU2(OR)
163 ALU2(XOR)
164 ALU2(DP3)
165 ALU2(DP4)
166 ALU2(DPH)
167 ALU2(SHL)
168 ALU2(SHR)
169 ALU2(ASR)
170 ALU3(LRP)
171 ALU1(BFREV)
172 ALU3(BFE)
173 ALU2(BFI1)
174 ALU3(BFI2)
175 ALU1(FBH)
176 ALU1(FBL)
177 ALU1(CBIT)
178 ALU3(MAD)
179 ALU2_ACC(ADDC)
180 ALU2_ACC(SUBB)
181 ALU2(MAC)
182
183 /** Gen4 predicated IF. */
184 vec4_instruction *
185 vec4_visitor::IF(enum brw_predicate predicate)
186 {
187 vec4_instruction *inst;
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
190 inst->predicate = predicate;
191
192 return inst;
193 }
194
195 /** Gen6 IF with embedded comparison. */
196 vec4_instruction *
197 vec4_visitor::IF(src_reg src0, src_reg src1,
198 enum brw_conditional_mod condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
221 enum brw_conditional_mod condition)
222 {
223 vec4_instruction *inst;
224
225 /* original gen4 does type conversion to the destination type
226 * before comparison, producing garbage results for floating
227 * point comparisons.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
244 vec4_instruction *
245 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
246 {
247 vec4_instruction *inst;
248
249 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
250 dst, index);
251 inst->base_mrf = 14;
252 inst->mlen = 2;
253
254 return inst;
255 }
256
257 vec4_instruction *
258 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
259 const src_reg &index)
260 {
261 vec4_instruction *inst;
262
263 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
264 dst, src, index);
265 inst->base_mrf = 13;
266 inst->mlen = 3;
267
268 return inst;
269 }
270
271 void
272 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
273 {
274 static enum opcode dot_opcodes[] = {
275 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
276 };
277
278 emit(dot_opcodes[elements - 2], dst, src0, src1);
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(src_reg src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(MOV(expanded, src));
304 return src_reg(expanded);
305 }
306
307 src_reg
308 vec4_visitor::fix_math_operand(src_reg src)
309 {
310 /* The gen6 math instruction ignores the source modifiers --
311 * swizzle, abs, negate, and at least some parts of the register
312 * region description.
313 *
314 * Rather than trying to enumerate all these cases, *always* expand the
315 * operand to a temp GRF for gen6.
316 *
317 * For gen7, keep the operand as-is, except if immediate, which gen7 still
318 * can't use.
319 */
320
321 if (brw->gen == 7 && src.file != IMM)
322 return src;
323
324 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
325 expanded.type = src.type;
326 emit(MOV(expanded, src));
327 return src_reg(expanded);
328 }
329
330 void
331 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
332 {
333 src = fix_math_operand(src);
334
335 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
336 /* The gen6 math instruction must be align1, so we can't do
337 * writemasks.
338 */
339 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
340
341 emit(opcode, temp_dst, src);
342
343 emit(MOV(dst, src_reg(temp_dst)));
344 } else {
345 emit(opcode, dst, src);
346 }
347 }
348
349 void
350 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
351 {
352 vec4_instruction *inst = emit(opcode, dst, src);
353 inst->base_mrf = 1;
354 inst->mlen = 1;
355 }
356
357 void
358 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
359 {
360 switch (opcode) {
361 case SHADER_OPCODE_RCP:
362 case SHADER_OPCODE_RSQ:
363 case SHADER_OPCODE_SQRT:
364 case SHADER_OPCODE_EXP2:
365 case SHADER_OPCODE_LOG2:
366 case SHADER_OPCODE_SIN:
367 case SHADER_OPCODE_COS:
368 break;
369 default:
370 unreachable("not reached: bad math opcode");
371 }
372
373 if (brw->gen >= 8) {
374 emit(opcode, dst, src);
375 } else if (brw->gen >= 6) {
376 emit_math1_gen6(opcode, dst, src);
377 } else {
378 emit_math1_gen4(opcode, dst, src);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen6(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 src0 = fix_math_operand(src0);
387 src1 = fix_math_operand(src1);
388
389 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
390 /* The gen6 math instruction must be align1, so we can't do
391 * writemasks.
392 */
393 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
394 temp_dst.type = dst.type;
395
396 emit(opcode, temp_dst, src0, src1);
397
398 emit(MOV(dst, src_reg(temp_dst)));
399 } else {
400 emit(opcode, dst, src0, src1);
401 }
402 }
403
404 void
405 vec4_visitor::emit_math2_gen4(enum opcode opcode,
406 dst_reg dst, src_reg src0, src_reg src1)
407 {
408 vec4_instruction *inst = emit(opcode, dst, src0, src1);
409 inst->base_mrf = 1;
410 inst->mlen = 2;
411 }
412
413 void
414 vec4_visitor::emit_math(enum opcode opcode,
415 dst_reg dst, src_reg src0, src_reg src1)
416 {
417 switch (opcode) {
418 case SHADER_OPCODE_POW:
419 case SHADER_OPCODE_INT_QUOTIENT:
420 case SHADER_OPCODE_INT_REMAINDER:
421 break;
422 default:
423 unreachable("not reached: unsupported binary math opcode");
424 }
425
426 if (brw->gen >= 8) {
427 emit(opcode, dst, src0, src1);
428 } else if (brw->gen >= 6) {
429 emit_math2_gen6(opcode, dst, src0, src1);
430 } else {
431 emit_math2_gen4(opcode, dst, src0, src1);
432 }
433 }
434
435 void
436 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
437 {
438 if (brw->gen < 7) {
439 unreachable("ir_unop_pack_half_2x16 should be lowered");
440 }
441
442 assert(dst.type == BRW_REGISTER_TYPE_UD);
443 assert(src0.type == BRW_REGISTER_TYPE_F);
444
445 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
446 *
447 * Because this instruction does not have a 16-bit floating-point type,
448 * the destination data type must be Word (W).
449 *
450 * The destination must be DWord-aligned and specify a horizontal stride
451 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
452 * each destination channel and the upper word is not modified.
453 *
454 * The above restriction implies that the f32to16 instruction must use
455 * align1 mode, because only in align1 mode is it possible to specify
456 * horizontal stride. We choose here to defy the hardware docs and emit
457 * align16 instructions.
458 *
459 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
460 * instructions. I was partially successful in that the code passed all
461 * tests. However, the code was dubiously correct and fragile, and the
462 * tests were not harsh enough to probe that frailty. Not trusting the
463 * code, I chose instead to remain in align16 mode in defiance of the hw
464 * docs).
465 *
466 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
467 * simulator, emitting a f32to16 in align16 mode with UD as destination
468 * data type is safe. The behavior differs from that specified in the PRM
469 * in that the upper word of each destination channel is cleared to 0.
470 */
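/* Worked example of the sequence below (values chosen for illustration):
 * packHalf2x16(vec2(1.0, -2.0)) -- half(1.0) = 0x3C00 and half(-2.0) = 0xC000.
 * F32TO16 leaves tmp.x = 0x00003C00 and tmp.y = 0x0000C000, the SHL of
 * tmp.yyyy by 16 puts 0xC0000000 in dst, and the final OR with tmp.xxxx
 * produces 0xC0003C00: the Y half in the high word, the X half in the low.
 */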
471
472 dst_reg tmp_dst(this, glsl_type::uvec2_type);
473 src_reg tmp_src(tmp_dst);
474
475 #if 0
476 /* Verify the undocumented behavior on which the following instructions
477 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
478 * then the result of the bit-or instruction below will be incorrect.
479 *
480 * You should inspect the disasm output in order to verify that the MOV is
481 * not optimized away.
482 */
483 emit(MOV(tmp_dst, src_reg(0x12345678u)));
484 #endif
485
486 /* Give tmp the form below, where "." means untouched.
487 *
488 *   w z          y          x w z          y          x
489 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
490 *
491 * That the upper word of each write-channel be 0 is required for the
492 * following bit-shift and bit-or instructions to work. Note that this
493 * relies on the undocumented hardware behavior mentioned above.
494 */
495 tmp_dst.writemask = WRITEMASK_XY;
496 emit(F32TO16(tmp_dst, src0));
497
498 /* Give the write-channels of dst the form:
499 * 0xhhhh0000
500 */
501 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
502 emit(SHL(dst, tmp_src, src_reg(16u)));
503
504 /* Finally, give the write-channels of dst the form of packHalf2x16's
505 * output:
506 * 0xhhhhllll
507 */
508 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
509 emit(OR(dst, src_reg(dst), tmp_src));
510 }
511
512 void
513 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
514 {
515 if (brw->gen < 7) {
516 unreachable("ir_unop_unpack_half_2x16 should be lowered");
517 }
518
519 assert(dst.type == BRW_REGISTER_TYPE_F);
520 assert(src0.type == BRW_REGISTER_TYPE_UD);
521
522 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
523 *
524 * Because this instruction does not have a 16-bit floating-point type,
525 * the source data type must be Word (W). The destination type must be
526 * F (Float).
527 *
528 * To use W as the source data type, we must adjust horizontal strides,
529 * which is only possible in align1 mode. All my [chadv] attempts at
530 * emitting align1 instructions for unpackHalf2x16 failed to pass the
531 * Piglit tests, so I gave up.
532 *
533 * I've verified that, on gen7 hardware and the simulator, it is safe to
534 * emit f16to32 in align16 mode with UD as source data type.
535 */
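/* Worked example of the sequence below (values chosen for illustration):
 * unpackHalf2x16(0xC0003C00) -- the AND leaves tmp.x = 0x3C00, the SHR
 * leaves tmp.y = 0xC000, and F16TO32 converts those halves to
 * dst.x = 1.0 and dst.y = -2.0.
 */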
536
537 dst_reg tmp_dst(this, glsl_type::uvec2_type);
538 src_reg tmp_src(tmp_dst);
539
540 tmp_dst.writemask = WRITEMASK_X;
541 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
542
543 tmp_dst.writemask = WRITEMASK_Y;
544 emit(SHR(tmp_dst, src0, src_reg(16u)));
545
546 dst.writemask = WRITEMASK_XY;
547 emit(F16TO32(dst, tmp_src));
548 }
549
550 void
551 vec4_visitor::visit_instructions(const exec_list *list)
552 {
553 foreach_in_list(ir_instruction, ir, list) {
554 base_ir = ir;
555 ir->accept(this);
556 }
557 }
558
559
560 static int
561 type_size(const struct glsl_type *type)
562 {
563 unsigned int i;
564 int size;
565
566 switch (type->base_type) {
567 case GLSL_TYPE_UINT:
568 case GLSL_TYPE_INT:
569 case GLSL_TYPE_FLOAT:
570 case GLSL_TYPE_BOOL:
571 if (type->is_matrix()) {
572 return type->matrix_columns;
573 } else {
574 /* Regardless of size of vector, it gets a vec4. This is bad
575 * packing for things like floats, but otherwise arrays become a
576 * mess. Hopefully a later pass over the code can pack scalars
577 * down if appropriate.
578 */
579 return 1;
580 }
581 case GLSL_TYPE_ARRAY:
582 assert(type->length > 0);
583 return type_size(type->fields.array) * type->length;
584 case GLSL_TYPE_STRUCT:
585 size = 0;
586 for (i = 0; i < type->length; i++) {
587 size += type_size(type->fields.structure[i].type);
588 }
589 return size;
590 case GLSL_TYPE_SAMPLER:
591 /* Samplers take up one slot in UNIFORMS[], but they're baked in
592 * at link time.
593 */
594 return 1;
595 case GLSL_TYPE_ATOMIC_UINT:
596 return 0;
597 case GLSL_TYPE_IMAGE:
598 case GLSL_TYPE_VOID:
599 case GLSL_TYPE_ERROR:
600 case GLSL_TYPE_INTERFACE:
601 unreachable("not reached");
602 }
603
604 return 0;
605 }
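/* A few examples of the slot counts computed above: a float or vec3 takes
 * one vec4 slot, a mat3 takes matrix_columns == 3 slots, float[10] takes
 * 10 slots, and struct { vec3 a; float b; } takes 1 + 1 == 2 slots.
 */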
606
607 int
608 vec4_visitor::virtual_grf_alloc(int size)
609 {
610 if (virtual_grf_array_size <= virtual_grf_count) {
611 if (virtual_grf_array_size == 0)
612 virtual_grf_array_size = 16;
613 else
614 virtual_grf_array_size *= 2;
615 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
616 virtual_grf_array_size);
617 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
618 virtual_grf_array_size);
619 }
620 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
621 virtual_grf_reg_count += size;
622 virtual_grf_sizes[virtual_grf_count] = size;
623 return virtual_grf_count++;
624 }
625
626 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
627 {
628 init();
629
630 this->file = GRF;
631 this->reg = v->virtual_grf_alloc(type_size(type));
632
633 if (type->is_array() || type->is_record()) {
634 this->swizzle = BRW_SWIZZLE_NOOP;
635 } else {
636 this->swizzle = swizzle_for_size(type->vector_elements);
637 }
638
639 this->type = brw_type_for_base_type(type);
640 }
641
642 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
643 {
644 init();
645
646 this->file = GRF;
647 this->reg = v->virtual_grf_alloc(type_size(type));
648
649 if (type->is_array() || type->is_record()) {
650 this->writemask = WRITEMASK_XYZW;
651 } else {
652 this->writemask = (1 << type->vector_elements) - 1;
653 }
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 /* Our support for uniforms is piggy-backed on the struct
659 * gl_fragment_program, because that's where the values actually
660 * get stored, rather than in some global gl_shader_program uniform
661 * store.
662 */
663 void
664 vec4_visitor::setup_uniform_values(ir_variable *ir)
665 {
666 int namelen = strlen(ir->name);
667
668 /* The data for our (non-builtin) uniforms is stored in a series of
669 * gl_uniform_driver_storage structs for each subcomponent that
670 * glGetUniformLocation() could name. We know it's been set up in the same
671 * order we'd walk the type, so walk the list of storage and find anything
672 * with our name, or the prefix of a component that starts with our name.
673 */
674 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
675 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
676
677 if (strncmp(ir->name, storage->name, namelen) != 0 ||
678 (storage->name[namelen] != 0 &&
679 storage->name[namelen] != '.' &&
680 storage->name[namelen] != '[')) {
681 continue;
682 }
683
684 gl_constant_value *components = storage->storage;
685 unsigned vector_count = (MAX2(storage->array_elements, 1) *
686 storage->type->matrix_columns);
687
688 for (unsigned s = 0; s < vector_count; s++) {
689 assert(uniforms < uniform_array_size);
690 uniform_vector_size[uniforms] = storage->type->vector_elements;
691
692 int i;
693 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
694 stage_prog_data->param[uniforms * 4 + i] = components;
695 components++;
696 }
697 for (; i < 4; i++) {
698 static gl_constant_value zero = { .f = 0.0 };
699 stage_prog_data->param[uniforms * 4 + i] = &zero;
700 }
701
702 uniforms++;
703 }
704 }
705 }
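/* Layout example for the loop above: a (non-array) mat2 uniform has
 * vector_count == 1 * 2 and vector_elements == 2, so it occupies two
 * uniform slots whose param[0..1] entries point at the stored components
 * and whose param[2..3] entries point at `zero`.
 */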
706
707 void
708 vec4_visitor::setup_uniform_clipplane_values()
709 {
710 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
711
712 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
713 assert(this->uniforms < uniform_array_size);
714 this->uniform_vector_size[this->uniforms] = 4;
715 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
716 this->userplane[i].type = BRW_REGISTER_TYPE_F;
717 for (int j = 0; j < 4; ++j) {
718 stage_prog_data->param[this->uniforms * 4 + j] =
719 (gl_constant_value *) &clip_planes[i][j];
720 }
721 ++this->uniforms;
722 }
723 }
724
725 /* Our support for builtin uniforms is even scarier than non-builtin.
726 * It sits on top of the PROG_STATE_VAR parameters that are
727 * automatically updated from GL context state.
728 */
729 void
730 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
731 {
732 const ir_state_slot *const slots = ir->state_slots;
733 assert(ir->state_slots != NULL);
734
735 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
736 /* This state reference has already been setup by ir_to_mesa,
737 * but we'll get the same index back here. We can reference
738 * ParameterValues directly, since unlike brw_fs.cpp, we never
739 * add new state references during compile.
740 */
741 int index = _mesa_add_state_reference(this->prog->Parameters,
742 (gl_state_index *)slots[i].tokens);
743 gl_constant_value *values =
744 &this->prog->Parameters->ParameterValues[index][0];
745
746 assert(this->uniforms < uniform_array_size);
747 this->uniform_vector_size[this->uniforms] = 0;
748 /* Add each of the unique swizzled channels of the element.
749 * This will end up matching the size of the glsl_type of this field.
750 */
751 int last_swiz = -1;
752 for (unsigned int j = 0; j < 4; j++) {
753 int swiz = GET_SWZ(slots[i].swizzle, j);
754 last_swiz = swiz;
755
756 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
757 assert(this->uniforms < uniform_array_size);
758 if (swiz <= last_swiz)
759 this->uniform_vector_size[this->uniforms]++;
760 }
761 this->uniforms++;
762 }
763 }
764
765 dst_reg *
766 vec4_visitor::variable_storage(ir_variable *var)
767 {
768 return (dst_reg *)hash_table_find(this->variable_ht, var);
769 }
770
771 void
772 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
773 enum brw_predicate *predicate)
774 {
775 ir_expression *expr = ir->as_expression();
776
777 *predicate = BRW_PREDICATE_NORMAL;
778
779 if (expr) {
780 src_reg op[2];
781 vec4_instruction *inst;
782
783 assert(expr->get_num_operands() <= 2);
784 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
785 expr->operands[i]->accept(this);
786 op[i] = this->result;
787
788 resolve_ud_negate(&op[i]);
789 }
790
791 switch (expr->operation) {
792 case ir_unop_logic_not:
793 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
794 inst->conditional_mod = BRW_CONDITIONAL_Z;
795 break;
796
797 case ir_binop_logic_xor:
798 inst = emit(XOR(dst_null_d(), op[0], op[1]));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 break;
801
802 case ir_binop_logic_or:
803 inst = emit(OR(dst_null_d(), op[0], op[1]));
804 inst->conditional_mod = BRW_CONDITIONAL_NZ;
805 break;
806
807 case ir_binop_logic_and:
808 inst = emit(AND(dst_null_d(), op[0], op[1]));
809 inst->conditional_mod = BRW_CONDITIONAL_NZ;
810 break;
811
812 case ir_unop_f2b:
813 if (brw->gen >= 6) {
814 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
815 } else {
816 inst = emit(MOV(dst_null_f(), op[0]));
817 inst->conditional_mod = BRW_CONDITIONAL_NZ;
818 }
819 break;
820
821 case ir_unop_i2b:
822 if (brw->gen >= 6) {
823 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
824 } else {
825 inst = emit(MOV(dst_null_d(), op[0]));
826 inst->conditional_mod = BRW_CONDITIONAL_NZ;
827 }
828 break;
829
830 case ir_binop_all_equal:
831 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
832 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
833 break;
834
835 case ir_binop_any_nequal:
836 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
837 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
838 break;
839
840 case ir_unop_any:
841 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
842 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
843 break;
844
845 case ir_binop_greater:
846 case ir_binop_gequal:
847 case ir_binop_less:
848 case ir_binop_lequal:
849 case ir_binop_equal:
850 case ir_binop_nequal:
851 emit(CMP(dst_null_d(), op[0], op[1],
852 brw_conditional_for_comparison(expr->operation)));
853 break;
854
855 default:
856 unreachable("not reached");
857 }
858 return;
859 }
860
861 ir->accept(this);
862
863 resolve_ud_negate(&this->result);
864
865 if (brw->gen >= 6) {
866 vec4_instruction *inst = emit(AND(dst_null_d(),
867 this->result, src_reg(1)));
868 inst->conditional_mod = BRW_CONDITIONAL_NZ;
869 } else {
870 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
871 inst->conditional_mod = BRW_CONDITIONAL_NZ;
872 }
873 }
874
875 /**
876 * Emit a gen6 IF statement with the comparison folded into the IF
877 * instruction.
878 */
879 void
880 vec4_visitor::emit_if_gen6(ir_if *ir)
881 {
882 ir_expression *expr = ir->condition->as_expression();
883
884 if (expr) {
885 src_reg op[2];
886 dst_reg temp;
887
888 assert(expr->get_num_operands() <= 2);
889 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
890 expr->operands[i]->accept(this);
891 op[i] = this->result;
892 }
893
894 switch (expr->operation) {
895 case ir_unop_logic_not:
896 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
897 return;
898
899 case ir_binop_logic_xor:
900 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
901 return;
902
903 case ir_binop_logic_or:
904 temp = dst_reg(this, glsl_type::bool_type);
905 emit(OR(temp, op[0], op[1]));
906 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
907 return;
908
909 case ir_binop_logic_and:
910 temp = dst_reg(this, glsl_type::bool_type);
911 emit(AND(temp, op[0], op[1]));
912 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
913 return;
914
915 case ir_unop_f2b:
916 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
917 return;
918
919 case ir_unop_i2b:
920 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
921 return;
922
923 case ir_binop_greater:
924 case ir_binop_gequal:
925 case ir_binop_less:
926 case ir_binop_lequal:
927 case ir_binop_equal:
928 case ir_binop_nequal:
929 emit(IF(op[0], op[1],
930 brw_conditional_for_comparison(expr->operation)));
931 return;
932
933 case ir_binop_all_equal:
934 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
935 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
936 return;
937
938 case ir_binop_any_nequal:
939 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
940 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
941 return;
942
943 case ir_unop_any:
944 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
945 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
946 return;
947
948 default:
949 unreachable("not reached");
950 }
951 return;
952 }
953
954 ir->condition->accept(this);
955
956 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
957 }
958
959 void
960 vec4_visitor::visit(ir_variable *ir)
961 {
962 dst_reg *reg = NULL;
963
964 if (variable_storage(ir))
965 return;
966
967 switch (ir->data.mode) {
968 case ir_var_shader_in:
969 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
970 break;
971
972 case ir_var_shader_out:
973 reg = new(mem_ctx) dst_reg(this, ir->type);
974
975 for (int i = 0; i < type_size(ir->type); i++) {
976 output_reg[ir->data.location + i] = *reg;
977 output_reg[ir->data.location + i].reg_offset = i;
978 output_reg[ir->data.location + i].type =
979 brw_type_for_base_type(ir->type->get_scalar_type());
980 output_reg_annotation[ir->data.location + i] = ir->name;
981 }
982 break;
983
984 case ir_var_auto:
985 case ir_var_temporary:
986 reg = new(mem_ctx) dst_reg(this, ir->type);
987 break;
988
989 case ir_var_uniform:
990 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
991
992 /* Thanks to the lower_ubo_reference pass, we will see only
993 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
994 * variables, so no need for them to be in variable_ht.
995 *
996 * Atomic counters take no uniform storage, no need to do
997 * anything here.
998 */
999 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
1000 return;
1001
1002 /* Track how big the whole uniform variable is, in case we need to put a
1003 * copy of its data into pull constants for array access.
1004 */
1005 assert(this->uniforms < uniform_array_size);
1006 this->uniform_size[this->uniforms] = type_size(ir->type);
1007
1008 if (!strncmp(ir->name, "gl_", 3)) {
1009 setup_builtin_uniform_values(ir);
1010 } else {
1011 setup_uniform_values(ir);
1012 }
1013 break;
1014
1015 case ir_var_system_value:
1016 reg = make_reg_for_system_value(ir);
1017 break;
1018
1019 default:
1020 unreachable("not reached");
1021 }
1022
1023 reg->type = brw_type_for_base_type(ir->type);
1024 hash_table_insert(this->variable_ht, reg, ir);
1025 }
1026
1027 void
1028 vec4_visitor::visit(ir_loop *ir)
1029 {
1030 /* We don't want debugging output to print the whole body of the
1031 * loop as the annotation.
1032 */
1033 this->base_ir = NULL;
1034
1035 emit(BRW_OPCODE_DO);
1036
1037 visit_instructions(&ir->body_instructions);
1038
1039 emit(BRW_OPCODE_WHILE);
1040 }
1041
1042 void
1043 vec4_visitor::visit(ir_loop_jump *ir)
1044 {
1045 switch (ir->mode) {
1046 case ir_loop_jump::jump_break:
1047 emit(BRW_OPCODE_BREAK);
1048 break;
1049 case ir_loop_jump::jump_continue:
1050 emit(BRW_OPCODE_CONTINUE);
1051 break;
1052 }
1053 }
1054
1055
1056 void
1057 vec4_visitor::visit(ir_function_signature *)
1058 {
1059 unreachable("not reached");
1060 }
1061
1062 void
1063 vec4_visitor::visit(ir_function *ir)
1064 {
1065 /* Ignore function bodies other than main() -- we shouldn't see calls to
1066 * them since they should all be inlined.
1067 */
1068 if (strcmp(ir->name, "main") == 0) {
1069 const ir_function_signature *sig;
1070 exec_list empty;
1071
1072 sig = ir->matching_signature(NULL, &empty, false);
1073
1074 assert(sig);
1075
1076 visit_instructions(&sig->body);
1077 }
1078 }
1079
1080 bool
1081 vec4_visitor::try_emit_sat(ir_expression *ir)
1082 {
1083 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1084 if (!sat_src)
1085 return false;
1086
1087 sat_src->accept(this);
1088 src_reg src = this->result;
1089
1090 this->result = src_reg(this, ir->type);
1091 vec4_instruction *inst;
1092 inst = emit(MOV(dst_reg(this->result), src));
1093 inst->saturate = true;
1094
1095 return true;
1096 }
1097
1098 bool
1099 vec4_visitor::try_emit_mad(ir_expression *ir)
1100 {
1101 /* 3-src instructions were introduced in gen6. */
1102 if (brw->gen < 6)
1103 return false;
1104
1105 /* MAD can only handle floating-point data. */
1106 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1107 return false;
1108
1109 ir_rvalue *nonmul = ir->operands[1];
1110 ir_expression *mul = ir->operands[0]->as_expression();
1111
1112 if (!mul || mul->operation != ir_binop_mul) {
1113 nonmul = ir->operands[0];
1114 mul = ir->operands[1]->as_expression();
1115
1116 if (!mul || mul->operation != ir_binop_mul)
1117 return false;
1118 }
1119
1120 nonmul->accept(this);
1121 src_reg src0 = fix_3src_operand(this->result);
1122
1123 mul->operands[0]->accept(this);
1124 src_reg src1 = fix_3src_operand(this->result);
1125
1126 mul->operands[1]->accept(this);
1127 src_reg src2 = fix_3src_operand(this->result);
1128
1129 this->result = src_reg(this, ir->type);
1130 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1131
1132 return true;
1133 }
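/* Example of the matching above: for `a + b * c`, operands[0] is not a
 * multiply, so nonmul becomes `a` and mul becomes `b * c`; the emitted
 * MAD(dst, a, b, c) then adds `a` to the product `b * c` (note the
 * reversed argument order, as in the ir_triop_fma case further down).
 */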
1134
1135 bool
1136 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1137 {
1138 ir_expression *const cmp = ir->operands[0]->as_expression();
1139
1140 if (cmp == NULL)
1141 return false;
1142
1143 switch (cmp->operation) {
1144 case ir_binop_less:
1145 case ir_binop_greater:
1146 case ir_binop_lequal:
1147 case ir_binop_gequal:
1148 case ir_binop_equal:
1149 case ir_binop_nequal:
1150 break;
1151
1152 default:
1153 return false;
1154 }
1155
1156 cmp->operands[0]->accept(this);
1157 const src_reg cmp_src0 = this->result;
1158
1159 cmp->operands[1]->accept(this);
1160 const src_reg cmp_src1 = this->result;
1161
1162 this->result = src_reg(this, ir->type);
1163
1164 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1165 brw_conditional_for_comparison(cmp->operation)));
1166
1167 /* If the comparison is false, this->result will just happen to be zero.
1168 */
1169 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1170 this->result, src_reg(1.0f));
1171 inst->predicate = BRW_PREDICATE_NORMAL;
1172 inst->predicate_inverse = true;
1173
1174 return true;
1175 }
1176
1177 void
1178 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1179 src_reg src0, src_reg src1)
1180 {
1181 vec4_instruction *inst;
1182
1183 if (brw->gen >= 6) {
1184 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1185 inst->conditional_mod = conditionalmod;
1186 } else {
1187 emit(CMP(dst, src0, src1, conditionalmod));
1188
1189 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1190 inst->predicate = BRW_PREDICATE_NORMAL;
1191 }
1192 }
1193
1194 void
1195 vec4_visitor::emit_lrp(const dst_reg &dst,
1196 const src_reg &x, const src_reg &y, const src_reg &a)
1197 {
1198 if (brw->gen >= 6) {
1199 /* Note that the instruction's argument order is reversed from GLSL
1200 * and the IR.
1201 */
1202 emit(LRP(dst,
1203 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1204 } else {
1205 /* Earlier generations don't support three source operations, so we
1206 * need to emit x*(1-a) + y*a.
1207 */
1208 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1209 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1210 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1211 y_times_a.writemask = dst.writemask;
1212 one_minus_a.writemask = dst.writemask;
1213 x_times_one_minus_a.writemask = dst.writemask;
1214
1215 emit(MUL(y_times_a, y, a));
1216 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1217 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1218 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1219 }
1220 }
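/* Worked example of the pre-gen6 fallback above: mix(2.0, 10.0, 0.25)
 * computes one_minus_a = 0.75, x_times_one_minus_a = 1.5, y_times_a = 2.5,
 * and the final ADD yields 4.0 -- the same value the gen6+ LRP path gives.
 */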
1221
1222 void
1223 vec4_visitor::visit(ir_expression *ir)
1224 {
1225 unsigned int operand;
1226 src_reg op[Elements(ir->operands)];
1227 src_reg result_src;
1228 dst_reg result_dst;
1229 vec4_instruction *inst;
1230
1231 if (try_emit_sat(ir))
1232 return;
1233
1234 if (ir->operation == ir_binop_add) {
1235 if (try_emit_mad(ir))
1236 return;
1237 }
1238
1239 if (ir->operation == ir_unop_b2f) {
1240 if (try_emit_b2f_of_compare(ir))
1241 return;
1242 }
1243
1244 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1245 this->result.file = BAD_FILE;
1246 ir->operands[operand]->accept(this);
1247 if (this->result.file == BAD_FILE) {
1248 fprintf(stderr, "Failed to get tree for expression operand:\n");
1249 ir->operands[operand]->fprint(stderr);
1250 exit(1);
1251 }
1252 op[operand] = this->result;
1253
1254 /* Matrix expression operands should have been broken down to vector
1255 * operations already.
1256 */
1257 assert(!ir->operands[operand]->type->is_matrix());
1258 }
1259
1260 int vector_elements = ir->operands[0]->type->vector_elements;
1261 if (ir->operands[1]) {
1262 vector_elements = MAX2(vector_elements,
1263 ir->operands[1]->type->vector_elements);
1264 }
1265
1266 this->result.file = BAD_FILE;
1267
1268 /* Storage for our result. Ideally for an assignment we'd be using
1269 * the actual storage for the result here, instead.
1270 */
1271 result_src = src_reg(this, ir->type);
1272 /* convenience for the emit functions below. */
1273 result_dst = dst_reg(result_src);
1274 /* If nothing special happens, this is the result. */
1275 this->result = result_src;
1276 /* Limit writes to the channels that will be used by result_src later.
1277 * This does limit this temp's use as a temporary for multi-instruction
1278 * sequences.
1279 */
1280 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1281
1282 switch (ir->operation) {
1283 case ir_unop_logic_not:
1284 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1285 * the ones' complement of the whole register, not just bit 0.
1286 */
1287 emit(XOR(result_dst, op[0], src_reg(1)));
1288 break;
1289 case ir_unop_neg:
1290 op[0].negate = !op[0].negate;
1291 emit(MOV(result_dst, op[0]));
1292 break;
1293 case ir_unop_abs:
1294 op[0].abs = true;
1295 op[0].negate = false;
1296 emit(MOV(result_dst, op[0]));
1297 break;
1298
1299 case ir_unop_sign:
1300 if (ir->type->is_float()) {
1301 /* AND(val, 0x80000000) gives the sign bit.
1302 *
1303 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1304 * zero.
1305 */
1306 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1307
1308 op[0].type = BRW_REGISTER_TYPE_UD;
1309 result_dst.type = BRW_REGISTER_TYPE_UD;
1310 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1311
1312 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1313 inst->predicate = BRW_PREDICATE_NORMAL;
1314
1315 this->result.type = BRW_REGISTER_TYPE_F;
1316 } else {
1317 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1318 * -> non-negative val generates 0x00000000.
1319 * Predicated OR sets 1 if val is positive.
1320 */
1321 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1322
1323 emit(ASR(result_dst, op[0], src_reg(31)));
1324
1325 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1326 inst->predicate = BRW_PREDICATE_NORMAL;
1327 }
1328 break;
1329
1330 case ir_unop_rcp:
1331 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1332 break;
1333
1334 case ir_unop_exp2:
1335 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1336 break;
1337 case ir_unop_log2:
1338 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1339 break;
1340 case ir_unop_exp:
1341 case ir_unop_log:
1342 unreachable("not reached: should be handled by ir_explog_to_explog2");
1343 case ir_unop_sin:
1344 case ir_unop_sin_reduced:
1345 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1346 break;
1347 case ir_unop_cos:
1348 case ir_unop_cos_reduced:
1349 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1350 break;
1351
1352 case ir_unop_dFdx:
1353 case ir_unop_dFdy:
1354 unreachable("derivatives not valid in vertex shader");
1355
1356 case ir_unop_bitfield_reverse:
1357 emit(BFREV(result_dst, op[0]));
1358 break;
1359 case ir_unop_bit_count:
1360 emit(CBIT(result_dst, op[0]));
1361 break;
1362 case ir_unop_find_msb: {
1363 src_reg temp = src_reg(this, glsl_type::uint_type);
1364
1365 inst = emit(FBH(dst_reg(temp), op[0]));
1366 inst->dst.writemask = WRITEMASK_XYZW;
1367
1368 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1369 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1370 * subtract the result from 31 to convert the MSB count into an LSB count.
1371 */
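/* For example, findMSB(0x00000100): FBH returns 23 (the set bit counted
 * from the MSB side), the CMP against -1 passes, and the predicated ADD
 * below computes 31 - 23 = 8. For findMSB(0), FBH returns 0xFFFFFFFF, the
 * CMP fails, the ADD is skipped, and the result stays -1 as GLSL requires.
 */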
1372
1373 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1374 temp.swizzle = BRW_SWIZZLE_NOOP;
1375 emit(MOV(result_dst, temp));
1376
1377 src_reg src_tmp = src_reg(result_dst);
1378 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1379
1380 src_tmp.negate = true;
1381 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1382 inst->predicate = BRW_PREDICATE_NORMAL;
1383 break;
1384 }
1385 case ir_unop_find_lsb:
1386 emit(FBL(result_dst, op[0]));
1387 break;
1388
1389 case ir_unop_noise:
1390 unreachable("not reached: should be handled by lower_noise");
1391
1392 case ir_binop_add:
1393 emit(ADD(result_dst, op[0], op[1]));
1394 break;
1395 case ir_binop_sub:
1396 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1397
1398 case ir_binop_mul:
1399 if (brw->gen < 8 && ir->type->is_integer()) {
1400 /* For integer multiplication, the MUL uses the low 16 bits of one of
1401 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1402 * accumulates in the contribution of the upper 16 bits of that
1403 * operand. If we can determine that one of the args is in the low
1404 * 16 bits, though, we can just emit a single MUL.
1405 */
1406 if (ir->operands[0]->is_uint16_constant()) {
1407 if (brw->gen < 7)
1408 emit(MUL(result_dst, op[0], op[1]));
1409 else
1410 emit(MUL(result_dst, op[1], op[0]));
1411 } else if (ir->operands[1]->is_uint16_constant()) {
1412 if (brw->gen < 7)
1413 emit(MUL(result_dst, op[1], op[0]));
1414 else
1415 emit(MUL(result_dst, op[0], op[1]));
1416 } else {
1417 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1418
1419 emit(MUL(acc, op[0], op[1]));
1420 emit(MACH(dst_null_d(), op[0], op[1]));
1421 emit(MOV(result_dst, src_reg(acc)));
1422 }
1423 } else {
1424 emit(MUL(result_dst, op[0], op[1]));
1425 }
1426 break;
1427 case ir_binop_imul_high: {
1428 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1429
1430 emit(MUL(acc, op[0], op[1]));
1431 emit(MACH(result_dst, op[0], op[1]));
1432 break;
1433 }
1434 case ir_binop_div:
1435 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1436 assert(ir->type->is_integer());
1437 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1438 break;
1439 case ir_binop_carry: {
1440 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1441
1442 emit(ADDC(dst_null_ud(), op[0], op[1]));
1443 emit(MOV(result_dst, src_reg(acc)));
1444 break;
1445 }
1446 case ir_binop_borrow: {
1447 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1448
1449 emit(SUBB(dst_null_ud(), op[0], op[1]));
1450 emit(MOV(result_dst, src_reg(acc)));
1451 break;
1452 }
1453 case ir_binop_mod:
1454 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1455 assert(ir->type->is_integer());
1456 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1457 break;
1458
1459 case ir_binop_less:
1460 case ir_binop_greater:
1461 case ir_binop_lequal:
1462 case ir_binop_gequal:
1463 case ir_binop_equal:
1464 case ir_binop_nequal: {
1465 emit(CMP(result_dst, op[0], op[1],
1466 brw_conditional_for_comparison(ir->operation)));
1467 emit(AND(result_dst, result_src, src_reg(0x1)));
1468 break;
1469 }
1470
1471 case ir_binop_all_equal:
1472 /* "==" operator producing a scalar boolean. */
1473 if (ir->operands[0]->type->is_vector() ||
1474 ir->operands[1]->type->is_vector()) {
1475 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1476 emit(MOV(result_dst, src_reg(0)));
1477 inst = emit(MOV(result_dst, src_reg(1)));
1478 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1479 } else {
1480 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1481 emit(AND(result_dst, result_src, src_reg(0x1)));
1482 }
1483 break;
1484 case ir_binop_any_nequal:
1485 /* "!=" operator producing a scalar boolean. */
1486 if (ir->operands[0]->type->is_vector() ||
1487 ir->operands[1]->type->is_vector()) {
1488 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1489
1490 emit(MOV(result_dst, src_reg(0)));
1491 inst = emit(MOV(result_dst, src_reg(1)));
1492 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1493 } else {
1494 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1495 emit(AND(result_dst, result_src, src_reg(0x1)));
1496 }
1497 break;
1498
1499 case ir_unop_any:
1500 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1501 emit(MOV(result_dst, src_reg(0)));
1502
1503 inst = emit(MOV(result_dst, src_reg(1)));
1504 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1505 break;
1506
1507 case ir_binop_logic_xor:
1508 emit(XOR(result_dst, op[0], op[1]));
1509 break;
1510
1511 case ir_binop_logic_or:
1512 emit(OR(result_dst, op[0], op[1]));
1513 break;
1514
1515 case ir_binop_logic_and:
1516 emit(AND(result_dst, op[0], op[1]));
1517 break;
1518
1519 case ir_binop_dot:
1520 assert(ir->operands[0]->type->is_vector());
1521 assert(ir->operands[0]->type == ir->operands[1]->type);
1522 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1523 break;
1524
1525 case ir_unop_sqrt:
1526 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1527 break;
1528 case ir_unop_rsq:
1529 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1530 break;
1531
1532 case ir_unop_bitcast_i2f:
1533 case ir_unop_bitcast_u2f:
1534 this->result = op[0];
1535 this->result.type = BRW_REGISTER_TYPE_F;
1536 break;
1537
1538 case ir_unop_bitcast_f2i:
1539 this->result = op[0];
1540 this->result.type = BRW_REGISTER_TYPE_D;
1541 break;
1542
1543 case ir_unop_bitcast_f2u:
1544 this->result = op[0];
1545 this->result.type = BRW_REGISTER_TYPE_UD;
1546 break;
1547
1548 case ir_unop_i2f:
1549 case ir_unop_i2u:
1550 case ir_unop_u2i:
1551 case ir_unop_u2f:
1552 case ir_unop_b2f:
1553 case ir_unop_b2i:
1554 case ir_unop_f2i:
1555 case ir_unop_f2u:
1556 emit(MOV(result_dst, op[0]));
1557 break;
1558 case ir_unop_f2b:
1559 case ir_unop_i2b: {
1560 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1561 emit(AND(result_dst, result_src, src_reg(1)));
1562 break;
1563 }
1564
1565 case ir_unop_trunc:
1566 emit(RNDZ(result_dst, op[0]));
1567 break;
1568 case ir_unop_ceil:
1569 op[0].negate = !op[0].negate;
1570 inst = emit(RNDD(result_dst, op[0]));
1571 this->result.negate = true;
1572 break;
1573 case ir_unop_floor:
1574 inst = emit(RNDD(result_dst, op[0]));
1575 break;
1576 case ir_unop_fract:
1577 inst = emit(FRC(result_dst, op[0]));
1578 break;
1579 case ir_unop_round_even:
1580 emit(RNDE(result_dst, op[0]));
1581 break;
1582
1583 case ir_binop_min:
1584 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1585 break;
1586 case ir_binop_max:
1587 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1588 break;
1589
1590 case ir_binop_pow:
1591 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1592 break;
1593
1594 case ir_unop_bit_not:
1595 inst = emit(NOT(result_dst, op[0]));
1596 break;
1597 case ir_binop_bit_and:
1598 inst = emit(AND(result_dst, op[0], op[1]));
1599 break;
1600 case ir_binop_bit_xor:
1601 inst = emit(XOR(result_dst, op[0], op[1]));
1602 break;
1603 case ir_binop_bit_or:
1604 inst = emit(OR(result_dst, op[0], op[1]));
1605 break;
1606
1607 case ir_binop_lshift:
1608 inst = emit(SHL(result_dst, op[0], op[1]));
1609 break;
1610
1611 case ir_binop_rshift:
1612 if (ir->type->base_type == GLSL_TYPE_INT)
1613 inst = emit(ASR(result_dst, op[0], op[1]));
1614 else
1615 inst = emit(SHR(result_dst, op[0], op[1]));
1616 break;
1617
1618 case ir_binop_bfm:
1619 emit(BFI1(result_dst, op[0], op[1]));
1620 break;
1621
1622 case ir_binop_ubo_load: {
1623 ir_constant *uniform_block = ir->operands[0]->as_constant();
1624 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1625 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1626 src_reg offset;
1627
1628 /* Now, load the vector from that offset. */
1629 assert(ir->type->is_vector() || ir->type->is_scalar());
1630
1631 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1632 packed_consts.type = result.type;
1633 src_reg surf_index =
1634 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1635 if (const_offset_ir) {
1636 if (brw->gen >= 8) {
1637 /* Store the offset in a GRF so we can send-from-GRF. */
1638 offset = src_reg(this, glsl_type::int_type);
1639 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1640 } else {
1641 /* Immediates are fine on older generations since they'll be moved
1642 * to a (potentially fake) MRF at the generator level.
1643 */
1644 offset = src_reg(const_offset / 16);
1645 }
1646 } else {
1647 offset = src_reg(this, glsl_type::uint_type);
1648 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1649 }
1650
1651 if (brw->gen >= 7) {
1652 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1653 grf_offset.type = offset.type;
1654
1655 emit(MOV(grf_offset, offset));
1656
1657 emit(new(mem_ctx) vec4_instruction(this,
1658 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1659 dst_reg(packed_consts),
1660 surf_index,
1661 src_reg(grf_offset)));
1662 } else {
1663 vec4_instruction *pull =
1664 emit(new(mem_ctx) vec4_instruction(this,
1665 VS_OPCODE_PULL_CONSTANT_LOAD,
1666 dst_reg(packed_consts),
1667 surf_index,
1668 offset));
1669 pull->base_mrf = 14;
1670 pull->mlen = 1;
1671 }
1672
1673 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1674 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1675 const_offset % 16 / 4,
1676 const_offset % 16 / 4,
1677 const_offset % 16 / 4);
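/* Offset arithmetic example (illustrative): a scalar UBO load at constant
 * byte offset 20 fetches the 16-byte slot at offset 20 / 16 == 1 and then
 * bumps the swizzle by (20 % 16) / 4 == 1, selecting the .y channel of
 * the fetched vec4.
 */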
1678
1679 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1680 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1681 emit(CMP(result_dst, packed_consts, src_reg(0u),
1682 BRW_CONDITIONAL_NZ));
1683 emit(AND(result_dst, result, src_reg(0x1)));
1684 } else {
1685 emit(MOV(result_dst, packed_consts));
1686 }
1687 break;
1688 }
1689
1690 case ir_binop_vector_extract:
1691 unreachable("should have been lowered by vec_index_to_cond_assign");
1692
1693 case ir_triop_fma:
1694 op[0] = fix_3src_operand(op[0]);
1695 op[1] = fix_3src_operand(op[1]);
1696 op[2] = fix_3src_operand(op[2]);
1697 /* Note that the instruction's argument order is reversed from GLSL
1698 * and the IR.
1699 */
1700 emit(MAD(result_dst, op[2], op[1], op[0]));
1701 break;
1702
1703 case ir_triop_lrp:
1704 emit_lrp(result_dst, op[0], op[1], op[2]);
1705 break;
1706
1707 case ir_triop_csel:
1708 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1709 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1710 inst->predicate = BRW_PREDICATE_NORMAL;
1711 break;
1712
1713 case ir_triop_bfi:
1714 op[0] = fix_3src_operand(op[0]);
1715 op[1] = fix_3src_operand(op[1]);
1716 op[2] = fix_3src_operand(op[2]);
1717 emit(BFI2(result_dst, op[0], op[1], op[2]));
1718 break;
1719
1720 case ir_triop_bitfield_extract:
1721 op[0] = fix_3src_operand(op[0]);
1722 op[1] = fix_3src_operand(op[1]);
1723 op[2] = fix_3src_operand(op[2]);
1724 /* Note that the instruction's argument order is reversed from GLSL
1725 * and the IR.
1726 */
1727 emit(BFE(result_dst, op[2], op[1], op[0]));
1728 break;
1729
1730 case ir_triop_vector_insert:
1731 unreachable("should have been lowered by lower_vector_insert");
1732
1733 case ir_quadop_bitfield_insert:
1734 unreachable("not reached: should be handled by "
1735 "bitfield_insert_to_bfm_bfi\n");
1736
1737 case ir_quadop_vector:
1738 unreachable("not reached: should be handled by lower_quadop_vector");
1739
1740 case ir_unop_pack_half_2x16:
1741 emit_pack_half_2x16(result_dst, op[0]);
1742 break;
1743 case ir_unop_unpack_half_2x16:
1744 emit_unpack_half_2x16(result_dst, op[0]);
1745 break;
1746 case ir_unop_pack_snorm_2x16:
1747 case ir_unop_pack_snorm_4x8:
1748 case ir_unop_pack_unorm_2x16:
1749 case ir_unop_pack_unorm_4x8:
1750 case ir_unop_unpack_snorm_2x16:
1751 case ir_unop_unpack_snorm_4x8:
1752 case ir_unop_unpack_unorm_2x16:
1753 case ir_unop_unpack_unorm_4x8:
1754 unreachable("not reached: should be handled by lower_packing_builtins");
1755 case ir_unop_unpack_half_2x16_split_x:
1756 case ir_unop_unpack_half_2x16_split_y:
1757 case ir_binop_pack_half_2x16_split:
1758 case ir_unop_interpolate_at_centroid:
1759 case ir_binop_interpolate_at_sample:
1760 case ir_binop_interpolate_at_offset:
1761 unreachable("not reached: should not occur in vertex shader");
1762 case ir_binop_ldexp:
1763 unreachable("not reached: should be handled by ldexp_to_arith()");
1764 }
1765 }
1766
1767
1768 void
1769 vec4_visitor::visit(ir_swizzle *ir)
1770 {
1771 src_reg src;
1772 int i = 0;
1773 int swizzle[4];
1774
1775 /* Note that this is only swizzles in expressions, not those on the left
1776 * hand side of an assignment, which do write masking. See ir_assignment
1777 * for that.
1778 */
1779
1780 ir->val->accept(this);
1781 src = this->result;
1782 assert(src.file != BAD_FILE);
1783
1784 for (i = 0; i < ir->type->vector_elements; i++) {
1785 switch (i) {
1786 case 0:
1787 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1788 break;
1789 case 1:
1790 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1791 break;
1792 case 2:
1793 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1794 break;
1795 case 3:
1796 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1797 break;
1798 }
1799 }
1800 for (; i < 4; i++) {
1801 /* Replicate the last channel out. */
1802 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1803 }
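/* For instance, a .yx swizzle of an unswizzled vec2 yields
 * swizzle[] = { y, x, x, x }: the trailing channels just repeat x.
 */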
1804
1805 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1806
1807 this->result = src;
1808 }
1809
1810 void
1811 vec4_visitor::visit(ir_dereference_variable *ir)
1812 {
1813 const struct glsl_type *type = ir->type;
1814 dst_reg *reg = variable_storage(ir->var);
1815
1816 if (!reg) {
1817 fail("Failed to find variable storage for %s\n", ir->var->name);
1818 this->result = src_reg(brw_null_reg());
1819 return;
1820 }
1821
1822 this->result = src_reg(*reg);
1823
1824 /* System values get their swizzle from the dst_reg writemask */
1825 if (ir->var->data.mode == ir_var_system_value)
1826 return;
1827
1828 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1829 this->result.swizzle = swizzle_for_size(type->vector_elements);
1830 }
1831
1832
1833 int
1834 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1835 {
1836 /* Under normal circumstances array elements are stored consecutively, so
1837 * the stride is equal to the size of the array element.
1838 */
1839 return type_size(ir->type);
1840 }
1841
1842
1843 void
1844 vec4_visitor::visit(ir_dereference_array *ir)
1845 {
1846 ir_constant *constant_index;
1847 src_reg src;
1848 int array_stride = compute_array_stride(ir);
1849
1850 constant_index = ir->array_index->constant_expression_value();
1851
1852 ir->array->accept(this);
1853 src = this->result;
1854
1855 if (constant_index) {
1856 src.reg_offset += constant_index->value.i[0] * array_stride;
1857 } else {
1858 /* Variable index array dereference. It eats the "vec4" of the
1859 * base of the array and an index that offsets the Mesa register
1860 * index.
1861 */
1862 ir->array_index->accept(this);
1863
1864 src_reg index_reg;
1865
1866 if (array_stride == 1) {
1867 index_reg = this->result;
1868 } else {
1869 index_reg = src_reg(this, glsl_type::int_type);
1870
1871 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1872 }
1873
1874 if (src.reladdr) {
1875 src_reg temp = src_reg(this, glsl_type::int_type);
1876
1877 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1878
1879 index_reg = temp;
1880 }
1881
1882 src.reladdr = ralloc(mem_ctx, src_reg);
1883 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1884 }
1885
1886 /* If the type is smaller than a vec4, replicate the last channel out. */
1887 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1888 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1889 else
1890 src.swizzle = BRW_SWIZZLE_NOOP;
1891 src.type = brw_type_for_base_type(ir->type);
1892
1893 this->result = src;
1894 }
1895
1896 void
1897 vec4_visitor::visit(ir_dereference_record *ir)
1898 {
1899 unsigned int i;
1900 const glsl_type *struct_type = ir->record->type;
1901 int offset = 0;
1902
1903 ir->record->accept(this);
1904
1905 for (i = 0; i < struct_type->length; i++) {
1906 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1907 break;
1908 offset += type_size(struct_type->fields.structure[i].type);
1909 }
1910
1911 /* If the type is smaller than a vec4, replicate the last channel out. */
1912 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1913 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1914 else
1915 this->result.swizzle = BRW_SWIZZLE_NOOP;
1916 this->result.type = brw_type_for_base_type(ir->type);
1917
1918 this->result.reg_offset += offset;
1919 }
1920
1921 /**
1922 * We want to be careful in assignment setup to hit the actual storage
1923 * instead of potentially using a temporary like we might with the
1924 * ir_dereference handler.
1925 */
1926 static dst_reg
1927 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1928 {
1929 /* The LHS must be a dereference. If the LHS is a variable indexed array
1930 * access of a vector, it must be separated into a series of conditional moves
1931 * before reaching this point (see ir_vec_index_to_cond_assign).
1932 */
1933 assert(ir->as_dereference());
1934 ir_dereference_array *deref_array = ir->as_dereference_array();
1935 if (deref_array) {
1936 assert(!deref_array->array->type->is_vector());
1937 }
1938
1939 /* Use the rvalue deref handler for the most part. We'll ignore
1940 * swizzles in it and write swizzles using writemask, though.
1941 */
1942 ir->accept(v);
1943 return dst_reg(v->result);
1944 }
1945
1946 void
1947 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1948 const struct glsl_type *type,
1949 enum brw_predicate predicate)
1950 {
1951 if (type->base_type == GLSL_TYPE_STRUCT) {
1952 for (unsigned int i = 0; i < type->length; i++) {
1953 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1954 }
1955 return;
1956 }
1957
1958 if (type->is_array()) {
1959 for (unsigned int i = 0; i < type->length; i++) {
1960 emit_block_move(dst, src, type->fields.array, predicate);
1961 }
1962 return;
1963 }
1964
1965 if (type->is_matrix()) {
1966 const struct glsl_type *vec_type;
1967
1968 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1969 type->vector_elements, 1);
1970
1971 for (int i = 0; i < type->matrix_columns; i++) {
1972 emit_block_move(dst, src, vec_type, predicate);
1973 }
1974 return;
1975 }
1976
1977 assert(type->is_scalar() || type->is_vector());
1978
1979 dst->type = brw_type_for_base_type(type);
1980 src->type = dst->type;
1981
1982 dst->writemask = (1 << type->vector_elements) - 1;
1983
1984 src->swizzle = swizzle_for_size(type->vector_elements);
1985
1986 vec4_instruction *inst = emit(MOV(*dst, *src));
1987 inst->predicate = predicate;
1988
1989 dst->reg_offset++;
1990 src->reg_offset++;
1991 }
1992
1993
1994 /* If the RHS processing resulted in an instruction generating a
1995 * temporary value, and it would be easy to rewrite the instruction to
1996 * generate its result right into the LHS instead, do so. This ends
1997 * up reliably removing instructions where it can be tricky to do so
1998 * later without real UD chain information.
1999 */
2000 bool
2001 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2002 dst_reg dst,
2003 src_reg src,
2004 vec4_instruction *pre_rhs_inst,
2005 vec4_instruction *last_rhs_inst)
2006 {
2007 /* This could be supported, but it would take more smarts. */
2008 if (ir->condition)
2009 return false;
2010
2011 if (pre_rhs_inst == last_rhs_inst)
2012 return false; /* No instructions generated to work with. */
2013
2014 /* Make sure the last instruction generated our source reg. */
2015 if (src.file != GRF ||
2016 src.file != last_rhs_inst->dst.file ||
2017 src.reg != last_rhs_inst->dst.reg ||
2018 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2019 src.reladdr ||
2020 src.abs ||
2021 src.negate ||
2022 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2023 return false;
2024
2025 /* Check that the last instruction fully initialized the channels
2026 * we want to use, in the order we want to use them. We could
2027 * potentially reswizzle the operands of many instructions so that
2028 * we could handle out of order channels, but don't yet.
2029 */
2030
2031 for (unsigned i = 0; i < 4; i++) {
2032 if (dst.writemask & (1 << i)) {
2033 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2034 return false;
2035
2036 if (BRW_GET_SWZ(src.swizzle, i) != i)
2037 return false;
2038 }
2039 }
2040
2041 /* Success! Rewrite the instruction. */
2042 last_rhs_inst->dst.file = dst.file;
2043 last_rhs_inst->dst.reg = dst.reg;
2044 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2045 last_rhs_inst->dst.reladdr = dst.reladdr;
2046 last_rhs_inst->dst.writemask &= dst.writemask;
2047
2048 return true;
2049 }
2050
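/**
 * Handle an assignment: structures, arrays and matrices are copied with
 * emit_block_move(); scalar/vector assignments swizzle the RHS into the
 * channels named by the write mask and, where possible, fold the copy away
 * by retargeting the instruction that produced the RHS (see
 * try_rewrite_rhs_to_dst).
 */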
2051 void
2052 vec4_visitor::visit(ir_assignment *ir)
2053 {
2054 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2055 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2056
2057 if (!ir->lhs->type->is_scalar() &&
2058 !ir->lhs->type->is_vector()) {
2059 ir->rhs->accept(this);
2060 src_reg src = this->result;
2061
2062 if (ir->condition) {
2063 emit_bool_to_cond_code(ir->condition, &predicate);
2064 }
2065
2066 /* emit_block_move doesn't account for swizzles in the source register.
2067 * This should be ok, since the source register is a structure or an
2068 * array, and those can't be swizzled. But double-check to be sure.
2069 */
2070 assert(src.swizzle ==
2071 (ir->rhs->type->is_matrix()
2072 ? swizzle_for_size(ir->rhs->type->vector_elements)
2073 : BRW_SWIZZLE_NOOP));
2074
2075 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2076 return;
2077 }
2078
2079 /* Now we're down to just a scalar/vector with writemasks. */
2080 int i;
2081
2082 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2083 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2084
2085 ir->rhs->accept(this);
2086
2087 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2088
2089 src_reg src = this->result;
2090
2091 int swizzles[4];
2092 int first_enabled_chan = 0;
2093 int src_chan = 0;
2094
2095 assert(ir->lhs->type->is_vector() ||
2096 ir->lhs->type->is_scalar());
2097 dst.writemask = ir->write_mask;
2098
2099 for (int i = 0; i < 4; i++) {
2100 if (dst.writemask & (1 << i)) {
2101 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2102 break;
2103 }
2104 }
2105
2106 /* Swizzle a small RHS vector into the channels being written.
2107 *
2108 * GLSL IR treats write_mask as dictating how many channels are
2109 * present on the RHS, while in our instructions we need to make
2110 * those channels appear in the slots of the vec4 they're written to.
2111 */
2112 for (int i = 0; i < 4; i++) {
2113 if (dst.writemask & (1 << i))
2114 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2115 else
2116 swizzles[i] = first_enabled_chan;
2117 }
2118 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2119 swizzles[2], swizzles[3]);
2120
2121 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2122 return;
2123 }
2124
2125 if (ir->condition) {
2126 emit_bool_to_cond_code(ir->condition, &predicate);
2127 }
2128
2129 for (i = 0; i < type_size(ir->lhs->type); i++) {
2130 vec4_instruction *inst = emit(MOV(dst, src));
2131 inst->predicate = predicate;
2132
2133 dst.reg_offset++;
2134 src.reg_offset++;
2135 }
2136 }
2137
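/**
 * Emit immediate MOVs for an ir_constant, recursing through structure
 * fields, array elements and matrix columns.  For vectors, channels that
 * share the same value are coalesced into a single writemasked MOV.
 */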
2138 void
2139 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2140 {
2141 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2142 foreach_in_list(ir_constant, field_value, &ir->components) {
2143 emit_constant_values(dst, field_value);
2144 }
2145 return;
2146 }
2147
2148 if (ir->type->is_array()) {
2149 for (unsigned int i = 0; i < ir->type->length; i++) {
2150 emit_constant_values(dst, ir->array_elements[i]);
2151 }
2152 return;
2153 }
2154
2155 if (ir->type->is_matrix()) {
2156 for (int i = 0; i < ir->type->matrix_columns; i++) {
2157 float *vec = &ir->value.f[i * ir->type->vector_elements];
2158
2159 for (int j = 0; j < ir->type->vector_elements; j++) {
2160 dst->writemask = 1 << j;
2161 dst->type = BRW_REGISTER_TYPE_F;
2162
2163 emit(MOV(*dst, src_reg(vec[j])));
2164 }
2165 dst->reg_offset++;
2166 }
2167 return;
2168 }
2169
2170 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2171
2172 for (int i = 0; i < ir->type->vector_elements; i++) {
2173 if (!(remaining_writemask & (1 << i)))
2174 continue;
2175
2176 dst->writemask = 1 << i;
2177 dst->type = brw_type_for_base_type(ir->type);
2178
2179 /* Find other components that match the one we're about to
2180 * write. Emits fewer instructions for things like vec4(0.5,
2181 * 1.5, 1.5, 1.5).
2182 */
2183 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2184 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2185 if (ir->value.b[i] == ir->value.b[j])
2186 dst->writemask |= (1 << j);
2187 } else {
2188 /* u, i, and f storage all line up, so no need for a
2189 * switch case for comparing each type.
2190 */
2191 if (ir->value.u[i] == ir->value.u[j])
2192 dst->writemask |= (1 << j);
2193 }
2194 }
2195
2196 switch (ir->type->base_type) {
2197 case GLSL_TYPE_FLOAT:
2198 emit(MOV(*dst, src_reg(ir->value.f[i])));
2199 break;
2200 case GLSL_TYPE_INT:
2201 emit(MOV(*dst, src_reg(ir->value.i[i])));
2202 break;
2203 case GLSL_TYPE_UINT:
2204 emit(MOV(*dst, src_reg(ir->value.u[i])));
2205 break;
2206 case GLSL_TYPE_BOOL:
2207 emit(MOV(*dst, src_reg(ir->value.b[i])));
2208 break;
2209 default:
2210 unreachable("Non-float/uint/int/bool constant");
2211 }
2212
2213 remaining_writemask &= ~dst->writemask;
2214 }
2215 dst->reg_offset++;
2216 }
2217
2218 void
2219 vec4_visitor::visit(ir_constant *ir)
2220 {
2221 dst_reg dst = dst_reg(this, ir->type);
2222 this->result = src_reg(dst);
2223
2224 emit_constant_values(&dst, ir);
2225 }
2226
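/**
 * Translate the atomic counter intrinsics (read/increment/predecrement)
 * into untyped surface read / untyped atomic messages aimed at the
 * counter's binding table surface and byte offset within it.
 */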
2227 void
2228 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2229 {
2230 ir_dereference *deref = static_cast<ir_dereference *>(
2231 ir->actual_parameters.get_head());
2232 ir_variable *location = deref->variable_referenced();
2233 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2234 location->data.atomic.buffer_index);
2235
2236 /* Calculate the surface offset */
2237 src_reg offset(this, glsl_type::uint_type);
2238 ir_dereference_array *deref_array = deref->as_dereference_array();
2239 if (deref_array) {
2240 deref_array->array_index->accept(this);
2241
2242 src_reg tmp(this, glsl_type::uint_type);
2243 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2244 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2245 } else {
2246 offset = location->data.atomic.offset;
2247 }
2248
2249 /* Emit the appropriate machine instruction */
2250 const char *callee = ir->callee->function_name();
2251 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2252
2253 if (!strcmp("__intrinsic_atomic_read", callee)) {
2254 emit_untyped_surface_read(surf_index, dst, offset);
2255
2256 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2257 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2258 src_reg(), src_reg());
2259
2260 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2261 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2262 src_reg(), src_reg());
2263 }
2264 }
2265
2266 void
2267 vec4_visitor::visit(ir_call *ir)
2268 {
2269 const char *callee = ir->callee->function_name();
2270
2271 if (!strcmp("__intrinsic_atomic_read", callee) ||
2272 !strcmp("__intrinsic_atomic_increment", callee) ||
2273 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2274 visit_atomic_counter_intrinsic(ir);
2275 } else {
2276 unreachable("Unsupported intrinsic.");
2277 }
2278 }
2279
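/**
 * Emit a TXF_MCS message to fetch the multisample control surface data for
 * the given coordinate, as needed for compressed multisample surfaces on
 * Gen7+.  Returns the register holding the MCS value.
 */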
2280 src_reg
2281 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, uint32_t sampler)
2282 {
2283 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2284 inst->base_mrf = 2;
2285 inst->mlen = 1;
2286 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2287 inst->dst.writemask = WRITEMASK_XYZW;
2288
2289 inst->src[1] = src_reg(sampler);
2290
2291 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2292 int param_base = inst->base_mrf;
2293 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2294 int zero_mask = 0xf & ~coord_mask;
2295
2296 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2297 coordinate));
2298
2299 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2300 src_reg(0)));
2301
2302 emit(inst);
2303 return src_reg(inst->dst);
2304 }
2305
2306 void
2307 vec4_visitor::visit(ir_texture *ir)
2308 {
2309 uint32_t sampler =
2310 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2311
2312 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2313 * emitting anything other than setting up the constant result.
2314 */
2315 if (ir->op == ir_tg4) {
2316 ir_constant *chan = ir->lod_info.component->as_constant();
2317 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2318 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2319 dst_reg result(this, ir->type);
2320 this->result = src_reg(result);
2321 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2322 return;
2323 }
2324 }
2325
2326 /* Should be lowered by do_lower_texture_projection */
2327 assert(!ir->projector);
2328
2329 /* Should be lowered */
2330 assert(!ir->offset || !ir->offset->type->is_array());
2331
2332 /* Generate code to compute all the subexpression trees. This has to be
2333 * done before loading any values into MRFs for the sampler message since
2334 * generating these values may involve SEND messages that need the MRFs.
2335 */
2336 src_reg coordinate;
2337 if (ir->coordinate) {
2338 ir->coordinate->accept(this);
2339 coordinate = this->result;
2340 }
2341
2342 src_reg shadow_comparitor;
2343 if (ir->shadow_comparitor) {
2344 ir->shadow_comparitor->accept(this);
2345 shadow_comparitor = this->result;
2346 }
2347
2348 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2349 src_reg offset_value;
2350 if (has_nonconstant_offset) {
2351 ir->offset->accept(this);
2352 offset_value = src_reg(this->result);
2353 }
2354
2355 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2356 src_reg lod, dPdx, dPdy, sample_index, mcs;
2357 switch (ir->op) {
2358 case ir_tex:
2359 lod = src_reg(0.0f);
2360 lod_type = glsl_type::float_type;
2361 break;
2362 case ir_txf:
2363 case ir_txl:
2364 case ir_txs:
2365 ir->lod_info.lod->accept(this);
2366 lod = this->result;
2367 lod_type = ir->lod_info.lod->type;
2368 break;
2369 case ir_query_levels:
2370 lod = src_reg(0);
2371 lod_type = glsl_type::int_type;
2372 break;
2373 case ir_txf_ms:
2374 ir->lod_info.sample_index->accept(this);
2375 sample_index = this->result;
2376 sample_index_type = ir->lod_info.sample_index->type;
2377
2378 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2379 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2380 else
2381 mcs = src_reg(0u);
2382 break;
2383 case ir_txd:
2384 ir->lod_info.grad.dPdx->accept(this);
2385 dPdx = this->result;
2386
2387 ir->lod_info.grad.dPdy->accept(this);
2388 dPdy = this->result;
2389
2390 lod_type = ir->lod_info.grad.dPdx->type;
2391 break;
2392 case ir_txb:
2393 case ir_lod:
2394 case ir_tg4:
2395 break;
2396 }
2397
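/* Select the sampler message opcode.  Note that ir_tex maps to TXL:
 * vertex shaders cannot compute implicit derivatives, so an ordinary
 * texture() lookup is performed as an explicit-LOD lookup using the
 * lod of 0.0 loaded above.
 */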
2398 enum opcode opcode;
2399 switch (ir->op) {
2400 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2401 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2402 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2403 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2404 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2405 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2406 case ir_tg4: opcode = has_nonconstant_offset
2407 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2408 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2409 case ir_txb:
2410 unreachable("TXB is not valid for vertex shaders.");
2411 case ir_lod:
2412 unreachable("LOD is not valid for vertex shaders.");
2413 default:
2414 unreachable("Unrecognized tex op");
2415 }
2416
2417 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2418
2419 if (ir->offset != NULL && ir->op != ir_txf)
2420 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2421
2422 /* Stuff the channel select bits in the top of the texture offset */
2423 if (ir->op == ir_tg4)
2424 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2425
2426 /* The message header is necessary for:
2427 * - Gen4 (always)
2428 * - Texel offsets
2429 * - Gather channel selection
2430 * - Sampler indices too large to fit in a 4-bit value.
2431 */
2432 inst->header_present =
2433 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2434 sampler >= 16;
2435 inst->base_mrf = 2;
2436 inst->mlen = inst->header_present + 1; /* always at least one */
2437 inst->dst = dst_reg(this, ir->type);
2438 inst->dst.writemask = WRITEMASK_XYZW;
2439 inst->shadow_compare = ir->shadow_comparitor != NULL;
2440
2441 inst->src[1] = src_reg(sampler);
2442
2443 /* MRF for the first parameter */
2444 int param_base = inst->base_mrf + inst->header_present;
2445
2446 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2447 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2448 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2449 } else {
2450 /* Load the coordinate */
2451 /* FINISHME: gl_clamp_mask and saturate */
2452 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2453 int zero_mask = 0xf & ~coord_mask;
2454
2455 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2456 coordinate));
2457
2458 if (zero_mask != 0) {
2459 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2460 src_reg(0)));
2461 }
2462 /* Load the shadow comparitor */
2463 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2464 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2465 WRITEMASK_X),
2466 shadow_comparitor));
2467 inst->mlen++;
2468 }
2469
2470 /* Load the LOD info */
2471 if (ir->op == ir_tex || ir->op == ir_txl) {
2472 int mrf, writemask;
2473 if (brw->gen >= 5) {
2474 mrf = param_base + 1;
2475 if (ir->shadow_comparitor) {
2476 writemask = WRITEMASK_Y;
2477 /* mlen already incremented */
2478 } else {
2479 writemask = WRITEMASK_X;
2480 inst->mlen++;
2481 }
2482 } else /* brw->gen == 4 */ {
2483 mrf = param_base;
2484 writemask = WRITEMASK_W;
2485 }
2486 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2487 } else if (ir->op == ir_txf) {
2488 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2489 } else if (ir->op == ir_txf_ms) {
2490 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2491 sample_index));
2492 if (brw->gen >= 7) {
2493 /* MCS data is in the first channel of `mcs`, but we need to get it into
2494 * the .y channel of the second vec4 of params, so replicate .x across
2495 * the whole vec4 and then mask off everything except .y
2496 */
2497 mcs.swizzle = BRW_SWIZZLE_XXXX;
2498 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2499 mcs));
}
2500 inst->mlen++;
2501 } else if (ir->op == ir_txd) {
2502 const glsl_type *type = lod_type;
2503
2504 if (brw->gen >= 5) {
2505 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2506 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2507 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2508 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2509 inst->mlen++;
2510
2511 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2512 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2513 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2514 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2515 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2516 inst->mlen++;
2517
2518 if (ir->shadow_comparitor) {
2519 emit(MOV(dst_reg(MRF, param_base + 2,
2520 ir->shadow_comparitor->type, WRITEMASK_Z),
2521 shadow_comparitor));
2522 }
2523 }
2524 } else /* brw->gen == 4 */ {
2525 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2526 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2527 inst->mlen += 2;
2528 }
2529 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2530 if (ir->shadow_comparitor) {
2531 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2532 shadow_comparitor));
2533 }
2534
2535 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2536 offset_value));
2537 inst->mlen++;
2538 }
2539 }
2540
2541 emit(inst);
2542
2543 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2544 * spec requires layers.
2545 */
2546 if (ir->op == ir_txs) {
2547 glsl_type const *type = ir->sampler->type;
2548 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2549 type->sampler_array) {
2550 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2551 writemask(inst->dst, WRITEMASK_Z),
2552 src_reg(inst->dst), src_reg(6));
2553 }
2554 }
2555
2556 if (brw->gen == 6 && ir->op == ir_tg4) {
2557 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2558 }
2559
2560 swizzle_result(ir, src_reg(inst->dst), sampler);
2561 }
2562
2563 /**
2564 * Apply workarounds for Gen6 gather with UINT/SINT
2565 */
2566 void
2567 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2568 {
2569 if (!wa)
2570 return;
2571
2572 int width = (wa & WA_8BIT) ? 8 : 16;
2573 dst_reg dst_f = dst;
2574 dst_f.type = BRW_REGISTER_TYPE_F;
2575
2576 /* Convert from UNORM to UINT */
2577 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2578 emit(MOV(dst, src_reg(dst_f)));
2579
2580 if (wa & WA_SIGN) {
2581 /* Reinterpret the UINT value as a signed INT value by
2582 * shifting the sign bit into place, then shifting back
2583 * preserving sign.
2584 */
2585 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2586 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2587 }
2588 }
2589
2590 /**
2591 * Set up the gather channel based on the swizzle, for gather4.
2592 */
2593 uint32_t
2594 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2595 {
2596 ir_constant *chan = ir->lod_info.component->as_constant();
2597 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2598 switch (swiz) {
2599 case SWIZZLE_X: return 0;
2600 case SWIZZLE_Y:
2601 /* gather4 sampler is broken for green channel on RG32F --
2602 * we must ask for blue instead.
2603 */
2604 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2605 return 2;
2606 return 1;
2607 case SWIZZLE_Z: return 2;
2608 case SWIZZLE_W: return 3;
2609 default:
2610 unreachable("Not reached"); /* zero, one swizzles handled already */
2611 }
2612 }
2613
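/**
 * Apply the GL texture swizzle (including ZERO/ONE components) to the raw
 * sampler result and store the swizzled value in this->result.
 */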
2614 void
2615 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2616 {
2617 int s = key->tex.swizzles[sampler];
2618
2619 this->result = src_reg(this, ir->type);
2620 dst_reg swizzled_result(this->result);
2621
2622 if (ir->op == ir_query_levels) {
2623 /* # levels is in .w */
2624 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2625 emit(MOV(swizzled_result, orig_val));
2626 return;
2627 }
2628
2629 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2630 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2631 emit(MOV(swizzled_result, orig_val));
2632 return;
2633 }
2634
2635
2636 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2637 int swizzle[4] = {0};
2638
2639 for (int i = 0; i < 4; i++) {
2640 switch (GET_SWZ(s, i)) {
2641 case SWIZZLE_ZERO:
2642 zero_mask |= (1 << i);
2643 break;
2644 case SWIZZLE_ONE:
2645 one_mask |= (1 << i);
2646 break;
2647 default:
2648 copy_mask |= (1 << i);
2649 swizzle[i] = GET_SWZ(s, i);
2650 break;
2651 }
2652 }
2653
2654 if (copy_mask) {
2655 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2656 swizzled_result.writemask = copy_mask;
2657 emit(MOV(swizzled_result, orig_val));
2658 }
2659
2660 if (zero_mask) {
2661 swizzled_result.writemask = zero_mask;
2662 emit(MOV(swizzled_result, src_reg(0.0f)));
2663 }
2664
2665 if (one_mask) {
2666 swizzled_result.writemask = one_mask;
2667 emit(MOV(swizzled_result, src_reg(1.0f)));
2668 }
2669 }
2670
2671 void
2672 vec4_visitor::visit(ir_return *)
2673 {
2674 unreachable("not reached");
2675 }
2676
2677 void
2678 vec4_visitor::visit(ir_discard *)
2679 {
2680 unreachable("not reached");
2681 }
2682
2683 void
2684 vec4_visitor::visit(ir_if *ir)
2685 {
2686 /* Don't point the annotation at the if statement, because then it plus
2687 * the then and else blocks get printed.
2688 */
2689 this->base_ir = ir->condition;
2690
2691 if (brw->gen == 6) {
2692 emit_if_gen6(ir);
2693 } else {
2694 enum brw_predicate predicate;
2695 emit_bool_to_cond_code(ir->condition, &predicate);
2696 emit(IF(predicate));
2697 }
2698
2699 visit_instructions(&ir->then_instructions);
2700
2701 if (!ir->else_instructions.is_empty()) {
2702 this->base_ir = ir->condition;
2703 emit(BRW_OPCODE_ELSE);
2704
2705 visit_instructions(&ir->else_instructions);
2706 }
2707
2708 this->base_ir = ir->condition;
2709 emit(BRW_OPCODE_ENDIF);
2710 }
2711
2712 void
2713 vec4_visitor::visit(ir_emit_vertex *)
2714 {
2715 unreachable("not reached");
2716 }
2717
2718 void
2719 vec4_visitor::visit(ir_end_primitive *)
2720 {
2721 unreachable("not reached");
2722 }
2723
2724 void
2725 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2726 dst_reg dst, src_reg offset,
2727 src_reg src0, src_reg src1)
2728 {
2729 unsigned mlen = 0;
2730
2731 /* Set the atomic operation offset. */
2732 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2733 mlen++;
2734
2735 /* Set the atomic operation arguments. */
2736 if (src0.file != BAD_FILE) {
2737 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2738 mlen++;
2739 }
2740
2741 if (src1.file != BAD_FILE) {
2742 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2743 mlen++;
2744 }
2745
2746 /* Emit the instruction. Note that this maps to the normal SIMD8
2747 * untyped atomic message on Ivy Bridge, but that's OK because
2748 * unused channels will be masked out.
2749 */
2750 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2751 src_reg(atomic_op), src_reg(surf_index));
2752 inst->base_mrf = 0;
2753 inst->mlen = mlen;
2754 }
2755
2756 void
2757 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2758 src_reg offset)
2759 {
2760 /* Set the surface read offset. */
2761 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2762
2763 /* Emit the instruction. Note that this maps to the normal SIMD8
2764 * untyped surface read message, but that's OK because unused
2765 * channels will be masked out.
2766 */
2767 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2768 dst, src_reg(surf_index));
2769 inst->base_mrf = 0;
2770 inst->mlen = 1;
2771 }
2772
2773 void
2774 vec4_visitor::emit_ndc_computation()
2775 {
2776 /* Get the position */
2777 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2778
2779 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2780 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2781 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2782
2783 current_annotation = "NDC";
2784 dst_reg ndc_w = ndc;
2785 ndc_w.writemask = WRITEMASK_W;
2786 src_reg pos_w = pos;
2787 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2788 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2789
2790 dst_reg ndc_xyz = ndc;
2791 ndc_xyz.writemask = WRITEMASK_XYZ;
2792
2793 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2794 }
2795
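/**
 * Write the PSIZ/flags VUE slot.  On Gen4-5 this packs point size, user
 * clip flags and the negative-RHW workaround bit into a single DWord; on
 * Gen6+ it writes point size, layer and viewport index into their own
 * channels.
 */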
2796 void
2797 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2798 {
2799 if (brw->gen < 6 &&
2800 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2801 key->userclip_active || brw->has_negative_rhw_bug)) {
2802 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2803 dst_reg header1_w = header1;
2804 header1_w.writemask = WRITEMASK_W;
2805
2806 emit(MOV(header1, 0u));
2807
2808 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2809 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2810
2811 current_annotation = "Point size";
2812 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2813 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2814 }
2815
2816 if (key->userclip_active) {
2817 current_annotation = "Clipping flags";
2818 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2819 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2820
2821 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2822 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2823 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2824
2825 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2826 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2827 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2828 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2829 }
2830
2831 /* i965 clipping workaround:
2832 * 1) Test for -ve rhw
2833 * 2) If set,
2834 * set ndc = (0,0,0,0)
2835 * set ucp[6] = 1
2836 *
2837 * Later, clipping will detect ucp[6] and ensure the primitive is
2838 * clipped against all fixed planes.
2839 */
2840 if (brw->has_negative_rhw_bug) {
2841 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2842 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2843 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2844 vec4_instruction *inst;
2845 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2846 inst->predicate = BRW_PREDICATE_NORMAL;
2847 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2848 inst->predicate = BRW_PREDICATE_NORMAL;
2849 }
2850
2851 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2852 } else if (brw->gen < 6) {
2853 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2854 } else {
2855 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2856 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2857 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2858 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2859 }
2860 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2861 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2862 src_reg(output_reg[VARYING_SLOT_LAYER])));
2863 }
2864 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2865 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2866 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2867 }
2868 }
2869 }
2870
2871 void
2872 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2873 {
2874 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2875 *
2876 * "If a linked set of shaders forming the vertex stage contains no
2877 * static write to gl_ClipVertex or gl_ClipDistance, but the
2878 * application has requested clipping against user clip planes through
2879 * the API, then the coordinate written to gl_Position is used for
2880 * comparison against the user clip planes."
2881 *
2882 * This function is only called if the shader didn't write to
2883 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2884 * if the user wrote to it; otherwise we use gl_Position.
2885 */
2886 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2887 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2888 clip_vertex = VARYING_SLOT_POS;
2889 }
2890
2891 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2892 ++i) {
2893 reg.writemask = 1 << i;
2894 emit(DP4(reg,
2895 src_reg(output_reg[clip_vertex]),
2896 src_reg(this->userplane[i + offset])));
2897 }
2898 }
2899
2900 void
2901 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2902 {
2903 assert (varying < VARYING_SLOT_MAX);
2904 reg.type = output_reg[varying].type;
2905 current_annotation = output_reg_annotation[varying];
2906 /* Copy the register, saturating if necessary */
2907 vec4_instruction *inst = emit(MOV(reg,
2908 src_reg(output_reg[varying])));
2909 if ((varying == VARYING_SLOT_COL0 ||
2910 varying == VARYING_SLOT_COL1 ||
2911 varying == VARYING_SLOT_BFC0 ||
2912 varying == VARYING_SLOT_BFC1) &&
2913 key->clamp_vertex_color) {
2914 inst->saturate = true;
2915 }
2916 }
2917
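/**
 * Emit the MOV(s) that fill one VUE slot in MRF space, handling the slots
 * that need special treatment (PSIZ/flags, NDC, position, edge flag)
 * before falling back to a generic copy of the output register.
 */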
2918 void
2919 vec4_visitor::emit_urb_slot(int mrf, int varying)
2920 {
2921 struct brw_reg hw_reg = brw_message_reg(mrf);
2922 dst_reg reg = dst_reg(MRF, mrf);
2923 reg.type = BRW_REGISTER_TYPE_F;
2924
2925 switch (varying) {
2926 case VARYING_SLOT_PSIZ:
2927 /* PSIZ is always in slot 0, and is coupled with other flags. */
2928 current_annotation = "indices, point width, clip flags";
2929 emit_psiz_and_flags(hw_reg);
2930 break;
2931 case BRW_VARYING_SLOT_NDC:
2932 current_annotation = "NDC";
2933 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2934 break;
2935 case VARYING_SLOT_POS:
2936 current_annotation = "gl_Position";
2937 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2938 break;
2939 case VARYING_SLOT_EDGE:
2940 /* This is present when doing unfilled polygons. We're supposed to copy
2941 * the edge flag from the user-provided vertex array
2942 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2943 * of that attribute (starts as 1.0f). This is then used in clipping to
2944 * determine which edges should be drawn as wireframe.
2945 */
2946 current_annotation = "edge flag";
2947 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2948 glsl_type::float_type, WRITEMASK_XYZW))));
2949 break;
2950 case BRW_VARYING_SLOT_PAD:
2951 /* No need to write to this slot */
2952 break;
2953 default:
2954 emit_generic_urb_slot(reg, varying);
2955 break;
2956 }
2957 }
2958
2959 static int
2960 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2961 {
2962 if (brw->gen >= 6) {
2963 /* URB data written (does not include the message header reg) must
2964 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2965 * section 5.4.3.2.2: URB_INTERLEAVED.
2966 *
2967 * URB entries are allocated on a multiple of 1024 bits, so an
2968 * extra 128 bits written here to make the end align to 256 is
2969 * no problem.
2970 */
2971 if ((mlen % 2) != 1)
2972 mlen++;
2973 }
2974
2975 return mlen;
2976 }
2977
2978
2979 /**
2980 * Generates the VUE payload plus the necessary URB write instructions to
2981 * output it.
2982 *
2983 * The VUE layout is documented in Volume 2a.
2984 */
2985 void
2986 vec4_visitor::emit_vertex()
2987 {
2988 /* MRF 0 is reserved for the debugger, so start with message header
2989 * in MRF 1.
2990 */
2991 int base_mrf = 1;
2992 int mrf = base_mrf;
2993 /* In the process of generating our URB write message contents, we
2994 * may need to unspill a register or load from an array. Those
2995 * reads would use MRFs 14-15.
2996 */
2997 int max_usable_mrf = 13;
2998
2999 /* The following assertion verifies that max_usable_mrf causes an
3000 * even-numbered amount of URB write data, which will meet gen6's
3001 * requirements for length alignment.
3002 */
3003 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3004
3005 /* First mrf is the g0-based message header containing URB handles and
3006 * such.
3007 */
3008 emit_urb_write_header(mrf++);
3009
3010 if (brw->gen < 6) {
3011 emit_ndc_computation();
3012 }
3013
3014 /* Lower legacy ff and ClipVertex clipping to clip distances */
3015 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3016 current_annotation = "user clip distances";
3017
3018 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3019 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3020
3021 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3022 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3023 }
3024
3025 /* We may need to split this up into several URB writes, so do them in a
3026 * loop.
3027 */
3028 int slot = 0;
3029 bool complete = false;
3030 do {
3031 /* URB offset is in URB row increments, and each of our MRFs is half of
3032 * one of those, since we're doing interleaved writes.
3033 */
3034 int offset = slot / 2;
3035
3036 mrf = base_mrf + 1;
3037 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3038 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3039
3040 /* If this was max_usable_mrf, we can't fit anything more into this
3041 * URB WRITE.
3042 */
3043 if (mrf > max_usable_mrf) {
3044 slot++;
3045 break;
3046 }
3047 }
3048
3049 complete = slot >= prog_data->vue_map.num_slots;
3050 current_annotation = "URB write";
3051 vec4_instruction *inst = emit_urb_write_opcode(complete);
3052 inst->base_mrf = base_mrf;
3053 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3054 inst->offset += offset;
3055 } while(!complete);
3056 }
3057
3058
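/**
 * Compute the message source for a scratch read/write at @reg_offset,
 * scaling by 2 because scratch data is stored interleaved like vertex
 * data (and by 16 for byte offsets pre-Gen6).  For relative addressing,
 * the address math is emitted before @inst.
 */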
3059 src_reg
3060 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3061 src_reg *reladdr, int reg_offset)
3062 {
3063 /* Because we store the values to scratch interleaved like our
3064 * vertex data, we need to scale the vec4 index by 2.
3065 */
3066 int message_header_scale = 2;
3067
3068 /* Pre-gen6, the message header uses byte offsets instead of vec4
3069 * (16-byte) offset units.
3070 */
3071 if (brw->gen < 6)
3072 message_header_scale *= 16;
3073
3074 if (reladdr) {
3075 src_reg index = src_reg(this, glsl_type::int_type);
3076
3077 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3078 emit_before(inst, MUL(dst_reg(index),
3079 index, src_reg(message_header_scale)));
3080
3081 return index;
3082 } else {
3083 return src_reg(reg_offset * message_header_scale);
3084 }
3085 }
3086
3087 src_reg
3088 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3089 src_reg *reladdr, int reg_offset)
3090 {
3091 if (reladdr) {
3092 src_reg index = src_reg(this, glsl_type::int_type);
3093
3094 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3095
3096 /* Pre-gen6, the message header uses byte offsets instead of vec4
3097 * (16-byte) offset units.
3098 */
3099 if (brw->gen < 6) {
3100 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3101 }
3102
3103 return index;
3104 } else if (brw->gen >= 8) {
3105 /* Store the offset in a GRF so we can send-from-GRF. */
3106 src_reg offset = src_reg(this, glsl_type::int_type);
3107 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3108 return offset;
3109 } else {
3110 int message_header_scale = brw->gen < 6 ? 16 : 1;
3111 return src_reg(reg_offset * message_header_scale);
3112 }
3113 }
3114
3115 /**
3116 * Emits an instruction before @inst to load the value named by @orig_src
3117 * from scratch space at @base_offset to @temp.
3118 *
3119 * @base_offset is measured in 32-byte units (the size of a register).
3120 */
3121 void
3122 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3123 dst_reg temp, src_reg orig_src,
3124 int base_offset)
3125 {
3126 int reg_offset = base_offset + orig_src.reg_offset;
3127 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3128
3129 emit_before(inst, SCRATCH_READ(temp, index));
3130 }
3131
3132 /**
3133 * Emits an instruction after @inst to store the value to be written
3134 * to @orig_dst to scratch space at @base_offset, from @temp.
3135 *
3136 * @base_offset is measured in 32-byte units (the size of a register).
3137 */
3138 void
3139 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3140 {
3141 int reg_offset = base_offset + inst->dst.reg_offset;
3142 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3143
3144 /* Create a temporary register to store *inst's result in.
3145 *
3146 * We have to be careful in MOVing from our temporary result register in
3147 * the scratch write. If we swizzle from channels of the temporary that
3148 * weren't initialized, it will confuse live interval analysis, which will
3149 * make spilling fail to make progress.
3150 */
3151 src_reg temp = src_reg(this, glsl_type::vec4_type);
3152 temp.type = inst->dst.type;
3153 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3154 int swizzles[4];
3155 for (int i = 0; i < 4; i++)
3156 if (inst->dst.writemask & (1 << i))
3157 swizzles[i] = i;
3158 else
3159 swizzles[i] = first_writemask_chan;
3160 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3161 swizzles[2], swizzles[3]);
3162
3163 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3164 inst->dst.writemask));
3165 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3166 write->predicate = inst->predicate;
3167 write->ir = inst->ir;
3168 write->annotation = inst->annotation;
3169 inst->insert_after(write);
3170
3171 inst->dst.file = temp.file;
3172 inst->dst.reg = temp.reg;
3173 inst->dst.reg_offset = temp.reg_offset;
3174 inst->dst.reladdr = NULL;
3175 }
3176
3177 /**
3178 * We can't generally support array access in GRF space, because a
3179 * single instruction's destination can only span 2 contiguous
3180 * registers. So, we send all GRF arrays that get variable index
3181 * access to scratch space.
3182 */
3183 void
3184 vec4_visitor::move_grf_array_access_to_scratch()
3185 {
3186 int scratch_loc[this->virtual_grf_count];
3187
3188 for (int i = 0; i < this->virtual_grf_count; i++) {
3189 scratch_loc[i] = -1;
3190 }
3191
3192 /* First, calculate the set of virtual GRFs that need to be punted
3193 * to scratch due to having any array access on them, and where in
3194 * scratch.
3195 */
3196 foreach_in_list(vec4_instruction, inst, &instructions) {
3197 if (inst->dst.file == GRF && inst->dst.reladdr &&
3198 scratch_loc[inst->dst.reg] == -1) {
3199 scratch_loc[inst->dst.reg] = c->last_scratch;
3200 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3201 }
3202
3203 for (int i = 0 ; i < 3; i++) {
3204 src_reg *src = &inst->src[i];
3205
3206 if (src->file == GRF && src->reladdr &&
3207 scratch_loc[src->reg] == -1) {
3208 scratch_loc[src->reg] = c->last_scratch;
3209 c->last_scratch += this->virtual_grf_sizes[src->reg];
3210 }
3211 }
3212 }
3213
3214 /* Now, for anything that will be accessed through scratch, rewrite
3215 * it to load/store. Note that this is a _safe list walk, because
3216 * we may generate a new scratch_write instruction after the one
3217 * we're processing.
3218 */
3219 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3220 /* Set up the annotation tracking for new generated instructions. */
3221 base_ir = inst->ir;
3222 current_annotation = inst->annotation;
3223
3224 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3225 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3226 }
3227
3228 for (int i = 0 ; i < 3; i++) {
3229 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3230 continue;
3231
3232 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3233
3234 emit_scratch_read(inst, temp, inst->src[i],
3235 scratch_loc[inst->src[i].reg]);
3236
3237 inst->src[i].file = temp.file;
3238 inst->src[i].reg = temp.reg;
3239 inst->src[i].reg_offset = temp.reg_offset;
3240 inst->src[i].reladdr = NULL;
3241 }
3242 }
3243 }
3244
3245 /**
3246 * Emits an instruction before @inst to load the value named by @orig_src
3247 * from the pull constant buffer (surface) at @base_offset to @temp.
3248 */
3249 void
3250 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3251 dst_reg temp, src_reg orig_src,
3252 int base_offset)
3253 {
3254 int reg_offset = base_offset + orig_src.reg_offset;
3255 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3256 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3257 vec4_instruction *load;
3258
3259 if (brw->gen >= 7) {
3260 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3261 grf_offset.type = offset.type;
3262 emit_before(inst, MOV(grf_offset, offset));
3263
3264 load = new(mem_ctx) vec4_instruction(this,
3265 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3266 temp, index, src_reg(grf_offset));
3267 } else {
3268 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3269 temp, index, offset);
3270 load->base_mrf = 14;
3271 load->mlen = 1;
3272 }
3273 emit_before(inst, load);
3274 }
3275
3276 /**
3277 * Implements array access of uniforms by inserting a
3278 * PULL_CONSTANT_LOAD instruction.
3279 *
3280 * Unlike temporary GRF array access (where we don't support it due to
3281 * the difficulty of doing relative addressing on instruction
3282 * destinations), we could potentially do array access of uniforms
3283 * that were loaded in GRF space as push constants. In real-world
3284 * usage we've seen, though, the arrays being used are always larger
3285 * than we could load as push constants, so just always move all
3286 * uniform array access out to a pull constant buffer.
3287 */
3288 void
3289 vec4_visitor::move_uniform_array_access_to_pull_constants()
3290 {
3291 int pull_constant_loc[this->uniforms];
3292
3293 for (int i = 0; i < this->uniforms; i++) {
3294 pull_constant_loc[i] = -1;
3295 }
3296
3297 /* Walk through and find array access of uniforms. Put a copy of that
3298 * uniform in the pull constant buffer.
3299 *
3300 * Note that we don't move constant-indexed accesses to arrays. No
3301 * testing has been done of the performance impact of this choice.
3302 */
3303 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3304 for (int i = 0 ; i < 3; i++) {
3305 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3306 continue;
3307
3308 int uniform = inst->src[i].reg;
3309
3310 /* If this array isn't already present in the pull constant buffer,
3311 * add it.
3312 */
3313 if (pull_constant_loc[uniform] == -1) {
3314 const gl_constant_value **values =
3315 &stage_prog_data->param[uniform * 4];
3316
3317 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3318
3319 assert(uniform < uniform_array_size);
3320 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3321 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3322 = values[j];
3323 }
3324 }
3325
3326 /* Set up the annotation tracking for new generated instructions. */
3327 base_ir = inst->ir;
3328 current_annotation = inst->annotation;
3329
3330 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3331
3332 emit_pull_constant_load(inst, temp, inst->src[i],
3333 pull_constant_loc[uniform]);
3334
3335 inst->src[i].file = temp.file;
3336 inst->src[i].reg = temp.reg;
3337 inst->src[i].reg_offset = temp.reg_offset;
3338 inst->src[i].reladdr = NULL;
3339 }
3340 }
3341
3342 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3343 * no need to track them as larger-than-vec4 objects. This will be
3344 * relied on in cutting out unused uniform vectors from push
3345 * constants.
3346 */
3347 split_uniform_registers();
3348 }
3349
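/**
 * If @reg is an unsigned (UD) source with the negate modifier set,
 * materialize the negation through an explicit MOV into a temporary and
 * point @reg at that temporary instead.
 */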
3350 void
3351 vec4_visitor::resolve_ud_negate(src_reg *reg)
3352 {
3353 if (reg->type != BRW_REGISTER_TYPE_UD ||
3354 !reg->negate)
3355 return;
3356
3357 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3358 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3359 *reg = temp;
3360 }
3361
3362 vec4_visitor::vec4_visitor(struct brw_context *brw,
3363 struct brw_vec4_compile *c,
3364 struct gl_program *prog,
3365 const struct brw_vec4_prog_key *key,
3366 struct brw_vec4_prog_data *prog_data,
3367 struct gl_shader_program *shader_prog,
3368 gl_shader_stage stage,
3369 void *mem_ctx,
3370 bool debug_flag,
3371 bool no_spills,
3372 shader_time_shader_type st_base,
3373 shader_time_shader_type st_written,
3374 shader_time_shader_type st_reset)
3375 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3376 c(c),
3377 key(key),
3378 prog_data(prog_data),
3379 sanity_param_count(0),
3380 fail_msg(NULL),
3381 first_non_payload_grf(0),
3382 need_all_constants_in_pull_buffer(false),
3383 debug_flag(debug_flag),
3384 no_spills(no_spills),
3385 st_base(st_base),
3386 st_written(st_written),
3387 st_reset(st_reset)
3388 {
3389 this->mem_ctx = mem_ctx;
3390 this->failed = false;
3391
3392 this->base_ir = NULL;
3393 this->current_annotation = NULL;
3394 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3395
3396 this->variable_ht = hash_table_ctor(0,
3397 hash_table_pointer_hash,
3398 hash_table_pointer_compare);
3399
3400 this->virtual_grf_start = NULL;
3401 this->virtual_grf_end = NULL;
3402 this->virtual_grf_sizes = NULL;
3403 this->virtual_grf_count = 0;
3404 this->virtual_grf_reg_map = NULL;
3405 this->virtual_grf_reg_count = 0;
3406 this->virtual_grf_array_size = 0;
3407 this->live_intervals_valid = false;
3408
3409 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3410
3411 this->uniforms = 0;
3412
3413 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3414 * at least one. See setup_uniforms() in brw_vec4.cpp.
3415 */
3416 this->uniform_array_size = 1;
3417 if (prog_data) {
3418 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3419 }
3420
3421 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3422 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3423 }
3424
3425 vec4_visitor::~vec4_visitor()
3426 {
3427 hash_table_dtor(this->variable_ht);
3428 }
3429
3430
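/**
 * Mark the compile as failed and record the first failure message,
 * printing it to stderr when the debug flag is set.
 */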
3431 void
3432 vec4_visitor::fail(const char *format, ...)
3433 {
3434 va_list va;
3435 char *msg;
3436
3437 if (failed)
3438 return;
3439
3440 failed = true;
3441
3442 va_start(va, format);
3443 msg = ralloc_vasprintf(mem_ctx, format, va);
3444 va_end(va);
3445 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3446
3447 this->fail_msg = msg;
3448
3449 if (debug_flag) {
3450 fprintf(stderr, "%s", msg);
3451 }
3452 }
3453
3454 } /* namespace brw */