ce94db858f9478d4023074d5889f3a7a8d64bb02
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
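/**
 * Append an already-built instruction to the end of the instruction stream.
 */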
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
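/**
 * Insert new_inst immediately before inst, inheriting inst's source IR
 * pointer and annotation for debug output.
 */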
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
123 const src_reg &src1) \
124 { \
125 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
126 src0, src1); \
127 }
128
129 #define ALU2_ACC(op) \
130 vec4_instruction * \
131 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
132 const src_reg &src1) \
133 { \
134 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
135 BRW_OPCODE_##op, dst, src0, src1); \
136 inst->writes_accumulator = true; \
137 return inst; \
138 }
139
140 #define ALU3(op) \
141 vec4_instruction * \
142 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
143 const src_reg &src1, const src_reg &src2) \
144 { \
145 assert(brw->gen >= 6); \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1, src2); \
148 }
149
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU1(F32TO16)
157 ALU1(F16TO32)
158 ALU2(ADD)
159 ALU2(MUL)
160 ALU2_ACC(MACH)
161 ALU2(AND)
162 ALU2(OR)
163 ALU2(XOR)
164 ALU2(DP3)
165 ALU2(DP4)
166 ALU2(DPH)
167 ALU2(SHL)
168 ALU2(SHR)
169 ALU2(ASR)
170 ALU3(LRP)
171 ALU1(BFREV)
172 ALU3(BFE)
173 ALU2(BFI1)
174 ALU3(BFI2)
175 ALU1(FBH)
176 ALU1(FBL)
177 ALU1(CBIT)
178 ALU3(MAD)
179 ALU2_ACC(ADDC)
180 ALU2_ACC(SUBB)
181 ALU2(MAC)
182
183 /** Gen4 predicated IF. */
184 vec4_instruction *
185 vec4_visitor::IF(enum brw_predicate predicate)
186 {
187 vec4_instruction *inst;
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
190 inst->predicate = predicate;
191
192 return inst;
193 }
194
195 /** Gen6 IF with embedded comparison. */
196 vec4_instruction *
197 vec4_visitor::IF(src_reg src0, src_reg src1,
198 enum brw_conditional_mod condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
221 enum brw_conditional_mod condition)
222 {
223 vec4_instruction *inst;
224
225 /* original gen4 does type conversion to the destination type
226  * before comparison, producing garbage results for floating
227 * point comparisons.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
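/**
 * Build a gen4-style scratch (spill space) read of one vec4 from the
 * location given by index.  The message occupies two MRFs starting at MRF 14.
 */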
244 vec4_instruction *
245 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
246 {
247 vec4_instruction *inst;
248
249 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
250 dst, index);
251 inst->base_mrf = 14;
252 inst->mlen = 2;
253
254 return inst;
255 }
256
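/**
 * Build a gen4-style scratch (spill space) write of one vec4 to the
 * location given by index.  The message occupies three MRFs starting at MRF 13.
 */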
257 vec4_instruction *
258 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
259 const src_reg &index)
260 {
261 vec4_instruction *inst;
262
263 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
264 dst, src, index);
265 inst->base_mrf = 13;
266 inst->mlen = 3;
267
268 return inst;
269 }
270
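/**
 * Emit a DP2, DP3 or DP4 according to the number of components being
 * dotted together (2-4).
 */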
271 void
272 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
273 {
274 static enum opcode dot_opcodes[] = {
275 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
276 };
277
278 emit(dot_opcodes[elements - 2], dst, src0, src1);
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(src_reg src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(MOV(expanded, src));
304 return src_reg(expanded);
305 }
306
307 src_reg
308 vec4_visitor::fix_math_operand(src_reg src)
309 {
310 /* The gen6 math instruction ignores the source modifiers --
311 * swizzle, abs, negate, and at least some parts of the register
312 * region description.
313 *
314 * Rather than trying to enumerate all these cases, *always* expand the
315 * operand to a temp GRF for gen6.
316 *
317 * For gen7, keep the operand as-is, except if immediate, which gen7 still
318 * can't use.
319 */
320
321 if (brw->gen == 7 && src.file != IMM)
322 return src;
323
324 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
325 expanded.type = src.type;
326 emit(MOV(expanded, src));
327 return src_reg(expanded);
328 }
329
330 void
331 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
332 {
333 src = fix_math_operand(src);
334
335 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
336 /* The gen6 math instruction must be align1, so we can't do
337 * writemasks.
338 */
339 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
340
341 emit(opcode, temp_dst, src);
342
343 emit(MOV(dst, src_reg(temp_dst)));
344 } else {
345 emit(opcode, dst, src);
346 }
347 }
348
349 void
350 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
351 {
352 vec4_instruction *inst = emit(opcode, dst, src);
353 inst->base_mrf = 1;
354 inst->mlen = 1;
355 }
356
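/**
 * Emit a single-operand math instruction (RCP, RSQ, SQRT, EXP2, LOG2,
 * SIN or COS), choosing the emission strategy appropriate for the
 * hardware generation.
 */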
357 void
358 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
359 {
360 switch (opcode) {
361 case SHADER_OPCODE_RCP:
362 case SHADER_OPCODE_RSQ:
363 case SHADER_OPCODE_SQRT:
364 case SHADER_OPCODE_EXP2:
365 case SHADER_OPCODE_LOG2:
366 case SHADER_OPCODE_SIN:
367 case SHADER_OPCODE_COS:
368 break;
369 default:
370 unreachable("not reached: bad math opcode");
371 }
372
373 if (brw->gen >= 8) {
374 emit(opcode, dst, src);
375 } else if (brw->gen >= 6) {
376 emit_math1_gen6(opcode, dst, src);
377 } else {
378 emit_math1_gen4(opcode, dst, src);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen6(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 src0 = fix_math_operand(src0);
387 src1 = fix_math_operand(src1);
388
389 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
390 /* The gen6 math instruction must be align1, so we can't do
391 * writemasks.
392 */
393 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
394 temp_dst.type = dst.type;
395
396 emit(opcode, temp_dst, src0, src1);
397
398 emit(MOV(dst, src_reg(temp_dst)));
399 } else {
400 emit(opcode, dst, src0, src1);
401 }
402 }
403
404 void
405 vec4_visitor::emit_math2_gen4(enum opcode opcode,
406 dst_reg dst, src_reg src0, src_reg src1)
407 {
408 vec4_instruction *inst = emit(opcode, dst, src0, src1);
409 inst->base_mrf = 1;
410 inst->mlen = 2;
411 }
412
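/**
 * Emit a two-operand math instruction (POW, INT_QUOTIENT or
 * INT_REMAINDER), choosing the emission strategy appropriate for the
 * hardware generation.
 */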
413 void
414 vec4_visitor::emit_math(enum opcode opcode,
415 dst_reg dst, src_reg src0, src_reg src1)
416 {
417 switch (opcode) {
418 case SHADER_OPCODE_POW:
419 case SHADER_OPCODE_INT_QUOTIENT:
420 case SHADER_OPCODE_INT_REMAINDER:
421 break;
422 default:
423 unreachable("not reached: unsupported binary math opcode");
424 }
425
426 if (brw->gen >= 8) {
427 emit(opcode, dst, src0, src1);
428 } else if (brw->gen >= 6) {
429 emit_math2_gen6(opcode, dst, src0, src1);
430 } else {
431 emit_math2_gen4(opcode, dst, src0, src1);
432 }
433 }
434
435 void
436 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
437 {
438 if (brw->gen < 7) {
439 unreachable("ir_unop_pack_half_2x16 should be lowered");
440 }
441
442 assert(dst.type == BRW_REGISTER_TYPE_UD);
443 assert(src0.type == BRW_REGISTER_TYPE_F);
444
445 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
446 *
447 * Because this instruction does not have a 16-bit floating-point type,
448 * the destination data type must be Word (W).
449 *
450 * The destination must be DWord-aligned and specify a horizontal stride
451 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
452 * each destination channel and the upper word is not modified.
453 *
454 * The above restriction implies that the f32to16 instruction must use
455 * align1 mode, because only in align1 mode is it possible to specify
456 * horizontal stride. We choose here to defy the hardware docs and emit
457 * align16 instructions.
458 *
459 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
460 * instructions. I was partially successful in that the code passed all
461 * tests. However, the code was dubiously correct and fragile, and the
462 * tests were not harsh enough to probe that frailty. Not trusting the
463 * code, I chose instead to remain in align16 mode in defiance of the hw
464 * docs).
465 *
466 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
467 * simulator, emitting a f32to16 in align16 mode with UD as destination
468 * data type is safe. The behavior differs from that specified in the PRM
469 * in that the upper word of each destination channel is cleared to 0.
470 */
471
472 dst_reg tmp_dst(this, glsl_type::uvec2_type);
473 src_reg tmp_src(tmp_dst);
474
475 #if 0
476 /* Verify the undocumented behavior on which the following instructions
477 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
478 * then the result of the bit-or instruction below will be incorrect.
479 *
480 * You should inspect the disasm output in order to verify that the MOV is
481 * not optimized away.
482 */
483 emit(MOV(tmp_dst, src_reg(0x12345678u)));
484 #endif
485
486 /* Give tmp the form below, where "." means untouched.
487 *
488 * w z y x w z y x
489 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
490 *
491 * That the upper word of each write-channel be 0 is required for the
492 * following bit-shift and bit-or instructions to work. Note that this
493 * relies on the undocumented hardware behavior mentioned above.
494 */
495 tmp_dst.writemask = WRITEMASK_XY;
496 emit(F32TO16(tmp_dst, src0));
497
498 /* Give the write-channels of dst the form:
499 * 0xhhhh0000
500 */
501 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
502 emit(SHL(dst, tmp_src, src_reg(16u)));
503
504 /* Finally, give the write-channels of dst the form of packHalf2x16's
505 * output:
506 * 0xhhhhllll
507 */
508 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
509 emit(OR(dst, src_reg(dst), tmp_src));
510 }
511
512 void
513 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
514 {
515 if (brw->gen < 7) {
516 unreachable("ir_unop_unpack_half_2x16 should be lowered");
517 }
518
519 assert(dst.type == BRW_REGISTER_TYPE_F);
520 assert(src0.type == BRW_REGISTER_TYPE_UD);
521
522 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
523 *
524 * Because this instruction does not have a 16-bit floating-point type,
525 * the source data type must be Word (W). The destination type must be
526 * F (Float).
527 *
528 * To use W as the source data type, we must adjust horizontal strides,
529 * which is only possible in align1 mode. All my [chadv] attempts at
530 * emitting align1 instructions for unpackHalf2x16 failed to pass the
531 * Piglit tests, so I gave up.
532 *
533 * I've verified that, on gen7 hardware and the simulator, it is safe to
534 * emit f16to32 in align16 mode with UD as source data type.
535 */
536
537 dst_reg tmp_dst(this, glsl_type::uvec2_type);
538 src_reg tmp_src(tmp_dst);
539
540 tmp_dst.writemask = WRITEMASK_X;
541 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
542
543 tmp_dst.writemask = WRITEMASK_Y;
544 emit(SHR(tmp_dst, src0, src_reg(16u)));
545
546 dst.writemask = WRITEMASK_XY;
547 emit(F16TO32(dst, tmp_src));
548 }
549
550 void
551 vec4_visitor::visit_instructions(const exec_list *list)
552 {
553 foreach_in_list(ir_instruction, ir, list) {
554 base_ir = ir;
555 ir->accept(this);
556 }
557 }
558
559
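/**
 * Return the number of vec4 slots (virtual GRF registers) occupied by a
 * value of the given GLSL type.
 */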
560 static int
561 type_size(const struct glsl_type *type)
562 {
563 unsigned int i;
564 int size;
565
566 switch (type->base_type) {
567 case GLSL_TYPE_UINT:
568 case GLSL_TYPE_INT:
569 case GLSL_TYPE_FLOAT:
570 case GLSL_TYPE_BOOL:
571 if (type->is_matrix()) {
572 return type->matrix_columns;
573 } else {
574          /* Regardless of the size of the vector, it gets a vec4. This is bad
575 * packing for things like floats, but otherwise arrays become a
576 * mess. Hopefully a later pass over the code can pack scalars
577 * down if appropriate.
578 */
579 return 1;
580 }
581 case GLSL_TYPE_ARRAY:
582 assert(type->length > 0);
583 return type_size(type->fields.array) * type->length;
584 case GLSL_TYPE_STRUCT:
585 size = 0;
586 for (i = 0; i < type->length; i++) {
587 size += type_size(type->fields.structure[i].type);
588 }
589 return size;
590 case GLSL_TYPE_SAMPLER:
591 /* Samplers take up one slot in UNIFORMS[], but they're baked in
592 * at link time.
593 */
594 return 1;
595 case GLSL_TYPE_ATOMIC_UINT:
596 return 0;
597 case GLSL_TYPE_IMAGE:
598 case GLSL_TYPE_VOID:
599 case GLSL_TYPE_ERROR:
600 case GLSL_TYPE_INTERFACE:
601 unreachable("not reached");
602 }
603
604 return 0;
605 }
606
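/**
 * Allocate a new virtual GRF of the given size in vec4 registers, growing
 * the bookkeeping arrays if necessary, and return its index.
 */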
607 int
608 vec4_visitor::virtual_grf_alloc(int size)
609 {
610 if (virtual_grf_array_size <= virtual_grf_count) {
611 if (virtual_grf_array_size == 0)
612 virtual_grf_array_size = 16;
613 else
614 virtual_grf_array_size *= 2;
615 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
616 virtual_grf_array_size);
617 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
618 virtual_grf_array_size);
619 }
620 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
621 virtual_grf_reg_count += size;
622 virtual_grf_sizes[virtual_grf_count] = size;
623 return virtual_grf_count++;
624 }
625
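/**
 * Construct a source register backed by a newly allocated virtual GRF
 * large enough to hold the given GLSL type, with a swizzle appropriate
 * for that type.
 */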
626 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
627 {
628 init();
629
630 this->file = GRF;
631 this->reg = v->virtual_grf_alloc(type_size(type));
632
633 if (type->is_array() || type->is_record()) {
634 this->swizzle = BRW_SWIZZLE_NOOP;
635 } else {
636 this->swizzle = swizzle_for_size(type->vector_elements);
637 }
638
639 this->type = brw_type_for_base_type(type);
640 }
641
642 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
643 {
644 init();
645
646 this->file = GRF;
647 this->reg = v->virtual_grf_alloc(type_size(type));
648
649 if (type->is_array() || type->is_record()) {
650 this->writemask = WRITEMASK_XYZW;
651 } else {
652 this->writemask = (1 << type->vector_elements) - 1;
653 }
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 /* Our support for uniforms is piggy-backed on the struct
659 * gl_fragment_program, because that's where the values actually
660 * get stored, rather than in some global gl_shader_program uniform
661 * store.
662 */
663 void
664 vec4_visitor::setup_uniform_values(ir_variable *ir)
665 {
666 int namelen = strlen(ir->name);
667
668 /* The data for our (non-builtin) uniforms is stored in a series of
669 * gl_uniform_driver_storage structs for each subcomponent that
670 * glGetUniformLocation() could name. We know it's been set up in the same
671 * order we'd walk the type, so walk the list of storage and find anything
672 * with our name, or the prefix of a component that starts with our name.
673 */
674 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
675 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
676
677 if (strncmp(ir->name, storage->name, namelen) != 0 ||
678 (storage->name[namelen] != 0 &&
679 storage->name[namelen] != '.' &&
680 storage->name[namelen] != '[')) {
681 continue;
682 }
683
684 gl_constant_value *components = storage->storage;
685 unsigned vector_count = (MAX2(storage->array_elements, 1) *
686 storage->type->matrix_columns);
687
688 for (unsigned s = 0; s < vector_count; s++) {
689 assert(uniforms < uniform_array_size);
690 uniform_vector_size[uniforms] = storage->type->vector_elements;
691
692 int i;
693 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
694 stage_prog_data->param[uniforms * 4 + i] = components;
695 components++;
696 }
697 for (; i < 4; i++) {
698 static gl_constant_value zero = { 0.0 };
699 stage_prog_data->param[uniforms * 4 + i] = &zero;
700 }
701
702 uniforms++;
703 }
704 }
705 }
706
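/**
 * Set up one vec4 uniform per enabled user clip plane, pointing the
 * parameter slots at the current clip plane values.
 */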
707 void
708 vec4_visitor::setup_uniform_clipplane_values()
709 {
710 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
711
712 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
713 assert(this->uniforms < uniform_array_size);
714 this->uniform_vector_size[this->uniforms] = 4;
715 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
716 this->userplane[i].type = BRW_REGISTER_TYPE_F;
717 for (int j = 0; j < 4; ++j) {
718 stage_prog_data->param[this->uniforms * 4 + j] =
719 (gl_constant_value *) &clip_planes[i][j];
720 }
721 ++this->uniforms;
722 }
723 }
724
725 /* Our support for builtin uniforms is even scarier than non-builtin.
726 * It sits on top of the PROG_STATE_VAR parameters that are
727 * automatically updated from GL context state.
728 */
729 void
730 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
731 {
732 const ir_state_slot *const slots = ir->state_slots;
733 assert(ir->state_slots != NULL);
734
735 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
736       /* This state reference has already been set up by ir_to_mesa,
737 * but we'll get the same index back here. We can reference
738 * ParameterValues directly, since unlike brw_fs.cpp, we never
739 * add new state references during compile.
740 */
741 int index = _mesa_add_state_reference(this->prog->Parameters,
742 (gl_state_index *)slots[i].tokens);
743 gl_constant_value *values =
744 &this->prog->Parameters->ParameterValues[index][0];
745
746 assert(this->uniforms < uniform_array_size);
747 this->uniform_vector_size[this->uniforms] = 0;
748 /* Add each of the unique swizzled channels of the element.
749 * This will end up matching the size of the glsl_type of this field.
750 */
751 int last_swiz = -1;
752 for (unsigned int j = 0; j < 4; j++) {
753 int swiz = GET_SWZ(slots[i].swizzle, j);
754 last_swiz = swiz;
755
756 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
757 assert(this->uniforms < uniform_array_size);
758 if (swiz <= last_swiz)
759 this->uniform_vector_size[this->uniforms]++;
760 }
761 this->uniforms++;
762 }
763 }
764
765 dst_reg *
766 vec4_visitor::variable_storage(ir_variable *var)
767 {
768 return (dst_reg *)hash_table_find(this->variable_ht, var);
769 }
770
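/**
 * Evaluate a boolean rvalue and leave its result in the flag register so
 * that a following instruction can be predicated on it.  *predicate is set
 * to the predicate mode the caller should use (normal, or an ALL4H/ANY4H
 * mode for vector comparisons).
 */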
771 void
772 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
773 enum brw_predicate *predicate)
774 {
775 ir_expression *expr = ir->as_expression();
776
777 *predicate = BRW_PREDICATE_NORMAL;
778
779 if (expr) {
780 src_reg op[2];
781 vec4_instruction *inst;
782
783 assert(expr->get_num_operands() <= 2);
784 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
785 expr->operands[i]->accept(this);
786 op[i] = this->result;
787
788 resolve_ud_negate(&op[i]);
789 }
790
791 switch (expr->operation) {
792 case ir_unop_logic_not:
793 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
794 inst->conditional_mod = BRW_CONDITIONAL_Z;
795 break;
796
797 case ir_binop_logic_xor:
798 inst = emit(XOR(dst_null_d(), op[0], op[1]));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 break;
801
802 case ir_binop_logic_or:
803 inst = emit(OR(dst_null_d(), op[0], op[1]));
804 inst->conditional_mod = BRW_CONDITIONAL_NZ;
805 break;
806
807 case ir_binop_logic_and:
808 inst = emit(AND(dst_null_d(), op[0], op[1]));
809 inst->conditional_mod = BRW_CONDITIONAL_NZ;
810 break;
811
812 case ir_unop_f2b:
813 if (brw->gen >= 6) {
814 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
815 } else {
816 inst = emit(MOV(dst_null_f(), op[0]));
817 inst->conditional_mod = BRW_CONDITIONAL_NZ;
818 }
819 break;
820
821 case ir_unop_i2b:
822 if (brw->gen >= 6) {
823 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
824 } else {
825 inst = emit(MOV(dst_null_d(), op[0]));
826 inst->conditional_mod = BRW_CONDITIONAL_NZ;
827 }
828 break;
829
830 case ir_binop_all_equal:
831 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
832 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
833 break;
834
835 case ir_binop_any_nequal:
836 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
837 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
838 break;
839
840 case ir_unop_any:
841 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
842 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
843 break;
844
845 case ir_binop_greater:
846 case ir_binop_gequal:
847 case ir_binop_less:
848 case ir_binop_lequal:
849 case ir_binop_equal:
850 case ir_binop_nequal:
851 emit(CMP(dst_null_d(), op[0], op[1],
852 brw_conditional_for_comparison(expr->operation)));
853 break;
854
855 default:
856 unreachable("not reached");
857 }
858 return;
859 }
860
861 ir->accept(this);
862
863 resolve_ud_negate(&this->result);
864
865 if (brw->gen >= 6) {
866 vec4_instruction *inst = emit(AND(dst_null_d(),
867 this->result, src_reg(1)));
868 inst->conditional_mod = BRW_CONDITIONAL_NZ;
869 } else {
870 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
871 inst->conditional_mod = BRW_CONDITIONAL_NZ;
872 }
873 }
874
875 /**
876 * Emit a gen6 IF statement with the comparison folded into the IF
877 * instruction.
878 */
879 void
880 vec4_visitor::emit_if_gen6(ir_if *ir)
881 {
882 ir_expression *expr = ir->condition->as_expression();
883
884 if (expr) {
885 src_reg op[2];
886 dst_reg temp;
887
888 assert(expr->get_num_operands() <= 2);
889 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
890 expr->operands[i]->accept(this);
891 op[i] = this->result;
892 }
893
894 switch (expr->operation) {
895 case ir_unop_logic_not:
896 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
897 return;
898
899 case ir_binop_logic_xor:
900 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
901 return;
902
903 case ir_binop_logic_or:
904 temp = dst_reg(this, glsl_type::bool_type);
905 emit(OR(temp, op[0], op[1]));
906 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
907 return;
908
909 case ir_binop_logic_and:
910 temp = dst_reg(this, glsl_type::bool_type);
911 emit(AND(temp, op[0], op[1]));
912 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
913 return;
914
915 case ir_unop_f2b:
916 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
917 return;
918
919 case ir_unop_i2b:
920 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
921 return;
922
923 case ir_binop_greater:
924 case ir_binop_gequal:
925 case ir_binop_less:
926 case ir_binop_lequal:
927 case ir_binop_equal:
928 case ir_binop_nequal:
929 emit(IF(op[0], op[1],
930 brw_conditional_for_comparison(expr->operation)));
931 return;
932
933 case ir_binop_all_equal:
934 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
935 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
936 return;
937
938 case ir_binop_any_nequal:
939 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
940 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
941 return;
942
943 case ir_unop_any:
944 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
945 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
946 return;
947
948 default:
949 unreachable("not reached");
950 }
951 return;
952 }
953
954 ir->condition->accept(this);
955
956 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
957 }
958
959 void
960 vec4_visitor::visit(ir_variable *ir)
961 {
962 dst_reg *reg = NULL;
963
964 if (variable_storage(ir))
965 return;
966
967 switch (ir->data.mode) {
968 case ir_var_shader_in:
969 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
970 break;
971
972 case ir_var_shader_out:
973 reg = new(mem_ctx) dst_reg(this, ir->type);
974
975 for (int i = 0; i < type_size(ir->type); i++) {
976 output_reg[ir->data.location + i] = *reg;
977 output_reg[ir->data.location + i].reg_offset = i;
978 output_reg[ir->data.location + i].type =
979 brw_type_for_base_type(ir->type->get_scalar_type());
980 output_reg_annotation[ir->data.location + i] = ir->name;
981 }
982 break;
983
984 case ir_var_auto:
985 case ir_var_temporary:
986 reg = new(mem_ctx) dst_reg(this, ir->type);
987 break;
988
989 case ir_var_uniform:
990 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
991
992 /* Thanks to the lower_ubo_reference pass, we will see only
993 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
994 * variables, so no need for them to be in variable_ht.
995 *
996 * Atomic counters take no uniform storage, no need to do
997 * anything here.
998 */
999 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
1000 return;
1001
1002 /* Track how big the whole uniform variable is, in case we need to put a
1003 * copy of its data into pull constants for array access.
1004 */
1005 assert(this->uniforms < uniform_array_size);
1006 this->uniform_size[this->uniforms] = type_size(ir->type);
1007
1008 if (!strncmp(ir->name, "gl_", 3)) {
1009 setup_builtin_uniform_values(ir);
1010 } else {
1011 setup_uniform_values(ir);
1012 }
1013 break;
1014
1015 case ir_var_system_value:
1016 reg = make_reg_for_system_value(ir);
1017 break;
1018
1019 default:
1020 unreachable("not reached");
1021 }
1022
1023 reg->type = brw_type_for_base_type(ir->type);
1024 hash_table_insert(this->variable_ht, reg, ir);
1025 }
1026
1027 void
1028 vec4_visitor::visit(ir_loop *ir)
1029 {
1030 /* We don't want debugging output to print the whole body of the
1031 * loop as the annotation.
1032 */
1033 this->base_ir = NULL;
1034
1035 emit(BRW_OPCODE_DO);
1036
1037 visit_instructions(&ir->body_instructions);
1038
1039 emit(BRW_OPCODE_WHILE);
1040 }
1041
1042 void
1043 vec4_visitor::visit(ir_loop_jump *ir)
1044 {
1045 switch (ir->mode) {
1046 case ir_loop_jump::jump_break:
1047 emit(BRW_OPCODE_BREAK);
1048 break;
1049 case ir_loop_jump::jump_continue:
1050 emit(BRW_OPCODE_CONTINUE);
1051 break;
1052 }
1053 }
1054
1055
1056 void
1057 vec4_visitor::visit(ir_function_signature *)
1058 {
1059 unreachable("not reached");
1060 }
1061
1062 void
1063 vec4_visitor::visit(ir_function *ir)
1064 {
1065 /* Ignore function bodies other than main() -- we shouldn't see calls to
1066 * them since they should all be inlined.
1067 */
1068 if (strcmp(ir->name, "main") == 0) {
1069 const ir_function_signature *sig;
1070 exec_list empty;
1071
1072 sig = ir->matching_signature(NULL, &empty, false);
1073
1074 assert(sig);
1075
1076 visit_instructions(&sig->body);
1077 }
1078 }
1079
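/**
 * Try to fuse an add of a multiply into a single MAD.  Returns false
 * (emitting nothing) if the pattern doesn't match or MAD can't be used,
 * e.g. on gen4/5 or for non-float types.
 */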
1080 bool
1081 vec4_visitor::try_emit_mad(ir_expression *ir)
1082 {
1083 /* 3-src instructions were introduced in gen6. */
1084 if (brw->gen < 6)
1085 return false;
1086
1087 /* MAD can only handle floating-point data. */
1088 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1089 return false;
1090
1091 ir_rvalue *nonmul = ir->operands[1];
1092 ir_expression *mul = ir->operands[0]->as_expression();
1093
1094 if (!mul || mul->operation != ir_binop_mul) {
1095 nonmul = ir->operands[0];
1096 mul = ir->operands[1]->as_expression();
1097
1098 if (!mul || mul->operation != ir_binop_mul)
1099 return false;
1100 }
1101
1102 nonmul->accept(this);
1103 src_reg src0 = fix_3src_operand(this->result);
1104
1105 mul->operands[0]->accept(this);
1106 src_reg src1 = fix_3src_operand(this->result);
1107
1108 mul->operands[1]->accept(this);
1109 src_reg src2 = fix_3src_operand(this->result);
1110
1111 this->result = src_reg(this, ir->type);
1112 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1113
1114 return true;
1115 }
1116
1117 bool
1118 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1119 {
1120 /* This optimization relies on CMP setting the destination to 0 when
1121 * false. Early hardware only sets the least significant bit, and
1122 * leaves the other bits undefined. So we can't use it.
1123 */
1124 if (brw->gen < 6)
1125 return false;
1126
1127 ir_expression *const cmp = ir->operands[0]->as_expression();
1128
1129 if (cmp == NULL)
1130 return false;
1131
1132 switch (cmp->operation) {
1133 case ir_binop_less:
1134 case ir_binop_greater:
1135 case ir_binop_lequal:
1136 case ir_binop_gequal:
1137 case ir_binop_equal:
1138 case ir_binop_nequal:
1139 break;
1140
1141 default:
1142 return false;
1143 }
1144
1145 cmp->operands[0]->accept(this);
1146 const src_reg cmp_src0 = this->result;
1147
1148 cmp->operands[1]->accept(this);
1149 const src_reg cmp_src1 = this->result;
1150
1151 this->result = src_reg(this, ir->type);
1152
1153 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1154 brw_conditional_for_comparison(cmp->operation)));
1155
1156 /* If the comparison is false, this->result will just happen to be zero.
1157 */
1158 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1159 this->result, src_reg(1.0f));
1160 inst->predicate = BRW_PREDICATE_NORMAL;
1161 inst->predicate_inverse = true;
1162
1163 return true;
1164 }
1165
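/**
 * Emit a MIN or MAX (selected by the conditional mod): a conditional SEL
 * on gen6+, or a CMP followed by a predicated SEL on earlier generations.
 */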
1166 void
1167 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1168 src_reg src0, src_reg src1)
1169 {
1170 vec4_instruction *inst;
1171
1172 if (brw->gen >= 6) {
1173 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1174 inst->conditional_mod = conditionalmod;
1175 } else {
1176 emit(CMP(dst, src0, src1, conditionalmod));
1177
1178 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1179 inst->predicate = BRW_PREDICATE_NORMAL;
1180 }
1181 }
1182
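/**
 * Emit a linear interpolation of x and y by a: a single LRP on gen6+, or
 * the expanded x*(1-a) + y*a sequence on older hardware.
 */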
1183 void
1184 vec4_visitor::emit_lrp(const dst_reg &dst,
1185 const src_reg &x, const src_reg &y, const src_reg &a)
1186 {
1187 if (brw->gen >= 6) {
1188 /* Note that the instruction's argument order is reversed from GLSL
1189 * and the IR.
1190 */
1191 emit(LRP(dst,
1192 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1193 } else {
1194 /* Earlier generations don't support three source operations, so we
1195 * need to emit x*(1-a) + y*a.
1196 */
1197 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1198 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1199 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1200 y_times_a.writemask = dst.writemask;
1201 one_minus_a.writemask = dst.writemask;
1202 x_times_one_minus_a.writemask = dst.writemask;
1203
1204 emit(MUL(y_times_a, y, a));
1205 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1206 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1207 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1208 }
1209 }
1210
1211 void
1212 vec4_visitor::visit(ir_expression *ir)
1213 {
1214 unsigned int operand;
1215 src_reg op[Elements(ir->operands)];
1216 src_reg result_src;
1217 dst_reg result_dst;
1218 vec4_instruction *inst;
1219
1220 if (ir->operation == ir_binop_add) {
1221 if (try_emit_mad(ir))
1222 return;
1223 }
1224
1225 if (ir->operation == ir_unop_b2f) {
1226 if (try_emit_b2f_of_compare(ir))
1227 return;
1228 }
1229
1230 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1231 this->result.file = BAD_FILE;
1232 ir->operands[operand]->accept(this);
1233 if (this->result.file == BAD_FILE) {
1234 fprintf(stderr, "Failed to get tree for expression operand:\n");
1235 ir->operands[operand]->fprint(stderr);
1236 exit(1);
1237 }
1238 op[operand] = this->result;
1239
1240 /* Matrix expression operands should have been broken down to vector
1241 * operations already.
1242 */
1243 assert(!ir->operands[operand]->type->is_matrix());
1244 }
1245
1246 int vector_elements = ir->operands[0]->type->vector_elements;
1247 if (ir->operands[1]) {
1248 vector_elements = MAX2(vector_elements,
1249 ir->operands[1]->type->vector_elements);
1250 }
1251
1252 this->result.file = BAD_FILE;
1253
1254 /* Storage for our result. Ideally for an assignment we'd be using
1255 * the actual storage for the result here, instead.
1256 */
1257 result_src = src_reg(this, ir->type);
1258 /* convenience for the emit functions below. */
1259 result_dst = dst_reg(result_src);
1260 /* If nothing special happens, this is the result. */
1261 this->result = result_src;
1262 /* Limit writes to the channels that will be used by result_src later.
1263 * This does limit this temp's use as a temporary for multi-instruction
1264 * sequences.
1265 */
1266 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1267
1268 switch (ir->operation) {
1269 case ir_unop_logic_not:
1270 if (ctx->Const.UniformBooleanTrue != 1) {
1271 emit(NOT(result_dst, op[0]));
1272 } else {
1273 emit(XOR(result_dst, op[0], src_reg(1)));
1274 }
1275 break;
1276 case ir_unop_neg:
1277 op[0].negate = !op[0].negate;
1278 emit(MOV(result_dst, op[0]));
1279 break;
1280 case ir_unop_abs:
1281 op[0].abs = true;
1282 op[0].negate = false;
1283 emit(MOV(result_dst, op[0]));
1284 break;
1285
1286 case ir_unop_sign:
1287 if (ir->type->is_float()) {
1288 /* AND(val, 0x80000000) gives the sign bit.
1289 *
1290 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1291 * zero.
1292 */
1293 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1294
1295 op[0].type = BRW_REGISTER_TYPE_UD;
1296 result_dst.type = BRW_REGISTER_TYPE_UD;
1297 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1298
1299 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1300 inst->predicate = BRW_PREDICATE_NORMAL;
1301
1302 this->result.type = BRW_REGISTER_TYPE_F;
1303 } else {
1304 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1305 * -> non-negative val generates 0x00000000.
1306 * Predicated OR sets 1 if val is positive.
1307 */
1308 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1309
1310 emit(ASR(result_dst, op[0], src_reg(31)));
1311
1312 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1313 inst->predicate = BRW_PREDICATE_NORMAL;
1314 }
1315 break;
1316
1317 case ir_unop_rcp:
1318 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1319 break;
1320
1321 case ir_unop_exp2:
1322 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1323 break;
1324 case ir_unop_log2:
1325 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1326 break;
1327 case ir_unop_exp:
1328 case ir_unop_log:
1329 unreachable("not reached: should be handled by ir_explog_to_explog2");
1330 case ir_unop_sin:
1331 case ir_unop_sin_reduced:
1332 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1333 break;
1334 case ir_unop_cos:
1335 case ir_unop_cos_reduced:
1336 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1337 break;
1338
1339 case ir_unop_dFdx:
1340 case ir_unop_dFdx_coarse:
1341 case ir_unop_dFdx_fine:
1342 case ir_unop_dFdy:
1343 case ir_unop_dFdy_coarse:
1344 case ir_unop_dFdy_fine:
1345 unreachable("derivatives not valid in vertex shader");
1346
1347 case ir_unop_bitfield_reverse:
1348 emit(BFREV(result_dst, op[0]));
1349 break;
1350 case ir_unop_bit_count:
1351 emit(CBIT(result_dst, op[0]));
1352 break;
1353 case ir_unop_find_msb: {
1354 src_reg temp = src_reg(this, glsl_type::uint_type);
1355
1356 inst = emit(FBH(dst_reg(temp), op[0]));
1357 inst->dst.writemask = WRITEMASK_XYZW;
1358
1359 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1360 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1361 * subtract the result from 31 to convert the MSB count into an LSB count.
1362 */
1363
1364 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1365 temp.swizzle = BRW_SWIZZLE_NOOP;
1366 emit(MOV(result_dst, temp));
1367
1368 src_reg src_tmp = src_reg(result_dst);
1369 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1370
1371 src_tmp.negate = true;
1372 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1373 inst->predicate = BRW_PREDICATE_NORMAL;
1374 break;
1375 }
1376 case ir_unop_find_lsb:
1377 emit(FBL(result_dst, op[0]));
1378 break;
1379 case ir_unop_saturate:
1380 inst = emit(MOV(result_dst, op[0]));
1381 inst->saturate = true;
1382 break;
1383
1384 case ir_unop_noise:
1385 unreachable("not reached: should be handled by lower_noise");
1386
1387 case ir_binop_add:
1388 emit(ADD(result_dst, op[0], op[1]));
1389 break;
1390 case ir_binop_sub:
1391 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1392
1393 case ir_binop_mul:
1394 if (brw->gen < 8 && ir->type->is_integer()) {
1395 /* For integer multiplication, the MUL uses the low 16 bits of one of
1396 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1397          * accumulates the contribution of the upper 16 bits of that
1398 * operand. If we can determine that one of the args is in the low
1399 * 16 bits, though, we can just emit a single MUL.
1400 */
1401 if (ir->operands[0]->is_uint16_constant()) {
1402 if (brw->gen < 7)
1403 emit(MUL(result_dst, op[0], op[1]));
1404 else
1405 emit(MUL(result_dst, op[1], op[0]));
1406 } else if (ir->operands[1]->is_uint16_constant()) {
1407 if (brw->gen < 7)
1408 emit(MUL(result_dst, op[1], op[0]));
1409 else
1410 emit(MUL(result_dst, op[0], op[1]));
1411 } else {
1412 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1413
1414 emit(MUL(acc, op[0], op[1]));
1415 emit(MACH(dst_null_d(), op[0], op[1]));
1416 emit(MOV(result_dst, src_reg(acc)));
1417 }
1418 } else {
1419 emit(MUL(result_dst, op[0], op[1]));
1420 }
1421 break;
1422 case ir_binop_imul_high: {
1423 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1424
1425 emit(MUL(acc, op[0], op[1]));
1426 emit(MACH(result_dst, op[0], op[1]));
1427 break;
1428 }
1429 case ir_binop_div:
1430 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1431 assert(ir->type->is_integer());
1432 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1433 break;
1434 case ir_binop_carry: {
1435 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1436
1437 emit(ADDC(dst_null_ud(), op[0], op[1]));
1438 emit(MOV(result_dst, src_reg(acc)));
1439 break;
1440 }
1441 case ir_binop_borrow: {
1442 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1443
1444 emit(SUBB(dst_null_ud(), op[0], op[1]));
1445 emit(MOV(result_dst, src_reg(acc)));
1446 break;
1447 }
1448 case ir_binop_mod:
1449 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1450 assert(ir->type->is_integer());
1451 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1452 break;
1453
1454 case ir_binop_less:
1455 case ir_binop_greater:
1456 case ir_binop_lequal:
1457 case ir_binop_gequal:
1458 case ir_binop_equal:
1459 case ir_binop_nequal: {
1460 emit(CMP(result_dst, op[0], op[1],
1461 brw_conditional_for_comparison(ir->operation)));
1462 if (ctx->Const.UniformBooleanTrue == 1) {
1463 emit(AND(result_dst, result_src, src_reg(1)));
1464 }
1465 break;
1466 }
1467
1468 case ir_binop_all_equal:
1469 /* "==" operator producing a scalar boolean. */
1470 if (ir->operands[0]->type->is_vector() ||
1471 ir->operands[1]->type->is_vector()) {
1472 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1473 emit(MOV(result_dst, src_reg(0)));
1474 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1475 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1476 } else {
1477 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1478 if (ctx->Const.UniformBooleanTrue == 1) {
1479 emit(AND(result_dst, result_src, src_reg(1)));
1480 }
1481 }
1482 break;
1483 case ir_binop_any_nequal:
1484 /* "!=" operator producing a scalar boolean. */
1485 if (ir->operands[0]->type->is_vector() ||
1486 ir->operands[1]->type->is_vector()) {
1487 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1488
1489 emit(MOV(result_dst, src_reg(0)));
1490 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1491 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1492 } else {
1493 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1494 if (ctx->Const.UniformBooleanTrue == 1) {
1495 emit(AND(result_dst, result_src, src_reg(1)));
1496 }
1497 }
1498 break;
1499
1500 case ir_unop_any:
1501 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1502 emit(MOV(result_dst, src_reg(0)));
1503
1504 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1505 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1506 break;
1507
1508 case ir_binop_logic_xor:
1509 emit(XOR(result_dst, op[0], op[1]));
1510 break;
1511
1512 case ir_binop_logic_or:
1513 emit(OR(result_dst, op[0], op[1]));
1514 break;
1515
1516 case ir_binop_logic_and:
1517 emit(AND(result_dst, op[0], op[1]));
1518 break;
1519
1520 case ir_binop_dot:
1521 assert(ir->operands[0]->type->is_vector());
1522 assert(ir->operands[0]->type == ir->operands[1]->type);
1523 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1524 break;
1525
1526 case ir_unop_sqrt:
1527 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1528 break;
1529 case ir_unop_rsq:
1530 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1531 break;
1532
1533 case ir_unop_bitcast_i2f:
1534 case ir_unop_bitcast_u2f:
1535 this->result = op[0];
1536 this->result.type = BRW_REGISTER_TYPE_F;
1537 break;
1538
1539 case ir_unop_bitcast_f2i:
1540 this->result = op[0];
1541 this->result.type = BRW_REGISTER_TYPE_D;
1542 break;
1543
1544 case ir_unop_bitcast_f2u:
1545 this->result = op[0];
1546 this->result.type = BRW_REGISTER_TYPE_UD;
1547 break;
1548
1549 case ir_unop_i2f:
1550 case ir_unop_i2u:
1551 case ir_unop_u2i:
1552 case ir_unop_u2f:
1553 case ir_unop_f2i:
1554 case ir_unop_f2u:
1555 emit(MOV(result_dst, op[0]));
1556 break;
1557 case ir_unop_b2i:
1558 if (ctx->Const.UniformBooleanTrue != 1) {
1559 emit(AND(result_dst, op[0], src_reg(1)));
1560 } else {
1561 emit(MOV(result_dst, op[0]));
1562 }
1563 break;
1564 case ir_unop_b2f:
1565 if (ctx->Const.UniformBooleanTrue != 1) {
1566 op[0].type = BRW_REGISTER_TYPE_UD;
1567 result_dst.type = BRW_REGISTER_TYPE_UD;
1568 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1569 result_dst.type = BRW_REGISTER_TYPE_F;
1570 } else {
1571 emit(MOV(result_dst, op[0]));
1572 }
1573 break;
1574 case ir_unop_f2b:
1575 case ir_unop_i2b:
1576 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1577 if (ctx->Const.UniformBooleanTrue == 1) {
1578 emit(AND(result_dst, result_src, src_reg(1)));
1579 }
1580 break;
1581
1582 case ir_unop_trunc:
1583 emit(RNDZ(result_dst, op[0]));
1584 break;
1585 case ir_unop_ceil:
1586 op[0].negate = !op[0].negate;
1587 inst = emit(RNDD(result_dst, op[0]));
1588 this->result.negate = true;
1589 break;
1590 case ir_unop_floor:
1591 inst = emit(RNDD(result_dst, op[0]));
1592 break;
1593 case ir_unop_fract:
1594 inst = emit(FRC(result_dst, op[0]));
1595 break;
1596 case ir_unop_round_even:
1597 emit(RNDE(result_dst, op[0]));
1598 break;
1599
1600 case ir_binop_min:
1601 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1602 break;
1603 case ir_binop_max:
1604 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1605 break;
1606
1607 case ir_binop_pow:
1608 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1609 break;
1610
1611 case ir_unop_bit_not:
1612 inst = emit(NOT(result_dst, op[0]));
1613 break;
1614 case ir_binop_bit_and:
1615 inst = emit(AND(result_dst, op[0], op[1]));
1616 break;
1617 case ir_binop_bit_xor:
1618 inst = emit(XOR(result_dst, op[0], op[1]));
1619 break;
1620 case ir_binop_bit_or:
1621 inst = emit(OR(result_dst, op[0], op[1]));
1622 break;
1623
1624 case ir_binop_lshift:
1625 inst = emit(SHL(result_dst, op[0], op[1]));
1626 break;
1627
1628 case ir_binop_rshift:
1629 if (ir->type->base_type == GLSL_TYPE_INT)
1630 inst = emit(ASR(result_dst, op[0], op[1]));
1631 else
1632 inst = emit(SHR(result_dst, op[0], op[1]));
1633 break;
1634
1635 case ir_binop_bfm:
1636 emit(BFI1(result_dst, op[0], op[1]));
1637 break;
1638
1639 case ir_binop_ubo_load: {
1640 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1641 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1642 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1643 src_reg offset;
1644
1645 /* Now, load the vector from that offset. */
1646 assert(ir->type->is_vector() || ir->type->is_scalar());
1647
1648 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1649 packed_consts.type = result.type;
1650 src_reg surf_index;
1651
1652 if (const_uniform_block) {
1653 /* The block index is a constant, so just emit the binding table entry
1654 * as an immediate.
1655 */
1656 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1657 const_uniform_block->value.u[0]);
1658 } else {
1659 /* The block index is not a constant. Evaluate the index expression
1660 * per-channel and add the base UBO index; the generator will select
1661 * a value from any live channel.
1662 */
1663 surf_index = src_reg(this, glsl_type::uint_type);
1664 emit(ADD(dst_reg(surf_index), op[0],
1665 src_reg(prog_data->base.binding_table.ubo_start)));
1666
1667 /* Assume this may touch any UBO. It would be nice to provide
1668 * a tighter bound, but the array information is already lowered away.
1669 */
1670 brw_mark_surface_used(&prog_data->base,
1671 prog_data->base.binding_table.ubo_start +
1672 shader_prog->NumUniformBlocks - 1);
1673 }
1674
1675 if (const_offset_ir) {
1676 if (brw->gen >= 8) {
1677 /* Store the offset in a GRF so we can send-from-GRF. */
1678 offset = src_reg(this, glsl_type::int_type);
1679 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1680 } else {
1681 /* Immediates are fine on older generations since they'll be moved
1682 * to a (potentially fake) MRF at the generator level.
1683 */
1684 offset = src_reg(const_offset / 16);
1685 }
1686 } else {
1687 offset = src_reg(this, glsl_type::uint_type);
1688 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1689 }
1690
1691 if (brw->gen >= 7) {
1692 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1693 grf_offset.type = offset.type;
1694
1695 emit(MOV(grf_offset, offset));
1696
1697 emit(new(mem_ctx) vec4_instruction(this,
1698 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1699 dst_reg(packed_consts),
1700 surf_index,
1701 src_reg(grf_offset)));
1702 } else {
1703 vec4_instruction *pull =
1704 emit(new(mem_ctx) vec4_instruction(this,
1705 VS_OPCODE_PULL_CONSTANT_LOAD,
1706 dst_reg(packed_consts),
1707 surf_index,
1708 offset));
1709 pull->base_mrf = 14;
1710 pull->mlen = 1;
1711 }
1712
1713 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1714 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1715 const_offset % 16 / 4,
1716 const_offset % 16 / 4,
1717 const_offset % 16 / 4);
1718
1719 /* UBO bools are any nonzero int. We need to convert them to use the
1720 * value of true stored in ctx->Const.UniformBooleanTrue.
1721 */
1722 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1723 emit(CMP(result_dst, packed_consts, src_reg(0u),
1724 BRW_CONDITIONAL_NZ));
1725 if (ctx->Const.UniformBooleanTrue == 1) {
1726 emit(AND(result_dst, result, src_reg(1)));
1727 }
1728 } else {
1729 emit(MOV(result_dst, packed_consts));
1730 }
1731 break;
1732 }
1733
1734 case ir_binop_vector_extract:
1735 unreachable("should have been lowered by vec_index_to_cond_assign");
1736
1737 case ir_triop_fma:
1738 op[0] = fix_3src_operand(op[0]);
1739 op[1] = fix_3src_operand(op[1]);
1740 op[2] = fix_3src_operand(op[2]);
1741 /* Note that the instruction's argument order is reversed from GLSL
1742 * and the IR.
1743 */
1744 emit(MAD(result_dst, op[2], op[1], op[0]));
1745 break;
1746
1747 case ir_triop_lrp:
1748 emit_lrp(result_dst, op[0], op[1], op[2]);
1749 break;
1750
1751 case ir_triop_csel:
1752 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1753 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1754 inst->predicate = BRW_PREDICATE_NORMAL;
1755 break;
1756
1757 case ir_triop_bfi:
1758 op[0] = fix_3src_operand(op[0]);
1759 op[1] = fix_3src_operand(op[1]);
1760 op[2] = fix_3src_operand(op[2]);
1761 emit(BFI2(result_dst, op[0], op[1], op[2]));
1762 break;
1763
1764 case ir_triop_bitfield_extract:
1765 op[0] = fix_3src_operand(op[0]);
1766 op[1] = fix_3src_operand(op[1]);
1767 op[2] = fix_3src_operand(op[2]);
1768 /* Note that the instruction's argument order is reversed from GLSL
1769 * and the IR.
1770 */
1771 emit(BFE(result_dst, op[2], op[1], op[0]));
1772 break;
1773
1774 case ir_triop_vector_insert:
1775 unreachable("should have been lowered by lower_vector_insert");
1776
1777 case ir_quadop_bitfield_insert:
1778 unreachable("not reached: should be handled by "
1779 "bitfield_insert_to_bfm_bfi\n");
1780
1781 case ir_quadop_vector:
1782 unreachable("not reached: should be handled by lower_quadop_vector");
1783
1784 case ir_unop_pack_half_2x16:
1785 emit_pack_half_2x16(result_dst, op[0]);
1786 break;
1787 case ir_unop_unpack_half_2x16:
1788 emit_unpack_half_2x16(result_dst, op[0]);
1789 break;
1790 case ir_unop_pack_snorm_2x16:
1791 case ir_unop_pack_snorm_4x8:
1792 case ir_unop_pack_unorm_2x16:
1793 case ir_unop_pack_unorm_4x8:
1794 case ir_unop_unpack_snorm_2x16:
1795 case ir_unop_unpack_snorm_4x8:
1796 case ir_unop_unpack_unorm_2x16:
1797 case ir_unop_unpack_unorm_4x8:
1798 unreachable("not reached: should be handled by lower_packing_builtins");
1799 case ir_unop_unpack_half_2x16_split_x:
1800 case ir_unop_unpack_half_2x16_split_y:
1801 case ir_binop_pack_half_2x16_split:
1802 case ir_unop_interpolate_at_centroid:
1803 case ir_binop_interpolate_at_sample:
1804 case ir_binop_interpolate_at_offset:
1805 unreachable("not reached: should not occur in vertex shader");
1806 case ir_binop_ldexp:
1807 unreachable("not reached: should be handled by ldexp_to_arith()");
1808 }
1809 }
1810
1811
1812 void
1813 vec4_visitor::visit(ir_swizzle *ir)
1814 {
1815 src_reg src;
1816 int i = 0;
1817 int swizzle[4];
1818
1819    /* Note that this handles only swizzles in expressions, not those on the left
1820 * hand side of an assignment, which do write masking. See ir_assignment
1821 * for that.
1822 */
1823
1824 ir->val->accept(this);
1825 src = this->result;
1826 assert(src.file != BAD_FILE);
1827
1828 for (i = 0; i < ir->type->vector_elements; i++) {
1829 switch (i) {
1830 case 0:
1831 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1832 break;
1833 case 1:
1834 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1835 break;
1836 case 2:
1837 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1838 break;
1839 case 3:
1840 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1841 break;
1842 }
1843 }
1844 for (; i < 4; i++) {
1845 /* Replicate the last channel out. */
1846 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1847 }
1848
1849 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1850
1851 this->result = src;
1852 }
1853
1854 void
1855 vec4_visitor::visit(ir_dereference_variable *ir)
1856 {
1857 const struct glsl_type *type = ir->type;
1858 dst_reg *reg = variable_storage(ir->var);
1859
1860 if (!reg) {
1861 fail("Failed to find variable storage for %s\n", ir->var->name);
1862 this->result = src_reg(brw_null_reg());
1863 return;
1864 }
1865
1866 this->result = src_reg(*reg);
1867
1868 /* System values get their swizzle from the dst_reg writemask */
1869 if (ir->var->data.mode == ir_var_system_value)
1870 return;
1871
1872 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1873 this->result.swizzle = swizzle_for_size(type->vector_elements);
1874 }
1875
1876
1877 int
1878 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1879 {
1880 /* Under normal circumstances array elements are stored consecutively, so
1881 * the stride is equal to the size of the array element.
1882 */
1883 return type_size(ir->type);
1884 }
1885
1886
1887 void
1888 vec4_visitor::visit(ir_dereference_array *ir)
1889 {
1890 ir_constant *constant_index;
1891 src_reg src;
1892 int array_stride = compute_array_stride(ir);
1893
1894 constant_index = ir->array_index->constant_expression_value();
1895
1896 ir->array->accept(this);
1897 src = this->result;
1898
1899 if (constant_index) {
1900 src.reg_offset += constant_index->value.i[0] * array_stride;
1901 } else {
1902 /* Variable index array dereference. It eats the "vec4" of the
1903 * base of the array and an index that offsets the Mesa register
1904 * index.
1905 */
1906 ir->array_index->accept(this);
1907
1908 src_reg index_reg;
1909
1910 if (array_stride == 1) {
1911 index_reg = this->result;
1912 } else {
1913 index_reg = src_reg(this, glsl_type::int_type);
1914
1915 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1916 }
1917
1918 if (src.reladdr) {
1919 src_reg temp = src_reg(this, glsl_type::int_type);
1920
1921 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1922
1923 index_reg = temp;
1924 }
1925
1926 src.reladdr = ralloc(mem_ctx, src_reg);
1927 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1928 }
1929
1930 /* If the type is smaller than a vec4, replicate the last channel out. */
1931 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1932 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1933 else
1934 src.swizzle = BRW_SWIZZLE_NOOP;
1935 src.type = brw_type_for_base_type(ir->type);
1936
1937 this->result = src;
1938 }
1939
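/**
 * Handle record (struct) dereferences by summing the sizes of the fields
 * that precede the named field and adding the total to reg_offset.
 */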
1940 void
1941 vec4_visitor::visit(ir_dereference_record *ir)
1942 {
1943 unsigned int i;
1944 const glsl_type *struct_type = ir->record->type;
1945 int offset = 0;
1946
1947 ir->record->accept(this);
1948
1949 for (i = 0; i < struct_type->length; i++) {
1950 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1951 break;
1952 offset += type_size(struct_type->fields.structure[i].type);
1953 }
1954
1955 /* If the type is smaller than a vec4, replicate the last channel out. */
1956 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1957 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1958 else
1959 this->result.swizzle = BRW_SWIZZLE_NOOP;
1960 this->result.type = brw_type_for_base_type(ir->type);
1961
1962 this->result.reg_offset += offset;
1963 }
1964
1965 /**
1966 * We want to be careful in assignment setup to hit the actual storage
1967 * instead of potentially using a temporary like we might with the
1968 * ir_dereference handler.
1969 */
1970 static dst_reg
1971 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1972 {
1973 /* The LHS must be a dereference. If the LHS is a variable indexed array
1974    * access of a vector, it must be separated into a series of conditional moves
1975 * before reaching this point (see ir_vec_index_to_cond_assign).
1976 */
1977 assert(ir->as_dereference());
1978 ir_dereference_array *deref_array = ir->as_dereference_array();
1979 if (deref_array) {
1980 assert(!deref_array->array->type->is_vector());
1981 }
1982
1983 /* Use the rvalue deref handler for the most part. We'll ignore
1984 * swizzles in it and write swizzles using writemask, though.
1985 */
1986 ir->accept(v);
1987 return dst_reg(v->result);
1988 }
1989
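/**
 * Copy a whole aggregate from *src to *dst, one scalar/vector at a time.
 *
 * Structs, arrays and matrices are decomposed recursively; reg_offset is
 * advanced on both registers as each piece is moved, and every MOV gets
 * the given predicate so conditional assignments work.
 */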
1990 void
1991 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1992 const struct glsl_type *type,
1993 enum brw_predicate predicate)
1994 {
1995 if (type->base_type == GLSL_TYPE_STRUCT) {
1996 for (unsigned int i = 0; i < type->length; i++) {
1997 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1998 }
1999 return;
2000 }
2001
2002 if (type->is_array()) {
2003 for (unsigned int i = 0; i < type->length; i++) {
2004 emit_block_move(dst, src, type->fields.array, predicate);
2005 }
2006 return;
2007 }
2008
2009 if (type->is_matrix()) {
2010 const struct glsl_type *vec_type;
2011
2012 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2013 type->vector_elements, 1);
2014
2015 for (int i = 0; i < type->matrix_columns; i++) {
2016 emit_block_move(dst, src, vec_type, predicate);
2017 }
2018 return;
2019 }
2020
2021 assert(type->is_scalar() || type->is_vector());
2022
2023 dst->type = brw_type_for_base_type(type);
2024 src->type = dst->type;
2025
2026 dst->writemask = (1 << type->vector_elements) - 1;
2027
2028 src->swizzle = swizzle_for_size(type->vector_elements);
2029
2030 vec4_instruction *inst = emit(MOV(*dst, *src));
2031 inst->predicate = predicate;
2032
2033 dst->reg_offset++;
2034 src->reg_offset++;
2035 }
2036
2037
2038 /* If the RHS processing resulted in an instruction generating a
2039 * temporary value, and it would be easy to rewrite the instruction to
2040 * generate its result right into the LHS instead, do so. This ends
2041 * up reliably removing instructions where it can be tricky to do so
2042 * later without real UD chain information.
2043 */
2044 bool
2045 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2046 dst_reg dst,
2047 src_reg src,
2048 vec4_instruction *pre_rhs_inst,
2049 vec4_instruction *last_rhs_inst)
2050 {
2051 /* This could be supported, but it would take more smarts. */
2052 if (ir->condition)
2053 return false;
2054
2055 if (pre_rhs_inst == last_rhs_inst)
2056 return false; /* No instructions generated to work with. */
2057
2058 /* Make sure the last instruction generated our source reg. */
2059 if (src.file != GRF ||
2060 src.file != last_rhs_inst->dst.file ||
2061 src.reg != last_rhs_inst->dst.reg ||
2062 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2063 src.reladdr ||
2064 src.abs ||
2065 src.negate ||
2066 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2067 return false;
2068
2069    /* Check that the last instruction fully initialized the channels
2070 * we want to use, in the order we want to use them. We could
2071 * potentially reswizzle the operands of many instructions so that
2072 * we could handle out of order channels, but don't yet.
2073 */
2074
2075 for (unsigned i = 0; i < 4; i++) {
2076 if (dst.writemask & (1 << i)) {
2077 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2078 return false;
2079
2080 if (BRW_GET_SWZ(src.swizzle, i) != i)
2081 return false;
2082 }
2083 }
2084
2085 /* Success! Rewrite the instruction. */
2086 last_rhs_inst->dst.file = dst.file;
2087 last_rhs_inst->dst.reg = dst.reg;
2088 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2089 last_rhs_inst->dst.reladdr = dst.reladdr;
2090 last_rhs_inst->dst.writemask &= dst.writemask;
2091
2092 return true;
2093 }
2094
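/**
 * Handle assignments.  Aggregate LHS types go through emit_block_move();
 * scalar/vector assignments swizzle the RHS into the channels named by the
 * write mask and, when possible, fold the move into the instruction that
 * produced the RHS via try_rewrite_rhs_to_dst().
 */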
2095 void
2096 vec4_visitor::visit(ir_assignment *ir)
2097 {
2098 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2099 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2100
2101 if (!ir->lhs->type->is_scalar() &&
2102 !ir->lhs->type->is_vector()) {
2103 ir->rhs->accept(this);
2104 src_reg src = this->result;
2105
2106 if (ir->condition) {
2107 emit_bool_to_cond_code(ir->condition, &predicate);
2108 }
2109
2110 /* emit_block_move doesn't account for swizzles in the source register.
2111 * This should be ok, since the source register is a structure or an
2112 * array, and those can't be swizzled. But double-check to be sure.
2113 */
2114 assert(src.swizzle ==
2115 (ir->rhs->type->is_matrix()
2116 ? swizzle_for_size(ir->rhs->type->vector_elements)
2117 : BRW_SWIZZLE_NOOP));
2118
2119 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2120 return;
2121 }
2122
2123 /* Now we're down to just a scalar/vector with writemasks. */
2124 int i;
2125
2126 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2127 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2128
2129 ir->rhs->accept(this);
2130
2131 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2132
2133 src_reg src = this->result;
2134
2135 int swizzles[4];
2136 int first_enabled_chan = 0;
2137 int src_chan = 0;
2138
2139 assert(ir->lhs->type->is_vector() ||
2140 ir->lhs->type->is_scalar());
2141 dst.writemask = ir->write_mask;
2142
2143 for (int i = 0; i < 4; i++) {
2144 if (dst.writemask & (1 << i)) {
2145 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2146 break;
2147 }
2148 }
2149
2150 /* Swizzle a small RHS vector into the channels being written.
2151 *
2152 * glsl ir treats write_mask as dictating how many channels are
2153 * present on the RHS while in our instructions we need to make
2154 * those channels appear in the slots of the vec4 they're written to.
2155 */
2156 for (int i = 0; i < 4; i++) {
2157 if (dst.writemask & (1 << i))
2158 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2159 else
2160 swizzles[i] = first_enabled_chan;
2161 }
2162 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2163 swizzles[2], swizzles[3]);
2164
2165 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2166 return;
2167 }
2168
2169 if (ir->condition) {
2170 emit_bool_to_cond_code(ir->condition, &predicate);
2171 }
2172
2173 for (i = 0; i < type_size(ir->lhs->type); i++) {
2174 vec4_instruction *inst = emit(MOV(dst, src));
2175 inst->predicate = predicate;
2176
2177 dst.reg_offset++;
2178 src.reg_offset++;
2179 }
2180 }
2181
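/**
 * Write an ir_constant into consecutive registers starting at *dst,
 * recursing through aggregate types and coalescing channels that hold the
 * same value into a single writemasked MOV.
 */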
2182 void
2183 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2184 {
2185 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2186 foreach_in_list(ir_constant, field_value, &ir->components) {
2187 emit_constant_values(dst, field_value);
2188 }
2189 return;
2190 }
2191
2192 if (ir->type->is_array()) {
2193 for (unsigned int i = 0; i < ir->type->length; i++) {
2194 emit_constant_values(dst, ir->array_elements[i]);
2195 }
2196 return;
2197 }
2198
2199 if (ir->type->is_matrix()) {
2200 for (int i = 0; i < ir->type->matrix_columns; i++) {
2201 float *vec = &ir->value.f[i * ir->type->vector_elements];
2202
2203 for (int j = 0; j < ir->type->vector_elements; j++) {
2204 dst->writemask = 1 << j;
2205 dst->type = BRW_REGISTER_TYPE_F;
2206
2207 emit(MOV(*dst, src_reg(vec[j])));
2208 }
2209 dst->reg_offset++;
2210 }
2211 return;
2212 }
2213
2214 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2215
2216 for (int i = 0; i < ir->type->vector_elements; i++) {
2217 if (!(remaining_writemask & (1 << i)))
2218 continue;
2219
2220 dst->writemask = 1 << i;
2221 dst->type = brw_type_for_base_type(ir->type);
2222
2223 /* Find other components that match the one we're about to
2224 * write. Emits fewer instructions for things like vec4(0.5,
2225 * 1.5, 1.5, 1.5).
2226 */
2227 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2228 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2229 if (ir->value.b[i] == ir->value.b[j])
2230 dst->writemask |= (1 << j);
2231 } else {
2232 /* u, i, and f storage all line up, so no need for a
2233 * switch case for comparing each type.
2234 */
2235 if (ir->value.u[i] == ir->value.u[j])
2236 dst->writemask |= (1 << j);
2237 }
2238 }
2239
2240 switch (ir->type->base_type) {
2241 case GLSL_TYPE_FLOAT:
2242 emit(MOV(*dst, src_reg(ir->value.f[i])));
2243 break;
2244 case GLSL_TYPE_INT:
2245 emit(MOV(*dst, src_reg(ir->value.i[i])));
2246 break;
2247 case GLSL_TYPE_UINT:
2248 emit(MOV(*dst, src_reg(ir->value.u[i])));
2249 break;
2250 case GLSL_TYPE_BOOL:
2251 emit(MOV(*dst,
2252 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2253 : 0)));
2254 break;
2255 default:
2256 unreachable("Non-float/uint/int/bool constant");
2257 }
2258
2259 remaining_writemask &= ~dst->writemask;
2260 }
2261 dst->reg_offset++;
2262 }
2263
2264 void
2265 vec4_visitor::visit(ir_constant *ir)
2266 {
2267 dst_reg dst = dst_reg(this, ir->type);
2268 this->result = src_reg(dst);
2269
2270 emit_constant_values(&dst, ir);
2271 }
2272
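/**
 * Lower the atomic counter intrinsics (read, increment, predecrement) to
 * untyped surface read / untyped atomic messages aimed at the counter
 * buffer's binding table entry.
 */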
2273 void
2274 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2275 {
2276 ir_dereference *deref = static_cast<ir_dereference *>(
2277 ir->actual_parameters.get_head());
2278 ir_variable *location = deref->variable_referenced();
2279 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2280 location->data.binding);
2281
2282 /* Calculate the surface offset */
2283 src_reg offset(this, glsl_type::uint_type);
2284 ir_dereference_array *deref_array = deref->as_dereference_array();
2285 if (deref_array) {
2286 deref_array->array_index->accept(this);
2287
2288 src_reg tmp(this, glsl_type::uint_type);
2289 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2290 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2291 } else {
2292 offset = location->data.atomic.offset;
2293 }
2294
2295 /* Emit the appropriate machine instruction */
2296 const char *callee = ir->callee->function_name();
2297 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2298
2299 if (!strcmp("__intrinsic_atomic_read", callee)) {
2300 emit_untyped_surface_read(surf_index, dst, offset);
2301
2302 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2303 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2304 src_reg(), src_reg());
2305
2306 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2307 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2308 src_reg(), src_reg());
2309 }
2310 }
2311
2312 void
2313 vec4_visitor::visit(ir_call *ir)
2314 {
2315 const char *callee = ir->callee->function_name();
2316
2317 if (!strcmp("__intrinsic_atomic_read", callee) ||
2318 !strcmp("__intrinsic_atomic_increment", callee) ||
2319 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2320 visit_atomic_counter_intrinsic(ir);
2321 } else {
2322 unreachable("Unsupported intrinsic.");
2323 }
2324 }
2325
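/**
 * Fetch the MCS (compressed multisample layout) data for the given
 * coordinate with a TXF_MCS message; the result is fed into the payload
 * of the following TXF_CMS lookup.
 */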
2326 src_reg
2327 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2328 {
2329 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2330 inst->base_mrf = 2;
2331 inst->mlen = 1;
2332 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2333 inst->dst.writemask = WRITEMASK_XYZW;
2334
2335 inst->src[1] = sampler;
2336
2337    /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2338 int param_base = inst->base_mrf;
2339 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2340 int zero_mask = 0xf & ~coord_mask;
2341
2342 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2343 coordinate));
2344
2345 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2346 src_reg(0)));
2347
2348 emit(inst);
2349 return src_reg(inst->dst);
2350 }
2351
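/**
 * Returns true if the sampler index may not fit in the 4-bit field of the
 * sampler message descriptor (it is dynamically indexed or >= 16).  Only
 * relevant on Haswell and gen8+; the caller then sets up a message header.
 */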
2352 static bool
2353 is_high_sampler(struct brw_context *brw, src_reg sampler)
2354 {
2355 if (brw->gen < 8 && !brw->is_haswell)
2356 return false;
2357
2358 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2359 }
2360
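/**
 * Translate an ir_texture operation: evaluate its operands, load the MRF
 * payload for the matching sampler message, emit the SEND, and then apply
 * any required result swizzling and workarounds.
 */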
2361 void
2362 vec4_visitor::visit(ir_texture *ir)
2363 {
2364 uint32_t sampler =
2365 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2366
2367 ir_rvalue *nonconst_sampler_index =
2368 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2369
2370 /* Handle non-constant sampler array indexing */
2371 src_reg sampler_reg;
2372 if (nonconst_sampler_index) {
2373 /* The highest sampler which may be used by this operation is
2374 * the last element of the array. Mark it here, because the generator
2375 * doesn't have enough information to determine the bound.
2376 */
2377 uint32_t array_size = ir->sampler->as_dereference_array()
2378 ->array->type->array_size();
2379
2380 uint32_t max_used = sampler + array_size - 1;
2381 if (ir->op == ir_tg4 && brw->gen < 8) {
2382 max_used += prog_data->base.binding_table.gather_texture_start;
2383 } else {
2384 max_used += prog_data->base.binding_table.texture_start;
2385 }
2386
2387 brw_mark_surface_used(&prog_data->base, max_used);
2388
2389 /* Emit code to evaluate the actual indexing expression */
2390 nonconst_sampler_index->accept(this);
2391 dst_reg temp(this, glsl_type::uint_type);
2392 emit(ADD(temp, this->result, src_reg(sampler)))
2393 ->force_writemask_all = true;
2394 sampler_reg = src_reg(temp);
2395 } else {
2396 /* Single sampler, or constant array index; the indexing expression
2397 * is just an immediate.
2398 */
2399 sampler_reg = src_reg(sampler);
2400 }
2401
2402 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2403 * emitting anything other than setting up the constant result.
2404 */
2405 if (ir->op == ir_tg4) {
2406 ir_constant *chan = ir->lod_info.component->as_constant();
2407 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2408 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2409 dst_reg result(this, ir->type);
2410 this->result = src_reg(result);
2411 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2412 return;
2413 }
2414 }
2415
2416 /* Should be lowered by do_lower_texture_projection */
2417 assert(!ir->projector);
2418
2419 /* Should be lowered */
2420 assert(!ir->offset || !ir->offset->type->is_array());
2421
2422 /* Generate code to compute all the subexpression trees. This has to be
2423 * done before loading any values into MRFs for the sampler message since
2424 * generating these values may involve SEND messages that need the MRFs.
2425 */
2426 src_reg coordinate;
2427 if (ir->coordinate) {
2428 ir->coordinate->accept(this);
2429 coordinate = this->result;
2430 }
2431
2432 src_reg shadow_comparitor;
2433 if (ir->shadow_comparitor) {
2434 ir->shadow_comparitor->accept(this);
2435 shadow_comparitor = this->result;
2436 }
2437
2438 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2439 src_reg offset_value;
2440 if (has_nonconstant_offset) {
2441 ir->offset->accept(this);
2442 offset_value = src_reg(this->result);
2443 }
2444
2445 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2446 src_reg lod, dPdx, dPdy, sample_index, mcs;
2447 switch (ir->op) {
2448 case ir_tex:
2449 lod = src_reg(0.0f);
2450 lod_type = glsl_type::float_type;
2451 break;
2452 case ir_txf:
2453 case ir_txl:
2454 case ir_txs:
2455 ir->lod_info.lod->accept(this);
2456 lod = this->result;
2457 lod_type = ir->lod_info.lod->type;
2458 break;
2459 case ir_query_levels:
2460 lod = src_reg(0);
2461 lod_type = glsl_type::int_type;
2462 break;
2463 case ir_txf_ms:
2464 ir->lod_info.sample_index->accept(this);
2465 sample_index = this->result;
2466 sample_index_type = ir->lod_info.sample_index->type;
2467
2468 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2469 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2470 else
2471 mcs = src_reg(0u);
2472 break;
2473 case ir_txd:
2474 ir->lod_info.grad.dPdx->accept(this);
2475 dPdx = this->result;
2476
2477 ir->lod_info.grad.dPdy->accept(this);
2478 dPdy = this->result;
2479
2480 lod_type = ir->lod_info.grad.dPdx->type;
2481 break;
2482 case ir_txb:
2483 case ir_lod:
2484 case ir_tg4:
2485 break;
2486 }
2487
2488 enum opcode opcode;
2489 switch (ir->op) {
2490 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2491 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2492 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2493 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2494 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2495 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2496 case ir_tg4: opcode = has_nonconstant_offset
2497 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2498 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2499 case ir_txb:
2500 unreachable("TXB is not valid for vertex shaders.");
2501 case ir_lod:
2502 unreachable("LOD is not valid for vertex shaders.");
2503 default:
2504 unreachable("Unrecognized tex op");
2505 }
2506
2507 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2508
2509 if (ir->offset != NULL && ir->op != ir_txf)
2510 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2511
2512 /* Stuff the channel select bits in the top of the texture offset */
2513 if (ir->op == ir_tg4)
2514 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2515
2516 /* The message header is necessary for:
2517 * - Gen4 (always)
2518 * - Texel offsets
2519 * - Gather channel selection
2520 * - Sampler indices too large to fit in a 4-bit value.
2521 */
2522 inst->header_present =
2523 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2524 is_high_sampler(brw, sampler_reg);
2525 inst->base_mrf = 2;
2526 inst->mlen = inst->header_present + 1; /* always at least one */
2527 inst->dst = dst_reg(this, ir->type);
2528 inst->dst.writemask = WRITEMASK_XYZW;
2529 inst->shadow_compare = ir->shadow_comparitor != NULL;
2530
2531 inst->src[1] = sampler_reg;
2532
2533 /* MRF for the first parameter */
2534 int param_base = inst->base_mrf + inst->header_present;
2535
2536 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2537 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2538 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2539 } else {
2540 /* Load the coordinate */
2541 /* FINISHME: gl_clamp_mask and saturate */
2542 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2543 int zero_mask = 0xf & ~coord_mask;
2544
2545 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2546 coordinate));
2547
2548 if (zero_mask != 0) {
2549 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2550 src_reg(0)));
2551 }
2552 /* Load the shadow comparitor */
2553 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2554 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2555 WRITEMASK_X),
2556 shadow_comparitor));
2557 inst->mlen++;
2558 }
2559
2560 /* Load the LOD info */
2561 if (ir->op == ir_tex || ir->op == ir_txl) {
2562 int mrf, writemask;
2563 if (brw->gen >= 5) {
2564 mrf = param_base + 1;
2565 if (ir->shadow_comparitor) {
2566 writemask = WRITEMASK_Y;
2567 /* mlen already incremented */
2568 } else {
2569 writemask = WRITEMASK_X;
2570 inst->mlen++;
2571 }
2572 } else /* brw->gen == 4 */ {
2573 mrf = param_base;
2574 writemask = WRITEMASK_W;
2575 }
2576 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2577 } else if (ir->op == ir_txf) {
2578 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2579 } else if (ir->op == ir_txf_ms) {
2580 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2581 sample_index));
2582 if (brw->gen >= 7)
2583 /* MCS data is in the first channel of `mcs`, but we need to get it into
2584 * the .y channel of the second vec4 of params, so replicate .x across
2585 * the whole vec4 and then mask off everything except .y
2586 */
2587 mcs.swizzle = BRW_SWIZZLE_XXXX;
2588 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2589 mcs));
2590 inst->mlen++;
2591 } else if (ir->op == ir_txd) {
2592 const glsl_type *type = lod_type;
2593
2594 if (brw->gen >= 5) {
2595 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2596 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2597 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2598 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2599 inst->mlen++;
2600
2601 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2602 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2603 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2604 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2605 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2606 inst->mlen++;
2607
2608 if (ir->shadow_comparitor) {
2609 emit(MOV(dst_reg(MRF, param_base + 2,
2610 ir->shadow_comparitor->type, WRITEMASK_Z),
2611 shadow_comparitor));
2612 }
2613 }
2614 } else /* brw->gen == 4 */ {
2615 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2616 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2617 inst->mlen += 2;
2618 }
2619 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2620 if (ir->shadow_comparitor) {
2621 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2622 shadow_comparitor));
2623 }
2624
2625 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2626 offset_value));
2627 inst->mlen++;
2628 }
2629 }
2630
2631 emit(inst);
2632
2633 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2634 * spec requires layers.
2635 */
2636 if (ir->op == ir_txs) {
2637 glsl_type const *type = ir->sampler->type;
2638 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2639 type->sampler_array) {
2640 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2641 writemask(inst->dst, WRITEMASK_Z),
2642 src_reg(inst->dst), src_reg(6));
2643 }
2644 }
2645
2646 if (brw->gen == 6 && ir->op == ir_tg4) {
2647 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2648 }
2649
2650 swizzle_result(ir, src_reg(inst->dst), sampler);
2651 }
2652
2653 /**
2654 * Apply workarounds for Gen6 gather with UINT/SINT
2655 */
2656 void
2657 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2658 {
2659 if (!wa)
2660 return;
2661
2662 int width = (wa & WA_8BIT) ? 8 : 16;
2663 dst_reg dst_f = dst;
2664 dst_f.type = BRW_REGISTER_TYPE_F;
2665
2666 /* Convert from UNORM to UINT */
2667 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2668 emit(MOV(dst, src_reg(dst_f)));
2669
2670 if (wa & WA_SIGN) {
2671 /* Reinterpret the UINT value as a signed INT value by
2672 * shifting the sign bit into place, then shifting back
2673 * preserving sign.
2674 */
2675 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2676 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2677 }
2678 }
2679
2680 /**
2681 * Set up the gather channel based on the swizzle, for gather4.
2682 */
2683 uint32_t
2684 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2685 {
2686 ir_constant *chan = ir->lod_info.component->as_constant();
2687 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2688 switch (swiz) {
2689 case SWIZZLE_X: return 0;
2690 case SWIZZLE_Y:
2691 /* gather4 sampler is broken for green channel on RG32F --
2692 * we must ask for blue instead.
2693 */
2694 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2695 return 2;
2696 return 1;
2697 case SWIZZLE_Z: return 2;
2698 case SWIZZLE_W: return 3;
2699 default:
2700 unreachable("Not reached"); /* zero, one swizzles handled already */
2701 }
2702 }
2703
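/**
 * Apply the GL texture swizzle state (including ZERO/ONE channels) to the
 * raw sampler result and store it in this->result, with special handling
 * for query_levels and for cases that need no swizzling at all.
 */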
2704 void
2705 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2706 {
2707 int s = key->tex.swizzles[sampler];
2708
2709 this->result = src_reg(this, ir->type);
2710 dst_reg swizzled_result(this->result);
2711
2712 if (ir->op == ir_query_levels) {
2713 /* # levels is in .w */
2714 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2715 emit(MOV(swizzled_result, orig_val));
2716 return;
2717 }
2718
2719 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2720 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2721 emit(MOV(swizzled_result, orig_val));
2722 return;
2723 }
2724
2725
2726 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2727 int swizzle[4] = {0};
2728
2729 for (int i = 0; i < 4; i++) {
2730 switch (GET_SWZ(s, i)) {
2731 case SWIZZLE_ZERO:
2732 zero_mask |= (1 << i);
2733 break;
2734 case SWIZZLE_ONE:
2735 one_mask |= (1 << i);
2736 break;
2737 default:
2738 copy_mask |= (1 << i);
2739 swizzle[i] = GET_SWZ(s, i);
2740 break;
2741 }
2742 }
2743
2744 if (copy_mask) {
2745 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2746 swizzled_result.writemask = copy_mask;
2747 emit(MOV(swizzled_result, orig_val));
2748 }
2749
2750 if (zero_mask) {
2751 swizzled_result.writemask = zero_mask;
2752 emit(MOV(swizzled_result, src_reg(0.0f)));
2753 }
2754
2755 if (one_mask) {
2756 swizzled_result.writemask = one_mask;
2757 emit(MOV(swizzled_result, src_reg(1.0f)));
2758 }
2759 }
2760
2761 void
2762 vec4_visitor::visit(ir_return *)
2763 {
2764 unreachable("not reached");
2765 }
2766
2767 void
2768 vec4_visitor::visit(ir_discard *)
2769 {
2770 unreachable("not reached");
2771 }
2772
2773 void
2774 vec4_visitor::visit(ir_if *ir)
2775 {
2776 /* Don't point the annotation at the if statement, because then it plus
2777 * the then and else blocks get printed.
2778 */
2779 this->base_ir = ir->condition;
2780
2781 if (brw->gen == 6) {
2782 emit_if_gen6(ir);
2783 } else {
2784 enum brw_predicate predicate;
2785 emit_bool_to_cond_code(ir->condition, &predicate);
2786 emit(IF(predicate));
2787 }
2788
2789 visit_instructions(&ir->then_instructions);
2790
2791 if (!ir->else_instructions.is_empty()) {
2792 this->base_ir = ir->condition;
2793 emit(BRW_OPCODE_ELSE);
2794
2795 visit_instructions(&ir->else_instructions);
2796 }
2797
2798 this->base_ir = ir->condition;
2799 emit(BRW_OPCODE_ENDIF);
2800 }
2801
2802 void
2803 vec4_visitor::visit(ir_emit_vertex *)
2804 {
2805 unreachable("not reached");
2806 }
2807
2808 void
2809 vec4_visitor::visit(ir_end_primitive *)
2810 {
2811 unreachable("not reached");
2812 }
2813
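/**
 * Build the MRF payload (the offset plus up to two operands, one channel
 * each) and emit an untyped atomic message to the given surface.
 */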
2814 void
2815 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2816 dst_reg dst, src_reg offset,
2817 src_reg src0, src_reg src1)
2818 {
2819 unsigned mlen = 0;
2820
2821 /* Set the atomic operation offset. */
2822 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2823 mlen++;
2824
2825 /* Set the atomic operation arguments. */
2826 if (src0.file != BAD_FILE) {
2827 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2828 mlen++;
2829 }
2830
2831 if (src1.file != BAD_FILE) {
2832 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2833 mlen++;
2834 }
2835
2836 /* Emit the instruction. Note that this maps to the normal SIMD8
2837 * untyped atomic message on Ivy Bridge, but that's OK because
2838 * unused channels will be masked out.
2839 */
2840 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2841 src_reg(atomic_op), src_reg(surf_index));
2842 inst->base_mrf = 0;
2843 inst->mlen = mlen;
2844 }
2845
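/**
 * Emit an untyped surface read from the given surface at the given offset.
 */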
2846 void
2847 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2848 src_reg offset)
2849 {
2850 /* Set the surface read offset. */
2851 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2852
2853 /* Emit the instruction. Note that this maps to the normal SIMD8
2854 * untyped surface read message, but that's OK because unused
2855 * channels will be masked out.
2856 */
2857 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2858 dst, src_reg(surf_index));
2859 inst->base_mrf = 0;
2860 inst->mlen = 1;
2861 }
2862
2863 void
2864 vec4_visitor::emit_ndc_computation()
2865 {
2866 /* Get the position */
2867 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2868
2869 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2870 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2871 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2872
2873 current_annotation = "NDC";
2874 dst_reg ndc_w = ndc;
2875 ndc_w.writemask = WRITEMASK_W;
2876 src_reg pos_w = pos;
2877 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2878 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2879
2880 dst_reg ndc_xyz = ndc;
2881 ndc_xyz.writemask = WRITEMASK_XYZ;
2882
2883 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2884 }
2885
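/**
 * Write the PSIZ/flags VUE header dword: pre-gen6 this packs point size,
 * user clip flags and the negative-rhw workaround bit into one register;
 * on gen6+ it writes point size, layer and viewport index into their own
 * channels.
 */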
2886 void
2887 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2888 {
2889 if (brw->gen < 6 &&
2890 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2891 key->userclip_active || brw->has_negative_rhw_bug)) {
2892 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2893 dst_reg header1_w = header1;
2894 header1_w.writemask = WRITEMASK_W;
2895
2896 emit(MOV(header1, 0u));
2897
2898 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2899 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2900
2901 current_annotation = "Point size";
2902 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2903 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2904 }
2905
2906 if (key->userclip_active) {
2907 current_annotation = "Clipping flags";
2908 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2909 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2910
2911 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2912 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2913 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2914
2915 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2916 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2917 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2918 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2919 }
2920
2921 /* i965 clipping workaround:
2922 * 1) Test for -ve rhw
2923 * 2) If set,
2924 * set ndc = (0,0,0,0)
2925 * set ucp[6] = 1
2926 *
2927 * Later, clipping will detect ucp[6] and ensure the primitive is
2928 * clipped against all fixed planes.
2929 */
2930 if (brw->has_negative_rhw_bug) {
2931 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2932 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2933 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2934 vec4_instruction *inst;
2935 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2936 inst->predicate = BRW_PREDICATE_NORMAL;
2937 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2938 inst->predicate = BRW_PREDICATE_NORMAL;
2939 }
2940
2941 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2942 } else if (brw->gen < 6) {
2943 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2944 } else {
2945 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2946 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2947 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2948 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2949 }
2950 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2951 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2952 src_reg(output_reg[VARYING_SLOT_LAYER])));
2953 }
2954 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2955 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2956 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2957 }
2958 }
2959 }
2960
2961 void
2962 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2963 {
2964 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2965 *
2966 * "If a linked set of shaders forming the vertex stage contains no
2967 * static write to gl_ClipVertex or gl_ClipDistance, but the
2968 * application has requested clipping against user clip planes through
2969 * the API, then the coordinate written to gl_Position is used for
2970 * comparison against the user clip planes."
2971 *
2972 * This function is only called if the shader didn't write to
2973 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2974 * if the user wrote to it; otherwise we use gl_Position.
2975 */
2976 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2977 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2978 clip_vertex = VARYING_SLOT_POS;
2979 }
2980
2981 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2982 ++i) {
2983 reg.writemask = 1 << i;
2984 emit(DP4(reg,
2985 src_reg(output_reg[clip_vertex]),
2986 src_reg(this->userplane[i + offset])));
2987 }
2988 }
2989
2990 void
2991 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2992 {
2993 assert (varying < VARYING_SLOT_MAX);
2994 reg.type = output_reg[varying].type;
2995 current_annotation = output_reg_annotation[varying];
2996 /* Copy the register, saturating if necessary */
2997 vec4_instruction *inst = emit(MOV(reg,
2998 src_reg(output_reg[varying])));
2999 if ((varying == VARYING_SLOT_COL0 ||
3000 varying == VARYING_SLOT_COL1 ||
3001 varying == VARYING_SLOT_BFC0 ||
3002 varying == VARYING_SLOT_BFC1) &&
3003 key->clamp_vertex_color) {
3004 inst->saturate = true;
3005 }
3006 }
3007
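/**
 * Write one VUE slot's worth of data into the given MRF, handling the
 * special slots (PSIZ/flags, NDC, position, edge flag, padding) and
 * falling back to emit_generic_urb_slot() for ordinary varyings.
 */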
3008 void
3009 vec4_visitor::emit_urb_slot(int mrf, int varying)
3010 {
3011 struct brw_reg hw_reg = brw_message_reg(mrf);
3012 dst_reg reg = dst_reg(MRF, mrf);
3013 reg.type = BRW_REGISTER_TYPE_F;
3014
3015 switch (varying) {
3016 case VARYING_SLOT_PSIZ:
3017 /* PSIZ is always in slot 0, and is coupled with other flags. */
3018 current_annotation = "indices, point width, clip flags";
3019 emit_psiz_and_flags(hw_reg);
3020 break;
3021 case BRW_VARYING_SLOT_NDC:
3022 current_annotation = "NDC";
3023 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3024 break;
3025 case VARYING_SLOT_POS:
3026 current_annotation = "gl_Position";
3027 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3028 break;
3029 case VARYING_SLOT_EDGE:
3030 /* This is present when doing unfilled polygons. We're supposed to copy
3031 * the edge flag from the user-provided vertex array
3032 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3033 * of that attribute (starts as 1.0f). This is then used in clipping to
3034 * determine which edges should be drawn as wireframe.
3035 */
3036 current_annotation = "edge flag";
3037 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3038 glsl_type::float_type, WRITEMASK_XYZW))));
3039 break;
3040 case BRW_VARYING_SLOT_PAD:
3041 /* No need to write to this slot */
3042 break;
3043 default:
3044 emit_generic_urb_slot(reg, varying);
3045 break;
3046 }
3047 }
3048
3049 static int
3050 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3051 {
3052 if (brw->gen >= 6) {
3053 /* URB data written (does not include the message header reg) must
3054 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3055 * section 5.4.3.2.2: URB_INTERLEAVED.
3056 *
3057 * URB entries are allocated on a multiple of 1024 bits, so an
3058 * extra 128 bits written here to make the end align to 256 is
3059 * no problem.
3060 */
3061 if ((mlen % 2) != 1)
3062 mlen++;
3063 }
3064
3065 return mlen;
3066 }
3067
3068
3069 /**
3070 * Generates the VUE payload plus the necessary URB write instructions to
3071 * output it.
3072 *
3073 * The VUE layout is documented in Volume 2a.
3074 */
3075 void
3076 vec4_visitor::emit_vertex()
3077 {
3078 /* MRF 0 is reserved for the debugger, so start with message header
3079 * in MRF 1.
3080 */
3081 int base_mrf = 1;
3082 int mrf = base_mrf;
3083 /* In the process of generating our URB write message contents, we
3084 * may need to unspill a register or load from an array. Those
3085 * reads would use MRFs 14-15.
3086 */
3087 int max_usable_mrf = 13;
3088
3089 /* The following assertion verifies that max_usable_mrf causes an
3090 * even-numbered amount of URB write data, which will meet gen6's
3091 * requirements for length alignment.
3092 */
3093 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3094
3095 /* First mrf is the g0-based message header containing URB handles and
3096 * such.
3097 */
3098 emit_urb_write_header(mrf++);
3099
3100 if (brw->gen < 6) {
3101 emit_ndc_computation();
3102 }
3103
3104 /* Lower legacy ff and ClipVertex clipping to clip distances */
3105 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3106 current_annotation = "user clip distances";
3107
3108 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3109 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3110
3111 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3112 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3113 }
3114
3115 /* We may need to split this up into several URB writes, so do them in a
3116 * loop.
3117 */
3118 int slot = 0;
3119 bool complete = false;
3120 do {
3121 /* URB offset is in URB row increments, and each of our MRFs is half of
3122 * one of those, since we're doing interleaved writes.
3123 */
3124 int offset = slot / 2;
3125
3126 mrf = base_mrf + 1;
3127 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3128 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3129
3130 /* If this was max_usable_mrf, we can't fit anything more into this
3131 * URB WRITE.
3132 */
3133 if (mrf > max_usable_mrf) {
3134 slot++;
3135 break;
3136 }
3137 }
3138
3139 complete = slot >= prog_data->vue_map.num_slots;
3140 current_annotation = "URB write";
3141 vec4_instruction *inst = emit_urb_write_opcode(complete);
3142 inst->base_mrf = base_mrf;
3143 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3144 inst->offset += offset;
3145 } while(!complete);
3146 }
3147
3148
3149 src_reg
3150 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3151 src_reg *reladdr, int reg_offset)
3152 {
3153 /* Because we store the values to scratch interleaved like our
3154 * vertex data, we need to scale the vec4 index by 2.
3155 */
3156 int message_header_scale = 2;
3157
3158 /* Pre-gen6, the message header uses byte offsets instead of vec4
3159 * (16-byte) offset units.
3160 */
3161 if (brw->gen < 6)
3162 message_header_scale *= 16;
3163
3164 if (reladdr) {
3165 src_reg index = src_reg(this, glsl_type::int_type);
3166
3167 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3168 emit_before(inst, MUL(dst_reg(index),
3169 index, src_reg(message_header_scale)));
3170
3171 return index;
3172 } else {
3173 return src_reg(reg_offset * message_header_scale);
3174 }
3175 }
3176
3177 src_reg
3178 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3179 src_reg *reladdr, int reg_offset)
3180 {
3181 if (reladdr) {
3182 src_reg index = src_reg(this, glsl_type::int_type);
3183
3184 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3185
3186 /* Pre-gen6, the message header uses byte offsets instead of vec4
3187 * (16-byte) offset units.
3188 */
3189 if (brw->gen < 6) {
3190 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3191 }
3192
3193 return index;
3194 } else if (brw->gen >= 8) {
3195 /* Store the offset in a GRF so we can send-from-GRF. */
3196 src_reg offset = src_reg(this, glsl_type::int_type);
3197 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3198 return offset;
3199 } else {
3200 int message_header_scale = brw->gen < 6 ? 16 : 1;
3201 return src_reg(reg_offset * message_header_scale);
3202 }
3203 }
3204
3205 /**
3206 * Emits an instruction before @inst to load the value named by @orig_src
3207 * from scratch space at @base_offset to @temp.
3208 *
3209 * @base_offset is measured in 32-byte units (the size of a register).
3210 */
3211 void
3212 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3213 dst_reg temp, src_reg orig_src,
3214 int base_offset)
3215 {
3216 int reg_offset = base_offset + orig_src.reg_offset;
3217 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3218
3219 emit_before(inst, SCRATCH_READ(temp, index));
3220 }
3221
3222 /**
3223 * Emits an instruction after @inst to store the value to be written
3224 * to @orig_dst to scratch space at @base_offset, from @temp.
3225 *
3226 * @base_offset is measured in 32-byte units (the size of a register).
3227 */
3228 void
3229 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3230 {
3231 int reg_offset = base_offset + inst->dst.reg_offset;
3232 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3233
3234 /* Create a temporary register to store *inst's result in.
3235 *
3236 * We have to be careful in MOVing from our temporary result register in
3237 * the scratch write. If we swizzle from channels of the temporary that
3238 * weren't initialized, it will confuse live interval analysis, which will
3239 * make spilling fail to make progress.
3240 */
3241 src_reg temp = src_reg(this, glsl_type::vec4_type);
3242 temp.type = inst->dst.type;
3243 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3244 int swizzles[4];
3245 for (int i = 0; i < 4; i++)
3246 if (inst->dst.writemask & (1 << i))
3247 swizzles[i] = i;
3248 else
3249 swizzles[i] = first_writemask_chan;
3250 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3251 swizzles[2], swizzles[3]);
3252
3253 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3254 inst->dst.writemask));
3255 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3256 write->predicate = inst->predicate;
3257 write->ir = inst->ir;
3258 write->annotation = inst->annotation;
3259 inst->insert_after(write);
3260
3261 inst->dst.file = temp.file;
3262 inst->dst.reg = temp.reg;
3263 inst->dst.reg_offset = temp.reg_offset;
3264 inst->dst.reladdr = NULL;
3265 }
3266
3267 /**
3268 * We can't generally support array access in GRF space, because a
3269 * single instruction's destination can only span 2 contiguous
3270 * registers. So, we send all GRF arrays that get variable index
3271 * access to scratch space.
3272 */
3273 void
3274 vec4_visitor::move_grf_array_access_to_scratch()
3275 {
3276 int scratch_loc[this->virtual_grf_count];
3277
3278 for (int i = 0; i < this->virtual_grf_count; i++) {
3279 scratch_loc[i] = -1;
3280 }
3281
3282 /* First, calculate the set of virtual GRFs that need to be punted
3283 * to scratch due to having any array access on them, and where in
3284 * scratch.
3285 */
3286 foreach_in_list(vec4_instruction, inst, &instructions) {
3287 if (inst->dst.file == GRF && inst->dst.reladdr &&
3288 scratch_loc[inst->dst.reg] == -1) {
3289 scratch_loc[inst->dst.reg] = c->last_scratch;
3290 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3291 }
3292
3293 for (int i = 0 ; i < 3; i++) {
3294 src_reg *src = &inst->src[i];
3295
3296 if (src->file == GRF && src->reladdr &&
3297 scratch_loc[src->reg] == -1) {
3298 scratch_loc[src->reg] = c->last_scratch;
3299 c->last_scratch += this->virtual_grf_sizes[src->reg];
3300 }
3301 }
3302 }
3303
3304 /* Now, for anything that will be accessed through scratch, rewrite
3305 * it to load/store. Note that this is a _safe list walk, because
3306 * we may generate a new scratch_write instruction after the one
3307 * we're processing.
3308 */
3309 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3310 /* Set up the annotation tracking for new generated instructions. */
3311 base_ir = inst->ir;
3312 current_annotation = inst->annotation;
3313
3314 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3315 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3316 }
3317
3318 for (int i = 0 ; i < 3; i++) {
3319 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3320 continue;
3321
3322 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3323
3324 emit_scratch_read(inst, temp, inst->src[i],
3325 scratch_loc[inst->src[i].reg]);
3326
3327 inst->src[i].file = temp.file;
3328 inst->src[i].reg = temp.reg;
3329 inst->src[i].reg_offset = temp.reg_offset;
3330 inst->src[i].reladdr = NULL;
3331 }
3332 }
3333 }
3334
3335 /**
3336 * Emits an instruction before @inst to load the value named by @orig_src
3337 * from the pull constant buffer (surface) at @base_offset to @temp.
3338 */
3339 void
3340 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3341 dst_reg temp, src_reg orig_src,
3342 int base_offset)
3343 {
3344 int reg_offset = base_offset + orig_src.reg_offset;
3345 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3346 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3347 vec4_instruction *load;
3348
3349 if (brw->gen >= 7) {
3350 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3351 grf_offset.type = offset.type;
3352 emit_before(inst, MOV(grf_offset, offset));
3353
3354 load = new(mem_ctx) vec4_instruction(this,
3355 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3356 temp, index, src_reg(grf_offset));
3357 } else {
3358 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3359 temp, index, offset);
3360 load->base_mrf = 14;
3361 load->mlen = 1;
3362 }
3363 emit_before(inst, load);
3364 }
3365
3366 /**
3367 * Implements array access of uniforms by inserting a
3368 * PULL_CONSTANT_LOAD instruction.
3369 *
3370 * Unlike temporary GRF array access (where we don't support it due to
3371 * the difficulty of doing relative addressing on instruction
3372 * destinations), we could potentially do array access of uniforms
3373 * that were loaded in GRF space as push constants. In real-world
3374 * usage we've seen, though, the arrays being used are always larger
3375 * than we could load as push constants, so just always move all
3376 * uniform array access out to a pull constant buffer.
3377 */
3378 void
3379 vec4_visitor::move_uniform_array_access_to_pull_constants()
3380 {
3381 int pull_constant_loc[this->uniforms];
3382
3383 for (int i = 0; i < this->uniforms; i++) {
3384 pull_constant_loc[i] = -1;
3385 }
3386
3387 /* Walk through and find array access of uniforms. Put a copy of that
3388 * uniform in the pull constant buffer.
3389 *
3390 * Note that we don't move constant-indexed accesses to arrays. No
3391 * testing has been done of the performance impact of this choice.
3392 */
3393 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3394 for (int i = 0 ; i < 3; i++) {
3395 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3396 continue;
3397
3398 int uniform = inst->src[i].reg;
3399
3400 /* If this array isn't already present in the pull constant buffer,
3401 * add it.
3402 */
3403 if (pull_constant_loc[uniform] == -1) {
3404 const gl_constant_value **values =
3405 &stage_prog_data->param[uniform * 4];
3406
3407 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3408
3409 assert(uniform < uniform_array_size);
3410 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3411 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3412 = values[j];
3413 }
3414 }
3415
3416 /* Set up the annotation tracking for new generated instructions. */
3417 base_ir = inst->ir;
3418 current_annotation = inst->annotation;
3419
3420 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3421
3422 emit_pull_constant_load(inst, temp, inst->src[i],
3423 pull_constant_loc[uniform]);
3424
3425 inst->src[i].file = temp.file;
3426 inst->src[i].reg = temp.reg;
3427 inst->src[i].reg_offset = temp.reg_offset;
3428 inst->src[i].reladdr = NULL;
3429 }
3430 }
3431
3432 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3433 * no need to track them as larger-than-vec4 objects. This will be
3434 * relied on in cutting out unused uniform vectors from push
3435 * constants.
3436 */
3437 split_uniform_registers();
3438 }
3439
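/**
 * If @reg is an unsigned (UD) register with the negate flag set, copy it
 * through a temporary MOV and point @reg at the temporary so that later
 * instructions see a source without the modifier.
 */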
3440 void
3441 vec4_visitor::resolve_ud_negate(src_reg *reg)
3442 {
3443 if (reg->type != BRW_REGISTER_TYPE_UD ||
3444 !reg->negate)
3445 return;
3446
3447 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3448 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3449 *reg = temp;
3450 }
3451
3452 vec4_visitor::vec4_visitor(struct brw_context *brw,
3453 struct brw_vec4_compile *c,
3454 struct gl_program *prog,
3455 const struct brw_vec4_prog_key *key,
3456 struct brw_vec4_prog_data *prog_data,
3457 struct gl_shader_program *shader_prog,
3458 gl_shader_stage stage,
3459 void *mem_ctx,
3460 bool debug_flag,
3461 bool no_spills,
3462 shader_time_shader_type st_base,
3463 shader_time_shader_type st_written,
3464 shader_time_shader_type st_reset)
3465 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3466 c(c),
3467 key(key),
3468 prog_data(prog_data),
3469 sanity_param_count(0),
3470 fail_msg(NULL),
3471 first_non_payload_grf(0),
3472 need_all_constants_in_pull_buffer(false),
3473 debug_flag(debug_flag),
3474 no_spills(no_spills),
3475 st_base(st_base),
3476 st_written(st_written),
3477 st_reset(st_reset)
3478 {
3479 this->mem_ctx = mem_ctx;
3480 this->failed = false;
3481
3482 this->base_ir = NULL;
3483 this->current_annotation = NULL;
3484 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3485
3486 this->variable_ht = hash_table_ctor(0,
3487 hash_table_pointer_hash,
3488 hash_table_pointer_compare);
3489
3490 this->virtual_grf_start = NULL;
3491 this->virtual_grf_end = NULL;
3492 this->virtual_grf_sizes = NULL;
3493 this->virtual_grf_count = 0;
3494 this->virtual_grf_reg_map = NULL;
3495 this->virtual_grf_reg_count = 0;
3496 this->virtual_grf_array_size = 0;
3497 this->live_intervals_valid = false;
3498
3499 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3500
3501 this->uniforms = 0;
3502
3503 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3504 * at least one. See setup_uniforms() in brw_vec4.cpp.
3505 */
3506 this->uniform_array_size = 1;
3507 if (prog_data) {
3508 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3509 }
3510
3511 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3512 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3513 }
3514
3515 vec4_visitor::~vec4_visitor()
3516 {
3517 hash_table_dtor(this->variable_ht);
3518 }
3519
3520
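/**
 * Record the first failure message for this compile, mark the visitor as
 * failed, and print the message when the debug flag is set.
 */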
3521 void
3522 vec4_visitor::fail(const char *format, ...)
3523 {
3524 va_list va;
3525 char *msg;
3526
3527 if (failed)
3528 return;
3529
3530 failed = true;
3531
3532 va_start(va, format);
3533 msg = ralloc_vasprintf(mem_ctx, format, va);
3534 va_end(va);
3535 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3536
3537 this->fail_msg = msg;
3538
3539 if (debug_flag) {
3540 fprintf(stderr, "%s", msg);
3541 }
3542 }
3543
3544 } /* namespace brw */