i965/vec4: Add support for ir_unop_saturate
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
123 const src_reg &src1) \
124 { \
125 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
126 src0, src1); \
127 }
128
129 #define ALU2_ACC(op) \
130 vec4_instruction * \
131 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
132 const src_reg &src1) \
133 { \
134 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
135 BRW_OPCODE_##op, dst, src0, src1); \
136 inst->writes_accumulator = true; \
137 return inst; \
138 }
139
140 #define ALU3(op) \
141 vec4_instruction * \
142 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
143 const src_reg &src1, const src_reg &src2) \
144 { \
145 assert(brw->gen >= 6); \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1, src2); \
148 }
149
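/* For illustration, each ALU macro above stamps out a small builder method.
 * ALU2(ADD), for example, expands to roughly:
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, dst,
 *                                            src0, src1);
 *    }
 *
 * These builders only construct the instruction; callers append it to the
 * instruction stream with emit(), e.g. emit(ADD(dst, a, b)).
 */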
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU1(F32TO16)
157 ALU1(F16TO32)
158 ALU2(ADD)
159 ALU2(MUL)
160 ALU2_ACC(MACH)
161 ALU2(AND)
162 ALU2(OR)
163 ALU2(XOR)
164 ALU2(DP3)
165 ALU2(DP4)
166 ALU2(DPH)
167 ALU2(SHL)
168 ALU2(SHR)
169 ALU2(ASR)
170 ALU3(LRP)
171 ALU1(BFREV)
172 ALU3(BFE)
173 ALU2(BFI1)
174 ALU3(BFI2)
175 ALU1(FBH)
176 ALU1(FBL)
177 ALU1(CBIT)
178 ALU3(MAD)
179 ALU2_ACC(ADDC)
180 ALU2_ACC(SUBB)
181 ALU2(MAC)
182
183 /** Gen4 predicated IF. */
184 vec4_instruction *
185 vec4_visitor::IF(enum brw_predicate predicate)
186 {
187 vec4_instruction *inst;
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
190 inst->predicate = predicate;
191
192 return inst;
193 }
194
195 /** Gen6 IF with embedded comparison. */
196 vec4_instruction *
197 vec4_visitor::IF(src_reg src0, src_reg src1,
198 enum brw_conditional_mod condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
221 enum brw_conditional_mod condition)
222 {
223 vec4_instruction *inst;
224
225 /* original gen4 does type conversion to the destination type
226 * before comparison, producing garbage results for floating
227 * point comparisons.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
244 vec4_instruction *
245 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
246 {
247 vec4_instruction *inst;
248
249 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
250 dst, index);
251 inst->base_mrf = 14;
252 inst->mlen = 2;
253
254 return inst;
255 }
256
257 vec4_instruction *
258 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
259 const src_reg &index)
260 {
261 vec4_instruction *inst;
262
263 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
264 dst, src, index);
265 inst->base_mrf = 13;
266 inst->mlen = 3;
267
268 return inst;
269 }
270
271 void
272 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
273 {
274 static enum opcode dot_opcodes[] = {
275 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
276 };
277
278 emit(dot_opcodes[elements - 2], dst, src0, src1);
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(src_reg src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(MOV(expanded, src));
304 return src_reg(expanded);
305 }
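/* For illustration, callers wrap each operand before emitting a three-source
 * instruction, as in the ir_triop_fma handling later in this file (sketch):
 *
 *    op[0] = fix_3src_operand(op[0]);
 *    op[1] = fix_3src_operand(op[1]);
 *    op[2] = fix_3src_operand(op[2]);
 *    emit(MAD(result_dst, op[2], op[1], op[0]));
 *
 * so that any UNIFORM or IMM operand has already been copied to a GRF the
 * three-source instruction can read.
 */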
306
307 src_reg
308 vec4_visitor::fix_math_operand(src_reg src)
309 {
310 /* The gen6 math instruction ignores the source modifiers --
311 * swizzle, abs, negate, and at least some parts of the register
312 * region description.
313 *
314 * Rather than trying to enumerate all these cases, *always* expand the
315 * operand to a temp GRF for gen6.
316 *
317 * For gen7, keep the operand as-is, except if immediate, which gen7 still
318 * can't use.
319 */
320
321 if (brw->gen == 7 && src.file != IMM)
322 return src;
323
324 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
325 expanded.type = src.type;
326 emit(MOV(expanded, src));
327 return src_reg(expanded);
328 }
329
330 void
331 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
332 {
333 src = fix_math_operand(src);
334
335 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
336 /* The gen6 math instruction must be align1, so we can't do
337 * writemasks.
338 */
339 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
340
341 emit(opcode, temp_dst, src);
342
343 emit(MOV(dst, src_reg(temp_dst)));
344 } else {
345 emit(opcode, dst, src);
346 }
347 }
348
349 void
350 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
351 {
352 vec4_instruction *inst = emit(opcode, dst, src);
353 inst->base_mrf = 1;
354 inst->mlen = 1;
355 }
356
357 void
358 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
359 {
360 switch (opcode) {
361 case SHADER_OPCODE_RCP:
362 case SHADER_OPCODE_RSQ:
363 case SHADER_OPCODE_SQRT:
364 case SHADER_OPCODE_EXP2:
365 case SHADER_OPCODE_LOG2:
366 case SHADER_OPCODE_SIN:
367 case SHADER_OPCODE_COS:
368 break;
369 default:
370 unreachable("not reached: bad math opcode");
371 }
372
373 if (brw->gen >= 8) {
374 emit(opcode, dst, src);
375 } else if (brw->gen >= 6) {
376 emit_math1_gen6(opcode, dst, src);
377 } else {
378 emit_math1_gen4(opcode, dst, src);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen6(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 src0 = fix_math_operand(src0);
387 src1 = fix_math_operand(src1);
388
389 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
390 /* The gen6 math instruction must be align1, so we can't do
391 * writemasks.
392 */
393 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
394 temp_dst.type = dst.type;
395
396 emit(opcode, temp_dst, src0, src1);
397
398 emit(MOV(dst, src_reg(temp_dst)));
399 } else {
400 emit(opcode, dst, src0, src1);
401 }
402 }
403
404 void
405 vec4_visitor::emit_math2_gen4(enum opcode opcode,
406 dst_reg dst, src_reg src0, src_reg src1)
407 {
408 vec4_instruction *inst = emit(opcode, dst, src0, src1);
409 inst->base_mrf = 1;
410 inst->mlen = 2;
411 }
412
413 void
414 vec4_visitor::emit_math(enum opcode opcode,
415 dst_reg dst, src_reg src0, src_reg src1)
416 {
417 switch (opcode) {
418 case SHADER_OPCODE_POW:
419 case SHADER_OPCODE_INT_QUOTIENT:
420 case SHADER_OPCODE_INT_REMAINDER:
421 break;
422 default:
423 unreachable("not reached: unsupported binary math opcode");
424 }
425
426 if (brw->gen >= 8) {
427 emit(opcode, dst, src0, src1);
428 } else if (brw->gen >= 6) {
429 emit_math2_gen6(opcode, dst, src0, src1);
430 } else {
431 emit_math2_gen4(opcode, dst, src0, src1);
432 }
433 }
434
435 void
436 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
437 {
438 if (brw->gen < 7) {
439 unreachable("ir_unop_pack_half_2x16 should be lowered");
440 }
441
442 assert(dst.type == BRW_REGISTER_TYPE_UD);
443 assert(src0.type == BRW_REGISTER_TYPE_F);
444
445 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
446 *
447 * Because this instruction does not have a 16-bit floating-point type,
448 * the destination data type must be Word (W).
449 *
450 * The destination must be DWord-aligned and specify a horizontal stride
451 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
452 * each destination channel and the upper word is not modified.
453 *
454 * The above restriction implies that the f32to16 instruction must use
455 * align1 mode, because only in align1 mode is it possible to specify
456 * horizontal stride. We choose here to defy the hardware docs and emit
457 * align16 instructions.
458 *
459 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
460 * instructions. I was partially successful in that the code passed all
461 * tests. However, the code was dubiously correct and fragile, and the
462 * tests were not harsh enough to probe that frailty. Not trusting the
463 * code, I chose instead to remain in align16 mode in defiance of the hw
464 * docs).
465 *
466 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
467 * simulator, emitting a f32to16 in align16 mode with UD as destination
468 * data type is safe. The behavior differs from that specified in the PRM
469 * in that the upper word of each destination channel is cleared to 0.
470 */
471
472 dst_reg tmp_dst(this, glsl_type::uvec2_type);
473 src_reg tmp_src(tmp_dst);
474
475 #if 0
476 /* Verify the undocumented behavior on which the following instructions
477 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
478 * then the result of the bit-or instruction below will be incorrect.
479 *
480 * You should inspect the disasm output in order to verify that the MOV is
481 * not optimized away.
482 */
483 emit(MOV(tmp_dst, src_reg(0x12345678u)));
484 #endif
485
486 /* Give tmp the form below, where "." means untouched.
487 *
488 * w z y x w z y x
489 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
490 *
491 * That the upper word of each write-channel be 0 is required for the
492 * following bit-shift and bit-or instructions to work. Note that this
493 * relies on the undocumented hardware behavior mentioned above.
494 */
495 tmp_dst.writemask = WRITEMASK_XY;
496 emit(F32TO16(tmp_dst, src0));
497
498 /* Give the write-channels of dst the form:
499 * 0xhhhh0000
500 */
501 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
502 emit(SHL(dst, tmp_src, src_reg(16u)));
503
504 /* Finally, give the write-channels of dst the form of packHalf2x16's
505 * output:
506 * 0xhhhhllll
507 */
508 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
509 emit(OR(dst, src_reg(dst), tmp_src));
510 }
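/* For reference, the F32TO16/SHL/OR sequence above produces the same result
 * as a scalar packHalf2x16 would (a sketch; f32_to_f16() is a hypothetical
 * float-to-half helper standing in for the F32TO16 instruction):
 *
 *    uint32_t lo = f32_to_f16(src.x);     // tmp.x = 0x0000llll
 *    uint32_t hi = f32_to_f16(src.y);     // tmp.y = 0x0000hhhh
 *    uint32_t dst = (hi << 16) | lo;      // 0xhhhhllll
 */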
511
512 void
513 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
514 {
515 if (brw->gen < 7) {
516 unreachable("ir_unop_unpack_half_2x16 should be lowered");
517 }
518
519 assert(dst.type == BRW_REGISTER_TYPE_F);
520 assert(src0.type == BRW_REGISTER_TYPE_UD);
521
522 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
523 *
524 * Because this instruction does not have a 16-bit floating-point type,
525 * the source data type must be Word (W). The destination type must be
526 * F (Float).
527 *
528 * To use W as the source data type, we must adjust horizontal strides,
529 * which is only possible in align1 mode. All my [chadv] attempts at
530 * emitting align1 instructions for unpackHalf2x16 failed to pass the
531 * Piglit tests, so I gave up.
532 *
533 * I've verified that, on gen7 hardware and the simulator, it is safe to
534 * emit f16to32 in align16 mode with UD as source data type.
535 */
536
537 dst_reg tmp_dst(this, glsl_type::uvec2_type);
538 src_reg tmp_src(tmp_dst);
539
540 tmp_dst.writemask = WRITEMASK_X;
541 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
542
543 tmp_dst.writemask = WRITEMASK_Y;
544 emit(SHR(tmp_dst, src0, src_reg(16u)));
545
546 dst.writemask = WRITEMASK_XY;
547 emit(F16TO32(dst, tmp_src));
548 }
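/* For reference, the AND/SHR/F16TO32 sequence above mirrors the scalar
 * unpacking (a sketch; f16_to_f32() is a hypothetical half-to-float helper
 * standing in for the F16TO32 instruction):
 *
 *    float x = f16_to_f32(packed & 0xffff);   // low word  -> dst.x
 *    float y = f16_to_f32(packed >> 16);      // high word -> dst.y
 */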
549
550 void
551 vec4_visitor::visit_instructions(const exec_list *list)
552 {
553 foreach_in_list(ir_instruction, ir, list) {
554 base_ir = ir;
555 ir->accept(this);
556 }
557 }
558
559
560 static int
561 type_size(const struct glsl_type *type)
562 {
563 unsigned int i;
564 int size;
565
566 switch (type->base_type) {
567 case GLSL_TYPE_UINT:
568 case GLSL_TYPE_INT:
569 case GLSL_TYPE_FLOAT:
570 case GLSL_TYPE_BOOL:
571 if (type->is_matrix()) {
572 return type->matrix_columns;
573 } else {
574 /* Regardless of size of vector, it gets a vec4. This is bad
575 * packing for things like floats, but otherwise arrays become a
576 * mess. Hopefully a later pass over the code can pack scalars
577 * down if appropriate.
578 */
579 return 1;
580 }
581 case GLSL_TYPE_ARRAY:
582 assert(type->length > 0);
583 return type_size(type->fields.array) * type->length;
584 case GLSL_TYPE_STRUCT:
585 size = 0;
586 for (i = 0; i < type->length; i++) {
587 size += type_size(type->fields.structure[i].type);
588 }
589 return size;
590 case GLSL_TYPE_SAMPLER:
591 /* Samplers take up one slot in UNIFORMS[], but they're baked in
592 * at link time.
593 */
594 return 1;
595 case GLSL_TYPE_ATOMIC_UINT:
596 return 0;
597 case GLSL_TYPE_IMAGE:
598 case GLSL_TYPE_VOID:
599 case GLSL_TYPE_ERROR:
600 case GLSL_TYPE_INTERFACE:
601 unreachable("not reached");
602 }
603
604 return 0;
605 }
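/* For illustration, some sizes type_size() returns, in vec4 slots: float,
 * vec2, vec3 and vec4 each take 1 slot; mat4 takes 4 (one per column);
 * vec4[10] takes 10; struct { vec3 a; float b; } takes 2 (one per member).
 */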
606
607 int
608 vec4_visitor::virtual_grf_alloc(int size)
609 {
610 if (virtual_grf_array_size <= virtual_grf_count) {
611 if (virtual_grf_array_size == 0)
612 virtual_grf_array_size = 16;
613 else
614 virtual_grf_array_size *= 2;
615 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
616 virtual_grf_array_size);
617 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
618 virtual_grf_array_size);
619 }
620 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
621 virtual_grf_reg_count += size;
622 virtual_grf_sizes[virtual_grf_count] = size;
623 return virtual_grf_count++;
624 }
625
626 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
627 {
628 init();
629
630 this->file = GRF;
631 this->reg = v->virtual_grf_alloc(type_size(type));
632
633 if (type->is_array() || type->is_record()) {
634 this->swizzle = BRW_SWIZZLE_NOOP;
635 } else {
636 this->swizzle = swizzle_for_size(type->vector_elements);
637 }
638
639 this->type = brw_type_for_base_type(type);
640 }
641
642 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
643 {
644 init();
645
646 this->file = GRF;
647 this->reg = v->virtual_grf_alloc(type_size(type));
648
649 if (type->is_array() || type->is_record()) {
650 this->writemask = WRITEMASK_XYZW;
651 } else {
652 this->writemask = (1 << type->vector_elements) - 1;
653 }
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 /* Our support for uniforms is piggy-backed on the struct
659 * gl_fragment_program, because that's where the values actually
660 * get stored, rather than in some global gl_shader_program uniform
661 * store.
662 */
663 void
664 vec4_visitor::setup_uniform_values(ir_variable *ir)
665 {
666 int namelen = strlen(ir->name);
667
668 /* The data for our (non-builtin) uniforms is stored in a series of
669 * gl_uniform_driver_storage structs for each subcomponent that
670 * glGetUniformLocation() could name. We know it's been set up in the same
671 * order we'd walk the type, so walk the list of storage and find anything
672 * with our name, or the prefix of a component that starts with our name.
673 */
674 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
675 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
676
677 if (strncmp(ir->name, storage->name, namelen) != 0 ||
678 (storage->name[namelen] != 0 &&
679 storage->name[namelen] != '.' &&
680 storage->name[namelen] != '[')) {
681 continue;
682 }
683
684 gl_constant_value *components = storage->storage;
685 unsigned vector_count = (MAX2(storage->array_elements, 1) *
686 storage->type->matrix_columns);
687
688 for (unsigned s = 0; s < vector_count; s++) {
689 assert(uniforms < uniform_array_size);
690 uniform_vector_size[uniforms] = storage->type->vector_elements;
691
692 int i;
693 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
694 stage_prog_data->param[uniforms * 4 + i] = components;
695 components++;
696 }
697 for (; i < 4; i++) {
698 static gl_constant_value zero = { 0.0 };
699 stage_prog_data->param[uniforms * 4 + i] = &zero;
700 }
701
702 uniforms++;
703 }
704 }
705 }
706
707 void
708 vec4_visitor::setup_uniform_clipplane_values()
709 {
710 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
711
712 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
713 assert(this->uniforms < uniform_array_size);
714 this->uniform_vector_size[this->uniforms] = 4;
715 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
716 this->userplane[i].type = BRW_REGISTER_TYPE_F;
717 for (int j = 0; j < 4; ++j) {
718 stage_prog_data->param[this->uniforms * 4 + j] =
719 (gl_constant_value *) &clip_planes[i][j];
720 }
721 ++this->uniforms;
722 }
723 }
724
725 /* Our support for builtin uniforms is even scarier than non-builtin.
726 * It sits on top of the PROG_STATE_VAR parameters that are
727 * automatically updated from GL context state.
728 */
729 void
730 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
731 {
732 const ir_state_slot *const slots = ir->state_slots;
733 assert(ir->state_slots != NULL);
734
735 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
736 /* This state reference has already been setup by ir_to_mesa,
737 * but we'll get the same index back here. We can reference
738 * ParameterValues directly, since unlike brw_fs.cpp, we never
739 * add new state references during compile.
740 */
741 int index = _mesa_add_state_reference(this->prog->Parameters,
742 (gl_state_index *)slots[i].tokens);
743 gl_constant_value *values =
744 &this->prog->Parameters->ParameterValues[index][0];
745
746 assert(this->uniforms < uniform_array_size);
747 this->uniform_vector_size[this->uniforms] = 0;
748 /* Add each of the unique swizzled channels of the element.
749 * This will end up matching the size of the glsl_type of this field.
750 */
751 int last_swiz = -1;
752 for (unsigned int j = 0; j < 4; j++) {
753 int swiz = GET_SWZ(slots[i].swizzle, j);
754 last_swiz = swiz;
755
756 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
757 assert(this->uniforms < uniform_array_size);
758 if (swiz <= last_swiz)
759 this->uniform_vector_size[this->uniforms]++;
760 }
761 this->uniforms++;
762 }
763 }
764
765 dst_reg *
766 vec4_visitor::variable_storage(ir_variable *var)
767 {
768 return (dst_reg *)hash_table_find(this->variable_ht, var);
769 }
770
771 void
772 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
773 enum brw_predicate *predicate)
774 {
775 ir_expression *expr = ir->as_expression();
776
777 *predicate = BRW_PREDICATE_NORMAL;
778
779 if (expr) {
780 src_reg op[2];
781 vec4_instruction *inst;
782
783 assert(expr->get_num_operands() <= 2);
784 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
785 expr->operands[i]->accept(this);
786 op[i] = this->result;
787
788 resolve_ud_negate(&op[i]);
789 }
790
791 switch (expr->operation) {
792 case ir_unop_logic_not:
793 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
794 inst->conditional_mod = BRW_CONDITIONAL_Z;
795 break;
796
797 case ir_binop_logic_xor:
798 inst = emit(XOR(dst_null_d(), op[0], op[1]));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 break;
801
802 case ir_binop_logic_or:
803 inst = emit(OR(dst_null_d(), op[0], op[1]));
804 inst->conditional_mod = BRW_CONDITIONAL_NZ;
805 break;
806
807 case ir_binop_logic_and:
808 inst = emit(AND(dst_null_d(), op[0], op[1]));
809 inst->conditional_mod = BRW_CONDITIONAL_NZ;
810 break;
811
812 case ir_unop_f2b:
813 if (brw->gen >= 6) {
814 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
815 } else {
816 inst = emit(MOV(dst_null_f(), op[0]));
817 inst->conditional_mod = BRW_CONDITIONAL_NZ;
818 }
819 break;
820
821 case ir_unop_i2b:
822 if (brw->gen >= 6) {
823 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
824 } else {
825 inst = emit(MOV(dst_null_d(), op[0]));
826 inst->conditional_mod = BRW_CONDITIONAL_NZ;
827 }
828 break;
829
830 case ir_binop_all_equal:
831 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
832 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
833 break;
834
835 case ir_binop_any_nequal:
836 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
837 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
838 break;
839
840 case ir_unop_any:
841 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
842 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
843 break;
844
845 case ir_binop_greater:
846 case ir_binop_gequal:
847 case ir_binop_less:
848 case ir_binop_lequal:
849 case ir_binop_equal:
850 case ir_binop_nequal:
851 emit(CMP(dst_null_d(), op[0], op[1],
852 brw_conditional_for_comparison(expr->operation)));
853 break;
854
855 default:
856 unreachable("not reached");
857 }
858 return;
859 }
860
861 ir->accept(this);
862
863 resolve_ud_negate(&this->result);
864
865 if (brw->gen >= 6) {
866 vec4_instruction *inst = emit(AND(dst_null_d(),
867 this->result, src_reg(1)));
868 inst->conditional_mod = BRW_CONDITIONAL_NZ;
869 } else {
870 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
871 inst->conditional_mod = BRW_CONDITIONAL_NZ;
872 }
873 }
874
875 /**
876 * Emit a gen6 IF statement with the comparison folded into the IF
877 * instruction.
878 */
879 void
880 vec4_visitor::emit_if_gen6(ir_if *ir)
881 {
882 ir_expression *expr = ir->condition->as_expression();
883
884 if (expr) {
885 src_reg op[2];
886 dst_reg temp;
887
888 assert(expr->get_num_operands() <= 2);
889 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
890 expr->operands[i]->accept(this);
891 op[i] = this->result;
892 }
893
894 switch (expr->operation) {
895 case ir_unop_logic_not:
896 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
897 return;
898
899 case ir_binop_logic_xor:
900 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
901 return;
902
903 case ir_binop_logic_or:
904 temp = dst_reg(this, glsl_type::bool_type);
905 emit(OR(temp, op[0], op[1]));
906 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
907 return;
908
909 case ir_binop_logic_and:
910 temp = dst_reg(this, glsl_type::bool_type);
911 emit(AND(temp, op[0], op[1]));
912 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
913 return;
914
915 case ir_unop_f2b:
916 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
917 return;
918
919 case ir_unop_i2b:
920 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
921 return;
922
923 case ir_binop_greater:
924 case ir_binop_gequal:
925 case ir_binop_less:
926 case ir_binop_lequal:
927 case ir_binop_equal:
928 case ir_binop_nequal:
929 emit(IF(op[0], op[1],
930 brw_conditional_for_comparison(expr->operation)));
931 return;
932
933 case ir_binop_all_equal:
934 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
935 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
936 return;
937
938 case ir_binop_any_nequal:
939 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
940 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
941 return;
942
943 case ir_unop_any:
944 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
945 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
946 return;
947
948 default:
949 unreachable("not reached");
950 }
951 return;
952 }
953
954 ir->condition->accept(this);
955
956 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
957 }
958
959 void
960 vec4_visitor::visit(ir_variable *ir)
961 {
962 dst_reg *reg = NULL;
963
964 if (variable_storage(ir))
965 return;
966
967 switch (ir->data.mode) {
968 case ir_var_shader_in:
969 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
970 break;
971
972 case ir_var_shader_out:
973 reg = new(mem_ctx) dst_reg(this, ir->type);
974
975 for (int i = 0; i < type_size(ir->type); i++) {
976 output_reg[ir->data.location + i] = *reg;
977 output_reg[ir->data.location + i].reg_offset = i;
978 output_reg[ir->data.location + i].type =
979 brw_type_for_base_type(ir->type->get_scalar_type());
980 output_reg_annotation[ir->data.location + i] = ir->name;
981 }
982 break;
983
984 case ir_var_auto:
985 case ir_var_temporary:
986 reg = new(mem_ctx) dst_reg(this, ir->type);
987 break;
988
989 case ir_var_uniform:
990 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
991
992 /* Thanks to the lower_ubo_reference pass, we will see only
993 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
994 * variables, so no need for them to be in variable_ht.
995 *
996 * Atomic counters take no uniform storage, no need to do
997 * anything here.
998 */
999 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
1000 return;
1001
1002 /* Track how big the whole uniform variable is, in case we need to put a
1003 * copy of its data into pull constants for array access.
1004 */
1005 assert(this->uniforms < uniform_array_size);
1006 this->uniform_size[this->uniforms] = type_size(ir->type);
1007
1008 if (!strncmp(ir->name, "gl_", 3)) {
1009 setup_builtin_uniform_values(ir);
1010 } else {
1011 setup_uniform_values(ir);
1012 }
1013 break;
1014
1015 case ir_var_system_value:
1016 reg = make_reg_for_system_value(ir);
1017 break;
1018
1019 default:
1020 unreachable("not reached");
1021 }
1022
1023 reg->type = brw_type_for_base_type(ir->type);
1024 hash_table_insert(this->variable_ht, reg, ir);
1025 }
1026
1027 void
1028 vec4_visitor::visit(ir_loop *ir)
1029 {
1030 /* We don't want debugging output to print the whole body of the
1031 * loop as the annotation.
1032 */
1033 this->base_ir = NULL;
1034
1035 emit(BRW_OPCODE_DO);
1036
1037 visit_instructions(&ir->body_instructions);
1038
1039 emit(BRW_OPCODE_WHILE);
1040 }
1041
1042 void
1043 vec4_visitor::visit(ir_loop_jump *ir)
1044 {
1045 switch (ir->mode) {
1046 case ir_loop_jump::jump_break:
1047 emit(BRW_OPCODE_BREAK);
1048 break;
1049 case ir_loop_jump::jump_continue:
1050 emit(BRW_OPCODE_CONTINUE);
1051 break;
1052 }
1053 }
1054
1055
1056 void
1057 vec4_visitor::visit(ir_function_signature *)
1058 {
1059 unreachable("not reached");
1060 }
1061
1062 void
1063 vec4_visitor::visit(ir_function *ir)
1064 {
1065 /* Ignore function bodies other than main() -- we shouldn't see calls to
1066 * them since they should all be inlined.
1067 */
1068 if (strcmp(ir->name, "main") == 0) {
1069 const ir_function_signature *sig;
1070 exec_list empty;
1071
1072 sig = ir->matching_signature(NULL, &empty, false);
1073
1074 assert(sig);
1075
1076 visit_instructions(&sig->body);
1077 }
1078 }
1079
1080 bool
1081 vec4_visitor::try_emit_sat(ir_expression *ir)
1082 {
1083 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1084 if (!sat_src)
1085 return false;
1086
1087 sat_src->accept(this);
1088 src_reg src = this->result;
1089
1090 this->result = src_reg(this, ir->type);
1091 vec4_instruction *inst;
1092 inst = emit(MOV(dst_reg(this->result), src));
1093 inst->saturate = true;
1094
1095 return true;
1096 }
1097
1098 bool
1099 vec4_visitor::try_emit_mad(ir_expression *ir)
1100 {
1101 /* 3-src instructions were introduced in gen6. */
1102 if (brw->gen < 6)
1103 return false;
1104
1105 /* MAD can only handle floating-point data. */
1106 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1107 return false;
1108
1109 ir_rvalue *nonmul = ir->operands[1];
1110 ir_expression *mul = ir->operands[0]->as_expression();
1111
1112 if (!mul || mul->operation != ir_binop_mul) {
1113 nonmul = ir->operands[0];
1114 mul = ir->operands[1]->as_expression();
1115
1116 if (!mul || mul->operation != ir_binop_mul)
1117 return false;
1118 }
1119
1120 nonmul->accept(this);
1121 src_reg src0 = fix_3src_operand(this->result);
1122
1123 mul->operands[0]->accept(this);
1124 src_reg src1 = fix_3src_operand(this->result);
1125
1126 mul->operands[1]->accept(this);
1127 src_reg src2 = fix_3src_operand(this->result);
1128
1129 this->result = src_reg(this, ir->type);
1130 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1131
1132 return true;
1133 }
1134
1135 bool
1136 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1137 {
1138 /* This optimization relies on CMP setting the destination to 0 when
1139 * false. Early hardware only sets the least significant bit, and
1140 * leaves the other bits undefined. So we can't use it.
1141 */
1142 if (brw->gen < 6)
1143 return false;
1144
1145 ir_expression *const cmp = ir->operands[0]->as_expression();
1146
1147 if (cmp == NULL)
1148 return false;
1149
1150 switch (cmp->operation) {
1151 case ir_binop_less:
1152 case ir_binop_greater:
1153 case ir_binop_lequal:
1154 case ir_binop_gequal:
1155 case ir_binop_equal:
1156 case ir_binop_nequal:
1157 break;
1158
1159 default:
1160 return false;
1161 }
1162
1163 cmp->operands[0]->accept(this);
1164 const src_reg cmp_src0 = this->result;
1165
1166 cmp->operands[1]->accept(this);
1167 const src_reg cmp_src1 = this->result;
1168
1169 this->result = src_reg(this, ir->type);
1170
1171 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1172 brw_conditional_for_comparison(cmp->operation)));
1173
1174 /* If the comparison is false, this->result will just happen to be zero.
1175 */
1176 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1177 this->result, src_reg(1.0f));
1178 inst->predicate = BRW_PREDICATE_NORMAL;
1179 inst->predicate_inverse = true;
1180
1181 return true;
1182 }
1183
1184 void
1185 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1186 src_reg src0, src_reg src1)
1187 {
1188 vec4_instruction *inst;
1189
1190 if (brw->gen >= 6) {
1191 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1192 inst->conditional_mod = conditionalmod;
1193 } else {
1194 emit(CMP(dst, src0, src1, conditionalmod));
1195
1196 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1197 inst->predicate = BRW_PREDICATE_NORMAL;
1198 }
1199 }
1200
1201 void
1202 vec4_visitor::emit_lrp(const dst_reg &dst,
1203 const src_reg &x, const src_reg &y, const src_reg &a)
1204 {
1205 if (brw->gen >= 6) {
1206 /* Note that the instruction's argument order is reversed from GLSL
1207 * and the IR.
1208 */
1209 emit(LRP(dst,
1210 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1211 } else {
1212 /* Earlier generations don't support three source operations, so we
1213 * need to emit x*(1-a) + y*a.
1214 */
1215 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1216 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1217 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1218 y_times_a.writemask = dst.writemask;
1219 one_minus_a.writemask = dst.writemask;
1220 x_times_one_minus_a.writemask = dst.writemask;
1221
1222 emit(MUL(y_times_a, y, a));
1223 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1224 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1225 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1226 }
1227 }
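/* For reference, both paths above evaluate GLSL's mix():
 *
 *    lrp(x, y, a) = x*(1-a) + y*a = x + a*(y - x)
 *
 * The gen6+ path hands (a, y, x) to the hardware LRP instruction, while the
 * gen4/5 fallback spells out the MUL/ADD expansion of x*(1-a) + y*a.
 */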
1228
1229 void
1230 vec4_visitor::visit(ir_expression *ir)
1231 {
1232 unsigned int operand;
1233 src_reg op[Elements(ir->operands)];
1234 src_reg result_src;
1235 dst_reg result_dst;
1236 vec4_instruction *inst;
1237
1238 if (try_emit_sat(ir))
1239 return;
1240
1241 if (ir->operation == ir_binop_add) {
1242 if (try_emit_mad(ir))
1243 return;
1244 }
1245
1246 if (ir->operation == ir_unop_b2f) {
1247 if (try_emit_b2f_of_compare(ir))
1248 return;
1249 }
1250
1251 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1252 this->result.file = BAD_FILE;
1253 ir->operands[operand]->accept(this);
1254 if (this->result.file == BAD_FILE) {
1255 fprintf(stderr, "Failed to get tree for expression operand:\n");
1256 ir->operands[operand]->fprint(stderr);
1257 exit(1);
1258 }
1259 op[operand] = this->result;
1260
1261 /* Matrix expression operands should have been broken down to vector
1262 * operations already.
1263 */
1264 assert(!ir->operands[operand]->type->is_matrix());
1265 }
1266
1267 int vector_elements = ir->operands[0]->type->vector_elements;
1268 if (ir->operands[1]) {
1269 vector_elements = MAX2(vector_elements,
1270 ir->operands[1]->type->vector_elements);
1271 }
1272
1273 this->result.file = BAD_FILE;
1274
1275 /* Storage for our result. Ideally for an assignment we'd be using
1276 * the actual storage for the result here, instead.
1277 */
1278 result_src = src_reg(this, ir->type);
1279 /* convenience for the emit functions below. */
1280 result_dst = dst_reg(result_src);
1281 /* If nothing special happens, this is the result. */
1282 this->result = result_src;
1283 /* Limit writes to the channels that will be used by result_src later.
1284 * This does limit this temp's use as a temporary for multi-instruction
1285 * sequences.
1286 */
1287 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1288
1289 switch (ir->operation) {
1290 case ir_unop_logic_not:
1291 if (ctx->Const.UniformBooleanTrue != 1) {
1292 emit(NOT(result_dst, op[0]));
1293 } else {
1294 emit(XOR(result_dst, op[0], src_reg(1)));
1295 }
1296 break;
1297 case ir_unop_neg:
1298 op[0].negate = !op[0].negate;
1299 emit(MOV(result_dst, op[0]));
1300 break;
1301 case ir_unop_abs:
1302 op[0].abs = true;
1303 op[0].negate = false;
1304 emit(MOV(result_dst, op[0]));
1305 break;
1306
1307 case ir_unop_sign:
1308 if (ir->type->is_float()) {
1309 /* AND(val, 0x80000000) gives the sign bit.
1310 *
1311 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1312 * zero.
1313 */
1314 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1315
1316 op[0].type = BRW_REGISTER_TYPE_UD;
1317 result_dst.type = BRW_REGISTER_TYPE_UD;
1318 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1319
1320 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1321 inst->predicate = BRW_PREDICATE_NORMAL;
1322
1323 this->result.type = BRW_REGISTER_TYPE_F;
1324 } else {
1325 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1326 * -> non-negative val generates 0x00000000.
1327 * Predicated OR sets 1 if val is positive.
1328 */
1329 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1330
1331 emit(ASR(result_dst, op[0], src_reg(31)));
1332
1333 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1334 inst->predicate = BRW_PREDICATE_NORMAL;
1335 }
1336 break;
1337
1338 case ir_unop_rcp:
1339 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1340 break;
1341
1342 case ir_unop_exp2:
1343 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1344 break;
1345 case ir_unop_log2:
1346 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1347 break;
1348 case ir_unop_exp:
1349 case ir_unop_log:
1350 unreachable("not reached: should be handled by ir_explog_to_explog2");
1351 case ir_unop_sin:
1352 case ir_unop_sin_reduced:
1353 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1354 break;
1355 case ir_unop_cos:
1356 case ir_unop_cos_reduced:
1357 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1358 break;
1359
1360 case ir_unop_dFdx:
1361 case ir_unop_dFdx_coarse:
1362 case ir_unop_dFdx_fine:
1363 case ir_unop_dFdy:
1364 case ir_unop_dFdy_coarse:
1365 case ir_unop_dFdy_fine:
1366 unreachable("derivatives not valid in vertex shader");
1367
1368 case ir_unop_bitfield_reverse:
1369 emit(BFREV(result_dst, op[0]));
1370 break;
1371 case ir_unop_bit_count:
1372 emit(CBIT(result_dst, op[0]));
1373 break;
1374 case ir_unop_find_msb: {
1375 src_reg temp = src_reg(this, glsl_type::uint_type);
1376
1377 inst = emit(FBH(dst_reg(temp), op[0]));
1378 inst->dst.writemask = WRITEMASK_XYZW;
1379
1380 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1381 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1382 * subtract the result from 31 to convert the MSB count into an LSB count.
1383 */
1384
1385 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1386 temp.swizzle = BRW_SWIZZLE_NOOP;
1387 emit(MOV(result_dst, temp));
1388
1389 src_reg src_tmp = src_reg(result_dst);
1390 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1391
1392 src_tmp.negate = true;
1393 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1394 inst->predicate = BRW_PREDICATE_NORMAL;
1395 break;
1396 }
1397 case ir_unop_find_lsb:
1398 emit(FBL(result_dst, op[0]));
1399 break;
1400 case ir_unop_saturate:
1401 inst = emit(MOV(result_dst, op[0]));
1402 inst->saturate = true;
1403 break;
1404
1405 case ir_unop_noise:
1406 unreachable("not reached: should be handled by lower_noise");
1407
1408 case ir_binop_add:
1409 emit(ADD(result_dst, op[0], op[1]));
1410 break;
1411 case ir_binop_sub:
1412 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1413
1414 case ir_binop_mul:
1415 if (brw->gen < 8 && ir->type->is_integer()) {
1416 /* For integer multiplication, the MUL uses the low 16 bits of one of
1417 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1418 * accumulates in the contribution of the upper 16 bits of that
1419 * operand. If we can determine that one of the args is in the low
1420 * 16 bits, though, we can just emit a single MUL.
1421 */
1422 if (ir->operands[0]->is_uint16_constant()) {
1423 if (brw->gen < 7)
1424 emit(MUL(result_dst, op[0], op[1]));
1425 else
1426 emit(MUL(result_dst, op[1], op[0]));
1427 } else if (ir->operands[1]->is_uint16_constant()) {
1428 if (brw->gen < 7)
1429 emit(MUL(result_dst, op[1], op[0]));
1430 else
1431 emit(MUL(result_dst, op[0], op[1]));
1432 } else {
1433 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1434
1435 emit(MUL(acc, op[0], op[1]));
1436 emit(MACH(dst_null_d(), op[0], op[1]));
1437 emit(MOV(result_dst, src_reg(acc)));
1438 }
1439 } else {
1440 emit(MUL(result_dst, op[0], op[1]));
1441 }
1442 break;
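   /* For reference, the general integer path above boils down to (sketch):
    *
    *    MUL  acc,  op0, op1   // partial product from the low 16 bits of one operand
    *    MACH null, op0, op1   // folds in that operand's upper 16 bits via the accumulator
    *    MOV  dst,  acc        // copy the completed 32-bit product out of the accumulator
    */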
1443 case ir_binop_imul_high: {
1444 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1445
1446 emit(MUL(acc, op[0], op[1]));
1447 emit(MACH(result_dst, op[0], op[1]));
1448 break;
1449 }
1450 case ir_binop_div:
1451 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1452 assert(ir->type->is_integer());
1453 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1454 break;
1455 case ir_binop_carry: {
1456 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1457
1458 emit(ADDC(dst_null_ud(), op[0], op[1]));
1459 emit(MOV(result_dst, src_reg(acc)));
1460 break;
1461 }
1462 case ir_binop_borrow: {
1463 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1464
1465 emit(SUBB(dst_null_ud(), op[0], op[1]));
1466 emit(MOV(result_dst, src_reg(acc)));
1467 break;
1468 }
1469 case ir_binop_mod:
1470 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1471 assert(ir->type->is_integer());
1472 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1473 break;
1474
1475 case ir_binop_less:
1476 case ir_binop_greater:
1477 case ir_binop_lequal:
1478 case ir_binop_gequal:
1479 case ir_binop_equal:
1480 case ir_binop_nequal: {
1481 emit(CMP(result_dst, op[0], op[1],
1482 brw_conditional_for_comparison(ir->operation)));
1483 if (ctx->Const.UniformBooleanTrue == 1) {
1484 emit(AND(result_dst, result_src, src_reg(1)));
1485 }
1486 break;
1487 }
1488
1489 case ir_binop_all_equal:
1490 /* "==" operator producing a scalar boolean. */
1491 if (ir->operands[0]->type->is_vector() ||
1492 ir->operands[1]->type->is_vector()) {
1493 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1494 emit(MOV(result_dst, src_reg(0)));
1495 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1496 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1497 } else {
1498 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1499 if (ctx->Const.UniformBooleanTrue == 1) {
1500 emit(AND(result_dst, result_src, src_reg(1)));
1501 }
1502 }
1503 break;
1504 case ir_binop_any_nequal:
1505 /* "!=" operator producing a scalar boolean. */
1506 if (ir->operands[0]->type->is_vector() ||
1507 ir->operands[1]->type->is_vector()) {
1508 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1509
1510 emit(MOV(result_dst, src_reg(0)));
1511 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1512 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1513 } else {
1514 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1515 if (ctx->Const.UniformBooleanTrue == 1) {
1516 emit(AND(result_dst, result_src, src_reg(1)));
1517 }
1518 }
1519 break;
1520
1521 case ir_unop_any:
1522 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1523 emit(MOV(result_dst, src_reg(0)));
1524
1525 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1526 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1527 break;
1528
1529 case ir_binop_logic_xor:
1530 emit(XOR(result_dst, op[0], op[1]));
1531 break;
1532
1533 case ir_binop_logic_or:
1534 emit(OR(result_dst, op[0], op[1]));
1535 break;
1536
1537 case ir_binop_logic_and:
1538 emit(AND(result_dst, op[0], op[1]));
1539 break;
1540
1541 case ir_binop_dot:
1542 assert(ir->operands[0]->type->is_vector());
1543 assert(ir->operands[0]->type == ir->operands[1]->type);
1544 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1545 break;
1546
1547 case ir_unop_sqrt:
1548 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1549 break;
1550 case ir_unop_rsq:
1551 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1552 break;
1553
1554 case ir_unop_bitcast_i2f:
1555 case ir_unop_bitcast_u2f:
1556 this->result = op[0];
1557 this->result.type = BRW_REGISTER_TYPE_F;
1558 break;
1559
1560 case ir_unop_bitcast_f2i:
1561 this->result = op[0];
1562 this->result.type = BRW_REGISTER_TYPE_D;
1563 break;
1564
1565 case ir_unop_bitcast_f2u:
1566 this->result = op[0];
1567 this->result.type = BRW_REGISTER_TYPE_UD;
1568 break;
1569
1570 case ir_unop_i2f:
1571 case ir_unop_i2u:
1572 case ir_unop_u2i:
1573 case ir_unop_u2f:
1574 case ir_unop_f2i:
1575 case ir_unop_f2u:
1576 emit(MOV(result_dst, op[0]));
1577 break;
1578 case ir_unop_b2i:
1579 if (ctx->Const.UniformBooleanTrue != 1) {
1580 emit(AND(result_dst, op[0], src_reg(1)));
1581 } else {
1582 emit(MOV(result_dst, op[0]));
1583 }
1584 break;
1585 case ir_unop_b2f:
1586 if (ctx->Const.UniformBooleanTrue != 1) {
1587 op[0].type = BRW_REGISTER_TYPE_UD;
1588 result_dst.type = BRW_REGISTER_TYPE_UD;
1589 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1590 result_dst.type = BRW_REGISTER_TYPE_F;
1591 } else {
1592 emit(MOV(result_dst, op[0]));
1593 }
1594 break;
1595 case ir_unop_f2b:
1596 case ir_unop_i2b:
1597 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1598 if (ctx->Const.UniformBooleanTrue == 1) {
1599 emit(AND(result_dst, result_src, src_reg(1)));
1600 }
1601 break;
1602
1603 case ir_unop_trunc:
1604 emit(RNDZ(result_dst, op[0]));
1605 break;
1606 case ir_unop_ceil:
1607 op[0].negate = !op[0].negate;
1608 inst = emit(RNDD(result_dst, op[0]));
1609 this->result.negate = true;
1610 break;
1611 case ir_unop_floor:
1612 inst = emit(RNDD(result_dst, op[0]));
1613 break;
1614 case ir_unop_fract:
1615 inst = emit(FRC(result_dst, op[0]));
1616 break;
1617 case ir_unop_round_even:
1618 emit(RNDE(result_dst, op[0]));
1619 break;
1620
1621 case ir_binop_min:
1622 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1623 break;
1624 case ir_binop_max:
1625 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1626 break;
1627
1628 case ir_binop_pow:
1629 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1630 break;
1631
1632 case ir_unop_bit_not:
1633 inst = emit(NOT(result_dst, op[0]));
1634 break;
1635 case ir_binop_bit_and:
1636 inst = emit(AND(result_dst, op[0], op[1]));
1637 break;
1638 case ir_binop_bit_xor:
1639 inst = emit(XOR(result_dst, op[0], op[1]));
1640 break;
1641 case ir_binop_bit_or:
1642 inst = emit(OR(result_dst, op[0], op[1]));
1643 break;
1644
1645 case ir_binop_lshift:
1646 inst = emit(SHL(result_dst, op[0], op[1]));
1647 break;
1648
1649 case ir_binop_rshift:
1650 if (ir->type->base_type == GLSL_TYPE_INT)
1651 inst = emit(ASR(result_dst, op[0], op[1]));
1652 else
1653 inst = emit(SHR(result_dst, op[0], op[1]));
1654 break;
1655
1656 case ir_binop_bfm:
1657 emit(BFI1(result_dst, op[0], op[1]));
1658 break;
1659
1660 case ir_binop_ubo_load: {
1661 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1662 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1663 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1664 src_reg offset;
1665
1666 /* Now, load the vector from that offset. */
1667 assert(ir->type->is_vector() || ir->type->is_scalar());
1668
1669 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1670 packed_consts.type = result.type;
1671 src_reg surf_index;
1672
1673 if (const_uniform_block) {
1674 /* The block index is a constant, so just emit the binding table entry
1675 * as an immediate.
1676 */
1677 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1678 const_uniform_block->value.u[0]);
1679 } else {
1680 /* The block index is not a constant. Evaluate the index expression
1681 * per-channel and add the base UBO index; the generator will select
1682 * a value from any live channel.
1683 */
1684 surf_index = src_reg(this, glsl_type::uint_type);
1685 emit(ADD(dst_reg(surf_index), op[0],
1686 src_reg(prog_data->base.binding_table.ubo_start)));
1687
1688 /* Assume this may touch any UBO. It would be nice to provide
1689 * a tighter bound, but the array information is already lowered away.
1690 */
1691 brw_mark_surface_used(&prog_data->base,
1692 prog_data->base.binding_table.ubo_start +
1693 shader_prog->NumUniformBlocks - 1);
1694 }
1695
1696 if (const_offset_ir) {
1697 if (brw->gen >= 8) {
1698 /* Store the offset in a GRF so we can send-from-GRF. */
1699 offset = src_reg(this, glsl_type::int_type);
1700 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1701 } else {
1702 /* Immediates are fine on older generations since they'll be moved
1703 * to a (potentially fake) MRF at the generator level.
1704 */
1705 offset = src_reg(const_offset / 16);
1706 }
1707 } else {
1708 offset = src_reg(this, glsl_type::uint_type);
1709 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1710 }
1711
1712 if (brw->gen >= 7) {
1713 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1714 grf_offset.type = offset.type;
1715
1716 emit(MOV(grf_offset, offset));
1717
1718 emit(new(mem_ctx) vec4_instruction(this,
1719 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1720 dst_reg(packed_consts),
1721 surf_index,
1722 src_reg(grf_offset)));
1723 } else {
1724 vec4_instruction *pull =
1725 emit(new(mem_ctx) vec4_instruction(this,
1726 VS_OPCODE_PULL_CONSTANT_LOAD,
1727 dst_reg(packed_consts),
1728 surf_index,
1729 offset));
1730 pull->base_mrf = 14;
1731 pull->mlen = 1;
1732 }
1733
1734 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1735 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1736 const_offset % 16 / 4,
1737 const_offset % 16 / 4,
1738 const_offset % 16 / 4);
1739
1740 /* UBO bools are any nonzero int. We need to convert them to use the
1741 * value of true stored in ctx->Const.UniformBooleanTrue.
1742 */
1743 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1744 emit(CMP(result_dst, packed_consts, src_reg(0u),
1745 BRW_CONDITIONAL_NZ));
1746 if (ctx->Const.UniformBooleanTrue == 1) {
1747 emit(AND(result_dst, result, src_reg(1)));
1748 }
1749 } else {
1750 emit(MOV(result_dst, packed_consts));
1751 }
1752 break;
1753 }
1754
1755 case ir_binop_vector_extract:
1756 unreachable("should have been lowered by vec_index_to_cond_assign");
1757
1758 case ir_triop_fma:
1759 op[0] = fix_3src_operand(op[0]);
1760 op[1] = fix_3src_operand(op[1]);
1761 op[2] = fix_3src_operand(op[2]);
1762 /* Note that the instruction's argument order is reversed from GLSL
1763 * and the IR.
1764 */
1765 emit(MAD(result_dst, op[2], op[1], op[0]));
1766 break;
1767
1768 case ir_triop_lrp:
1769 emit_lrp(result_dst, op[0], op[1], op[2]);
1770 break;
1771
1772 case ir_triop_csel:
1773 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1774 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1775 inst->predicate = BRW_PREDICATE_NORMAL;
1776 break;
1777
1778 case ir_triop_bfi:
1779 op[0] = fix_3src_operand(op[0]);
1780 op[1] = fix_3src_operand(op[1]);
1781 op[2] = fix_3src_operand(op[2]);
1782 emit(BFI2(result_dst, op[0], op[1], op[2]));
1783 break;
1784
1785 case ir_triop_bitfield_extract:
1786 op[0] = fix_3src_operand(op[0]);
1787 op[1] = fix_3src_operand(op[1]);
1788 op[2] = fix_3src_operand(op[2]);
1789 /* Note that the instruction's argument order is reversed from GLSL
1790 * and the IR.
1791 */
1792 emit(BFE(result_dst, op[2], op[1], op[0]));
1793 break;
1794
1795 case ir_triop_vector_insert:
1796 unreachable("should have been lowered by lower_vector_insert");
1797
1798 case ir_quadop_bitfield_insert:
1799 unreachable("not reached: should be handled by "
1800 "bitfield_insert_to_bfm_bfi\n");
1801
1802 case ir_quadop_vector:
1803 unreachable("not reached: should be handled by lower_quadop_vector");
1804
1805 case ir_unop_pack_half_2x16:
1806 emit_pack_half_2x16(result_dst, op[0]);
1807 break;
1808 case ir_unop_unpack_half_2x16:
1809 emit_unpack_half_2x16(result_dst, op[0]);
1810 break;
1811 case ir_unop_pack_snorm_2x16:
1812 case ir_unop_pack_snorm_4x8:
1813 case ir_unop_pack_unorm_2x16:
1814 case ir_unop_pack_unorm_4x8:
1815 case ir_unop_unpack_snorm_2x16:
1816 case ir_unop_unpack_snorm_4x8:
1817 case ir_unop_unpack_unorm_2x16:
1818 case ir_unop_unpack_unorm_4x8:
1819 unreachable("not reached: should be handled by lower_packing_builtins");
1820 case ir_unop_unpack_half_2x16_split_x:
1821 case ir_unop_unpack_half_2x16_split_y:
1822 case ir_binop_pack_half_2x16_split:
1823 case ir_unop_interpolate_at_centroid:
1824 case ir_binop_interpolate_at_sample:
1825 case ir_binop_interpolate_at_offset:
1826 unreachable("not reached: should not occur in vertex shader");
1827 case ir_binop_ldexp:
1828 unreachable("not reached: should be handled by ldexp_to_arith()");
1829 }
1830 }
1831
1832
1833 void
1834 vec4_visitor::visit(ir_swizzle *ir)
1835 {
1836 src_reg src;
1837 int i = 0;
1838 int swizzle[4];
1839
1840 /* Note that this is only swizzles in expressions, not those on the left
1841 * hand side of an assignment, which do write masking. See ir_assignment
1842 * for that.
1843 */
1844
1845 ir->val->accept(this);
1846 src = this->result;
1847 assert(src.file != BAD_FILE);
1848
1849 for (i = 0; i < ir->type->vector_elements; i++) {
1850 switch (i) {
1851 case 0:
1852 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1853 break;
1854 case 1:
1855 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1856 break;
1857 case 2:
1858 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1859 break;
1860 case 3:
1861 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1862 break;
1863 }
1864 }
1865 for (; i < 4; i++) {
1866 /* Replicate the last channel out. */
1867 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1868 }
1869
1870 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1871
1872 this->result = src;
1873 }
1874
1875 void
1876 vec4_visitor::visit(ir_dereference_variable *ir)
1877 {
1878 const struct glsl_type *type = ir->type;
1879 dst_reg *reg = variable_storage(ir->var);
1880
1881 if (!reg) {
1882 fail("Failed to find variable storage for %s\n", ir->var->name);
1883 this->result = src_reg(brw_null_reg());
1884 return;
1885 }
1886
1887 this->result = src_reg(*reg);
1888
1889 /* System values get their swizzle from the dst_reg writemask */
1890 if (ir->var->data.mode == ir_var_system_value)
1891 return;
1892
1893 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1894 this->result.swizzle = swizzle_for_size(type->vector_elements);
1895 }
1896
1897
1898 int
1899 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1900 {
1901 /* Under normal circumstances array elements are stored consecutively, so
1902 * the stride is equal to the size of the array element.
1903 */
1904 return type_size(ir->type);
1905 }
1906
1907
1908 void
1909 vec4_visitor::visit(ir_dereference_array *ir)
1910 {
1911 ir_constant *constant_index;
1912 src_reg src;
1913 int array_stride = compute_array_stride(ir);
1914
1915 constant_index = ir->array_index->constant_expression_value();
1916
1917 ir->array->accept(this);
1918 src = this->result;
1919
1920 if (constant_index) {
1921 src.reg_offset += constant_index->value.i[0] * array_stride;
1922 } else {
1923 /* Variable index array dereference. It eats the "vec4" of the
1924 * base of the array and an index that offsets the Mesa register
1925 * index.
1926 */
1927 ir->array_index->accept(this);
1928
1929 src_reg index_reg;
1930
1931 if (array_stride == 1) {
1932 index_reg = this->result;
1933 } else {
1934 index_reg = src_reg(this, glsl_type::int_type);
1935
1936 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1937 }
1938
1939 if (src.reladdr) {
1940 src_reg temp = src_reg(this, glsl_type::int_type);
1941
1942 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1943
1944 index_reg = temp;
1945 }
1946
1947 src.reladdr = ralloc(mem_ctx, src_reg);
1948 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1949 }
1950
1951 /* If the type is smaller than a vec4, replicate the last channel out. */
1952 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1953 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1954 else
1955 src.swizzle = BRW_SWIZZLE_NOOP;
1956 src.type = brw_type_for_base_type(ir->type);
1957
1958 this->result = src;
1959 }
1960
1961 void
1962 vec4_visitor::visit(ir_dereference_record *ir)
1963 {
1964 unsigned int i;
1965 const glsl_type *struct_type = ir->record->type;
1966 int offset = 0;
1967
1968 ir->record->accept(this);
1969
1970 for (i = 0; i < struct_type->length; i++) {
1971 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1972 break;
1973 offset += type_size(struct_type->fields.structure[i].type);
1974 }
1975
1976 /* If the type is smaller than a vec4, replicate the last channel out. */
1977 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1978 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1979 else
1980 this->result.swizzle = BRW_SWIZZLE_NOOP;
1981 this->result.type = brw_type_for_base_type(ir->type);
1982
1983 this->result.reg_offset += offset;
1984 }
1985
1986 /**
1987 * We want to be careful in assignment setup to hit the actual storage
1988 * instead of potentially using a temporary like we might with the
1989 * ir_dereference handler.
1990 */
1991 static dst_reg
1992 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1993 {
1994 /* The LHS must be a dereference. If the LHS is a variable indexed array
1995 * access of a vector, it must be separated into a series of conditional moves
1996 * before reaching this point (see ir_vec_index_to_cond_assign).
1997 */
1998 assert(ir->as_dereference());
1999 ir_dereference_array *deref_array = ir->as_dereference_array();
2000 if (deref_array) {
2001 assert(!deref_array->array->type->is_vector());
2002 }
2003
2004 /* Use the rvalue deref handler for the most part. We'll ignore
2005 * swizzles in it and write swizzles using writemask, though.
2006 */
2007 ir->accept(v);
2008 return dst_reg(v->result);
2009 }
2010
2011 void
2012 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2013 const struct glsl_type *type,
2014 enum brw_predicate predicate)
2015 {
2016 if (type->base_type == GLSL_TYPE_STRUCT) {
2017 for (unsigned int i = 0; i < type->length; i++) {
2018 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2019 }
2020 return;
2021 }
2022
2023 if (type->is_array()) {
2024 for (unsigned int i = 0; i < type->length; i++) {
2025 emit_block_move(dst, src, type->fields.array, predicate);
2026 }
2027 return;
2028 }
2029
2030 if (type->is_matrix()) {
2031 const struct glsl_type *vec_type;
2032
2033 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2034 type->vector_elements, 1);
2035
2036 for (int i = 0; i < type->matrix_columns; i++) {
2037 emit_block_move(dst, src, vec_type, predicate);
2038 }
2039 return;
2040 }
2041
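/* Illustrative walk-through (not in the original source): moving a
 * mat3 recurses into three vec3 moves; each one below emits a MOV
 * with writemask .xyz and swizzle xyzz (predicated when the
 * assignment has a condition), then advances both reg_offset fields
 * by one vec4 register.
 */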
2042 assert(type->is_scalar() || type->is_vector());
2043
2044 dst->type = brw_type_for_base_type(type);
2045 src->type = dst->type;
2046
2047 dst->writemask = (1 << type->vector_elements) - 1;
2048
2049 src->swizzle = swizzle_for_size(type->vector_elements);
2050
2051 vec4_instruction *inst = emit(MOV(*dst, *src));
2052 inst->predicate = predicate;
2053
2054 dst->reg_offset++;
2055 src->reg_offset++;
2056 }
2057
2058
2059 /* If the RHS processing resulted in an instruction generating a
2060 * temporary value, and it would be easy to rewrite the instruction to
2061 * generate its result right into the LHS instead, do so. This ends
2062 * up reliably removing instructions where it can be tricky to do so
2063 * later without real UD chain information.
2064 */
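/* Hypothetical example (not in the original source): for "x = a + b"
 * the RHS visit emits "ADD tmp, a, b" and the assignment would then
 * add "MOV x, tmp".  Retargeting the ADD's destination to x drops the
 * MOV without needing use/def chain information.
 */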
2065 bool
2066 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2067 dst_reg dst,
2068 src_reg src,
2069 vec4_instruction *pre_rhs_inst,
2070 vec4_instruction *last_rhs_inst)
2071 {
2072 /* This could be supported, but it would take more smarts. */
2073 if (ir->condition)
2074 return false;
2075
2076 if (pre_rhs_inst == last_rhs_inst)
2077 return false; /* No instructions generated to work with. */
2078
2079 /* Make sure the last instruction generated our source reg. */
2080 if (src.file != GRF ||
2081 src.file != last_rhs_inst->dst.file ||
2082 src.reg != last_rhs_inst->dst.reg ||
2083 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2084 src.reladdr ||
2085 src.abs ||
2086 src.negate ||
2087 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2088 return false;
2089
2090 /* Check that the last instruction fully initialized the channels
2091 * we want to use, in the order we want to use them. We could
2092 * potentially reswizzle the operands of many instructions so that
2093 * we could handle out-of-order channels, but we don't do that yet.
2094 */
2095
2096 for (unsigned i = 0; i < 4; i++) {
2097 if (dst.writemask & (1 << i)) {
2098 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2099 return false;
2100
2101 if (BRW_GET_SWZ(src.swizzle, i) != i)
2102 return false;
2103 }
2104 }
2105
2106 /* Success! Rewrite the instruction. */
2107 last_rhs_inst->dst.file = dst.file;
2108 last_rhs_inst->dst.reg = dst.reg;
2109 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2110 last_rhs_inst->dst.reladdr = dst.reladdr;
2111 last_rhs_inst->dst.writemask &= dst.writemask;
2112
2113 return true;
2114 }
2115
2116 void
2117 vec4_visitor::visit(ir_assignment *ir)
2118 {
2119 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2120 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2121
2122 if (!ir->lhs->type->is_scalar() &&
2123 !ir->lhs->type->is_vector()) {
2124 ir->rhs->accept(this);
2125 src_reg src = this->result;
2126
2127 if (ir->condition) {
2128 emit_bool_to_cond_code(ir->condition, &predicate);
2129 }
2130
2131 /* emit_block_move doesn't account for swizzles in the source register.
2132 * This should be ok, since the source register is a structure or an
2133 * array, and those can't be swizzled. But double-check to be sure.
2134 */
2135 assert(src.swizzle ==
2136 (ir->rhs->type->is_matrix()
2137 ? swizzle_for_size(ir->rhs->type->vector_elements)
2138 : BRW_SWIZZLE_NOOP));
2139
2140 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2141 return;
2142 }
2143
2144 /* Now we're down to just a scalar/vector with writemasks. */
2145 int i;
2146
2147 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2148 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2149
2150 ir->rhs->accept(this);
2151
2152 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2153
2154 src_reg src = this->result;
2155
2156 int swizzles[4];
2157 int first_enabled_chan = 0;
2158 int src_chan = 0;
2159
2160 assert(ir->lhs->type->is_vector() ||
2161 ir->lhs->type->is_scalar());
2162 dst.writemask = ir->write_mask;
2163
2164 for (int i = 0; i < 4; i++) {
2165 if (dst.writemask & (1 << i)) {
2166 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2167 break;
2168 }
2169 }
2170
2171 /* Swizzle a small RHS vector into the channels being written.
2172 *
2173 * glsl ir treats write_mask as dictating how many channels are
2174 * present on the RHS while in our instructions we need to make
2175 * those channels appear in the slots of the vec4 they're written to.
2176 */
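/* Worked example (illustrative, not in the original source): for
 * "v.zw = foo.xy" the write_mask is zw and the RHS arrives with
 * swizzle xyyy, so the loop below builds swizzles[] = {y, y, x, y};
 * the final MOV with a .zw writemask then lands foo.x in v.z and
 * foo.y in v.w.
 */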
2177 for (int i = 0; i < 4; i++) {
2178 if (dst.writemask & (1 << i))
2179 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2180 else
2181 swizzles[i] = first_enabled_chan;
2182 }
2183 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2184 swizzles[2], swizzles[3]);
2185
2186 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2187 return;
2188 }
2189
2190 if (ir->condition) {
2191 emit_bool_to_cond_code(ir->condition, &predicate);
2192 }
2193
2194 for (i = 0; i < type_size(ir->lhs->type); i++) {
2195 vec4_instruction *inst = emit(MOV(dst, src));
2196 inst->predicate = predicate;
2197
2198 dst.reg_offset++;
2199 src.reg_offset++;
2200 }
2201 }
2202
2203 void
2204 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2205 {
2206 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2207 foreach_in_list(ir_constant, field_value, &ir->components) {
2208 emit_constant_values(dst, field_value);
2209 }
2210 return;
2211 }
2212
2213 if (ir->type->is_array()) {
2214 for (unsigned int i = 0; i < ir->type->length; i++) {
2215 emit_constant_values(dst, ir->array_elements[i]);
2216 }
2217 return;
2218 }
2219
2220 if (ir->type->is_matrix()) {
2221 for (int i = 0; i < ir->type->matrix_columns; i++) {
2222 float *vec = &ir->value.f[i * ir->type->vector_elements];
2223
2224 for (int j = 0; j < ir->type->vector_elements; j++) {
2225 dst->writemask = 1 << j;
2226 dst->type = BRW_REGISTER_TYPE_F;
2227
2228 emit(MOV(*dst, src_reg(vec[j])));
2229 }
2230 dst->reg_offset++;
2231 }
2232 return;
2233 }
2234
2235 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2236
2237 for (int i = 0; i < ir->type->vector_elements; i++) {
2238 if (!(remaining_writemask & (1 << i)))
2239 continue;
2240
2241 dst->writemask = 1 << i;
2242 dst->type = brw_type_for_base_type(ir->type);
2243
2244 /* Find other components that match the one we're about to
2245 * write. Emits fewer instructions for things like vec4(0.5,
2246 * 1.5, 1.5, 1.5).
2247 */
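/* E.g. (illustrative) vec4(0.5, 1.5, 1.5, 1.5) becomes two MOVs: one
 * with writemask .x for 0.5 and one with writemask .yzw for 1.5.
 */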
2248 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2249 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2250 if (ir->value.b[i] == ir->value.b[j])
2251 dst->writemask |= (1 << j);
2252 } else {
2253 /* u, i, and f storage all line up, so no need for a
2254 * switch case for comparing each type.
2255 */
2256 if (ir->value.u[i] == ir->value.u[j])
2257 dst->writemask |= (1 << j);
2258 }
2259 }
2260
2261 switch (ir->type->base_type) {
2262 case GLSL_TYPE_FLOAT:
2263 emit(MOV(*dst, src_reg(ir->value.f[i])));
2264 break;
2265 case GLSL_TYPE_INT:
2266 emit(MOV(*dst, src_reg(ir->value.i[i])));
2267 break;
2268 case GLSL_TYPE_UINT:
2269 emit(MOV(*dst, src_reg(ir->value.u[i])));
2270 break;
2271 case GLSL_TYPE_BOOL:
2272 emit(MOV(*dst,
2273 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2274 : 0)));
2275 break;
2276 default:
2277 unreachable("Non-float/uint/int/bool constant");
2278 }
2279
2280 remaining_writemask &= ~dst->writemask;
2281 }
2282 dst->reg_offset++;
2283 }
2284
2285 void
2286 vec4_visitor::visit(ir_constant *ir)
2287 {
2288 dst_reg dst = dst_reg(this, ir->type);
2289 this->result = src_reg(dst);
2290
2291 emit_constant_values(&dst, ir);
2292 }
2293
2294 void
2295 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2296 {
2297 ir_dereference *deref = static_cast<ir_dereference *>(
2298 ir->actual_parameters.get_head());
2299 ir_variable *location = deref->variable_referenced();
2300 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2301 location->data.binding);
2302
2303 /* Calculate the surface offset */
2304 src_reg offset(this, glsl_type::uint_type);
2305 ir_dereference_array *deref_array = deref->as_dereference_array();
2306 if (deref_array) {
2307 deref_array->array_index->accept(this);
2308
2309 src_reg tmp(this, glsl_type::uint_type);
2310 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2311 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2312 } else {
2313 offset = location->data.atomic.offset;
2314 }
2315
2316 /* Emit the appropriate machine instruction */
2317 const char *callee = ir->callee->function_name();
2318 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2319
2320 if (!strcmp("__intrinsic_atomic_read", callee)) {
2321 emit_untyped_surface_read(surf_index, dst, offset);
2322
2323 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2324 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2325 src_reg(), src_reg());
2326
2327 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2328 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2329 src_reg(), src_reg());
2330 }
2331 }
2332
2333 void
2334 vec4_visitor::visit(ir_call *ir)
2335 {
2336 const char *callee = ir->callee->function_name();
2337
2338 if (!strcmp("__intrinsic_atomic_read", callee) ||
2339 !strcmp("__intrinsic_atomic_increment", callee) ||
2340 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2341 visit_atomic_counter_intrinsic(ir);
2342 } else {
2343 unreachable("Unsupported intrinsic.");
2344 }
2345 }
2346
2347 src_reg
2348 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2349 {
2350 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2351 inst->base_mrf = 2;
2352 inst->mlen = 1;
2353 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2354 inst->dst.writemask = WRITEMASK_XYZW;
2355
2356 inst->src[1] = sampler;
2357
2358 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2359 int param_base = inst->base_mrf;
2360 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2361 int zero_mask = 0xf & ~coord_mask;
2362
2363 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2364 coordinate));
2365
2366 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2367 src_reg(0)));
2368
2369 emit(inst);
2370 return src_reg(inst->dst);
2371 }
2372
2373 static bool
2374 is_high_sampler(struct brw_context *brw, src_reg sampler)
2375 {
2376 if (brw->gen < 8 && !brw->is_haswell)
2377 return false;
2378
2379 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2380 }
2381
2382 void
2383 vec4_visitor::visit(ir_texture *ir)
2384 {
2385 uint32_t sampler =
2386 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2387
2388 ir_rvalue *nonconst_sampler_index =
2389 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2390
2391 /* Handle non-constant sampler array indexing */
2392 src_reg sampler_reg;
2393 if (nonconst_sampler_index) {
2394 /* The highest sampler which may be used by this operation is
2395 * the last element of the array. Mark it here, because the generator
2396 * doesn't have enough information to determine the bound.
2397 */
2398 uint32_t array_size = ir->sampler->as_dereference_array()
2399 ->array->type->array_size();
2400
2401 uint32_t max_used = sampler + array_size - 1;
2402 if (ir->op == ir_tg4 && brw->gen < 8) {
2403 max_used += prog_data->base.binding_table.gather_texture_start;
2404 } else {
2405 max_used += prog_data->base.binding_table.texture_start;
2406 }
2407
2408 brw_mark_surface_used(&prog_data->base, max_used);
2409
2410 /* Emit code to evaluate the actual indexing expression */
2411 nonconst_sampler_index->accept(this);
2412 dst_reg temp(this, glsl_type::uint_type);
2413 emit(ADD(temp, this->result, src_reg(sampler)))
2414 ->force_writemask_all = true;
2415 sampler_reg = src_reg(temp);
2416 } else {
2417 /* Single sampler, or constant array index; the indexing expression
2418 * is just an immediate.
2419 */
2420 sampler_reg = src_reg(sampler);
2421 }
2422
2423 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2424 * emitting anything other than setting up the constant result.
2425 */
2426 if (ir->op == ir_tg4) {
2427 ir_constant *chan = ir->lod_info.component->as_constant();
2428 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2429 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2430 dst_reg result(this, ir->type);
2431 this->result = src_reg(result);
2432 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2433 return;
2434 }
2435 }
2436
2437 /* Should be lowered by do_lower_texture_projection */
2438 assert(!ir->projector);
2439
2440 /* Should be lowered */
2441 assert(!ir->offset || !ir->offset->type->is_array());
2442
2443 /* Generate code to compute all the subexpression trees. This has to be
2444 * done before loading any values into MRFs for the sampler message since
2445 * generating these values may involve SEND messages that need the MRFs.
2446 */
2447 src_reg coordinate;
2448 if (ir->coordinate) {
2449 ir->coordinate->accept(this);
2450 coordinate = this->result;
2451 }
2452
2453 src_reg shadow_comparitor;
2454 if (ir->shadow_comparitor) {
2455 ir->shadow_comparitor->accept(this);
2456 shadow_comparitor = this->result;
2457 }
2458
2459 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2460 src_reg offset_value;
2461 if (has_nonconstant_offset) {
2462 ir->offset->accept(this);
2463 offset_value = src_reg(this->result);
2464 }
2465
2466 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2467 src_reg lod, dPdx, dPdy, sample_index, mcs;
2468 switch (ir->op) {
2469 case ir_tex:
2470 lod = src_reg(0.0f);
2471 lod_type = glsl_type::float_type;
2472 break;
2473 case ir_txf:
2474 case ir_txl:
2475 case ir_txs:
2476 ir->lod_info.lod->accept(this);
2477 lod = this->result;
2478 lod_type = ir->lod_info.lod->type;
2479 break;
2480 case ir_query_levels:
2481 lod = src_reg(0);
2482 lod_type = glsl_type::int_type;
2483 break;
2484 case ir_txf_ms:
2485 ir->lod_info.sample_index->accept(this);
2486 sample_index = this->result;
2487 sample_index_type = ir->lod_info.sample_index->type;
2488
2489 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2490 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2491 else
2492 mcs = src_reg(0u);
2493 break;
2494 case ir_txd:
2495 ir->lod_info.grad.dPdx->accept(this);
2496 dPdx = this->result;
2497
2498 ir->lod_info.grad.dPdy->accept(this);
2499 dPdy = this->result;
2500
2501 lod_type = ir->lod_info.grad.dPdx->type;
2502 break;
2503 case ir_txb:
2504 case ir_lod:
2505 case ir_tg4:
2506 break;
2507 }
2508
2509 enum opcode opcode;
2510 switch (ir->op) {
2511 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2512 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2513 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2514 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2515 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2516 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2517 case ir_tg4: opcode = has_nonconstant_offset
2518 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2519 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2520 case ir_txb:
2521 unreachable("TXB is not valid for vertex shaders.");
2522 case ir_lod:
2523 unreachable("LOD is not valid for vertex shaders.");
2524 default:
2525 unreachable("Unrecognized tex op");
2526 }
2527
2528 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2529
2530 if (ir->offset != NULL && ir->op != ir_txf)
2531 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2532
2533 /* Stuff the channel select bits in the top of the texture offset */
2534 if (ir->op == ir_tg4)
2535 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2536
2537 /* The message header is necessary for:
2538 * - Gen4 (always)
2539 * - Texel offsets
2540 * - Gather channel selection
2541 * - Sampler indices too large to fit in a 4-bit value.
2542 */
2543 inst->header_present =
2544 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2545 is_high_sampler(brw, sampler_reg);
2546 inst->base_mrf = 2;
2547 inst->mlen = inst->header_present + 1; /* always at least one */
2548 inst->dst = dst_reg(this, ir->type);
2549 inst->dst.writemask = WRITEMASK_XYZW;
2550 inst->shadow_compare = ir->shadow_comparitor != NULL;
2551
2552 inst->src[1] = sampler_reg;
2553
2554 /* MRF for the first parameter */
2555 int param_base = inst->base_mrf + inst->header_present;
2556
2557 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2558 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2559 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2560 } else {
2561 /* Load the coordinate */
2562 /* FINISHME: gl_clamp_mask and saturate */
2563 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2564 int zero_mask = 0xf & ~coord_mask;
2565
2566 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2567 coordinate));
2568
2569 if (zero_mask != 0) {
2570 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2571 src_reg(0)));
2572 }
2573 /* Load the shadow comparitor */
2574 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2575 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2576 WRITEMASK_X),
2577 shadow_comparitor));
2578 inst->mlen++;
2579 }
2580
2581 /* Load the LOD info */
2582 if (ir->op == ir_tex || ir->op == ir_txl) {
2583 int mrf, writemask;
2584 if (brw->gen >= 5) {
2585 mrf = param_base + 1;
2586 if (ir->shadow_comparitor) {
2587 writemask = WRITEMASK_Y;
2588 /* mlen already incremented */
2589 } else {
2590 writemask = WRITEMASK_X;
2591 inst->mlen++;
2592 }
2593 } else /* brw->gen == 4 */ {
2594 mrf = param_base;
2595 writemask = WRITEMASK_W;
2596 }
2597 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2598 } else if (ir->op == ir_txf) {
2599 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2600 } else if (ir->op == ir_txf_ms) {
2601 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2602 sample_index));
2603 if (brw->gen >= 7) {
2604 /* MCS data is in the first channel of `mcs`, but we need to get it into
2605 * the .y channel of the second vec4 of params, so replicate .x across
2606 * the whole vec4 and then mask off everything except .y
2607 */
2608 mcs.swizzle = BRW_SWIZZLE_XXXX;
2609 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2610 mcs));
}
2611 inst->mlen++;
2612 } else if (ir->op == ir_txd) {
2613 const glsl_type *type = lod_type;
2614
2615 if (brw->gen >= 5) {
2616 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2617 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2618 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2619 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2620 inst->mlen++;
2621
2622 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2623 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2624 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2625 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2626 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2627 inst->mlen++;
2628
2629 if (ir->shadow_comparitor) {
2630 emit(MOV(dst_reg(MRF, param_base + 2,
2631 ir->shadow_comparitor->type, WRITEMASK_Z),
2632 shadow_comparitor));
2633 }
2634 }
2635 } else /* brw->gen == 4 */ {
2636 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2637 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2638 inst->mlen += 2;
2639 }
2640 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2641 if (ir->shadow_comparitor) {
2642 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2643 shadow_comparitor));
2644 }
2645
2646 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2647 offset_value));
2648 inst->mlen++;
2649 }
2650 }
2651
2652 emit(inst);
2653
2654 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2655 * faces * layers, but the spec requires just layers.
2656 */
2657 if (ir->op == ir_txs) {
2658 glsl_type const *type = ir->sampler->type;
2659 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2660 type->sampler_array) {
2661 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2662 writemask(inst->dst, WRITEMASK_Z),
2663 src_reg(inst->dst), src_reg(6));
2664 }
2665 }
2666
2667 if (brw->gen == 6 && ir->op == ir_tg4) {
2668 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2669 }
2670
2671 swizzle_result(ir, src_reg(inst->dst), sampler);
2672 }
2673
2674 /**
2675 * Apply workarounds for Gen6 gather with UINT/SINT
2676 */
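/* Illustrative example (assumed behavior, not from the original
 * source): for an 8-bit signed format the gathered texel comes back
 * as UNORM, so a stored -1 (0xff) reads as 1.0.  The MUL by 255
 * recovers 255, and the SHL/ASR pair by (32 - 8) sign-extends it back
 * to -1.
 */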
2677 void
2678 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2679 {
2680 if (!wa)
2681 return;
2682
2683 int width = (wa & WA_8BIT) ? 8 : 16;
2684 dst_reg dst_f = dst;
2685 dst_f.type = BRW_REGISTER_TYPE_F;
2686
2687 /* Convert from UNORM to UINT */
2688 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2689 emit(MOV(dst, src_reg(dst_f)));
2690
2691 if (wa & WA_SIGN) {
2692 /* Reinterpret the UINT value as a signed INT value by
2693 * shifting the sign bit into place, then shifting back
2694 * preserving sign.
2695 */
2696 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2697 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2698 }
2699 }
2700
2701 /**
2702 * Set up the gather channel based on the swizzle, for gather4.
2703 */
2704 uint32_t
2705 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2706 {
2707 ir_constant *chan = ir->lod_info.component->as_constant();
2708 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2709 switch (swiz) {
2710 case SWIZZLE_X: return 0;
2711 case SWIZZLE_Y:
2712 /* gather4 sampler is broken for green channel on RG32F --
2713 * we must ask for blue instead.
2714 */
2715 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2716 return 2;
2717 return 1;
2718 case SWIZZLE_Z: return 2;
2719 case SWIZZLE_W: return 3;
2720 default:
2721 unreachable("Not reached"); /* zero, one swizzles handled already */
2722 }
2723 }
2724
2725 void
2726 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2727 {
2728 int s = key->tex.swizzles[sampler];
2729
2730 this->result = src_reg(this, ir->type);
2731 dst_reg swizzled_result(this->result);
2732
2733 if (ir->op == ir_query_levels) {
2734 /* # levels is in .w */
2735 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2736 emit(MOV(swizzled_result, orig_val));
2737 return;
2738 }
2739
2740 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2741 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2742 emit(MOV(swizzled_result, orig_val));
2743 return;
2744 }
2745
2746
2747 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2748 int swizzle[4] = {0};
2749
2750 for (int i = 0; i < 4; i++) {
2751 switch (GET_SWZ(s, i)) {
2752 case SWIZZLE_ZERO:
2753 zero_mask |= (1 << i);
2754 break;
2755 case SWIZZLE_ONE:
2756 one_mask |= (1 << i);
2757 break;
2758 default:
2759 copy_mask |= (1 << i);
2760 swizzle[i] = GET_SWZ(s, i);
2761 break;
2762 }
2763 }
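/* Illustrative example: a texture swizzle of (R, G, ZERO, ONE) yields
 * copy_mask = .xy (with source swizzle xyxx), zero_mask = .z and
 * one_mask = .w, so the code below emits three MOVs.
 */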
2764
2765 if (copy_mask) {
2766 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2767 swizzled_result.writemask = copy_mask;
2768 emit(MOV(swizzled_result, orig_val));
2769 }
2770
2771 if (zero_mask) {
2772 swizzled_result.writemask = zero_mask;
2773 emit(MOV(swizzled_result, src_reg(0.0f)));
2774 }
2775
2776 if (one_mask) {
2777 swizzled_result.writemask = one_mask;
2778 emit(MOV(swizzled_result, src_reg(1.0f)));
2779 }
2780 }
2781
2782 void
2783 vec4_visitor::visit(ir_return *)
2784 {
2785 unreachable("not reached");
2786 }
2787
2788 void
2789 vec4_visitor::visit(ir_discard *)
2790 {
2791 unreachable("not reached");
2792 }
2793
2794 void
2795 vec4_visitor::visit(ir_if *ir)
2796 {
2797 /* Don't point the annotation at the if statement, because then it plus
2798 * the then and else blocks get printed.
2799 */
2800 this->base_ir = ir->condition;
2801
2802 if (brw->gen == 6) {
2803 emit_if_gen6(ir);
2804 } else {
2805 enum brw_predicate predicate;
2806 emit_bool_to_cond_code(ir->condition, &predicate);
2807 emit(IF(predicate));
2808 }
2809
2810 visit_instructions(&ir->then_instructions);
2811
2812 if (!ir->else_instructions.is_empty()) {
2813 this->base_ir = ir->condition;
2814 emit(BRW_OPCODE_ELSE);
2815
2816 visit_instructions(&ir->else_instructions);
2817 }
2818
2819 this->base_ir = ir->condition;
2820 emit(BRW_OPCODE_ENDIF);
2821 }
2822
2823 void
2824 vec4_visitor::visit(ir_emit_vertex *)
2825 {
2826 unreachable("not reached");
2827 }
2828
2829 void
2830 vec4_visitor::visit(ir_end_primitive *)
2831 {
2832 unreachable("not reached");
2833 }
2834
2835 void
2836 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2837 dst_reg dst, src_reg offset,
2838 src_reg src0, src_reg src1)
2839 {
2840 unsigned mlen = 0;
2841
2842 /* Set the atomic operation offset. */
2843 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2844 mlen++;
2845
2846 /* Set the atomic operation arguments. */
2847 if (src0.file != BAD_FILE) {
2848 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2849 mlen++;
2850 }
2851
2852 if (src1.file != BAD_FILE) {
2853 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2854 mlen++;
2855 }
2856
2857 /* Emit the instruction. Note that this maps to the normal SIMD8
2858 * untyped atomic message on Ivy Bridge, but that's OK because
2859 * unused channels will be masked out.
2860 */
2861 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2862 src_reg(atomic_op), src_reg(surf_index));
2863 inst->base_mrf = 0;
2864 inst->mlen = mlen;
2865 }
2866
2867 void
2868 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2869 src_reg offset)
2870 {
2871 /* Set the surface read offset. */
2872 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2873
2874 /* Emit the instruction. Note that this maps to the normal SIMD8
2875 * untyped surface read message, but that's OK because unused
2876 * channels will be masked out.
2877 */
2878 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2879 dst, src_reg(surf_index));
2880 inst->base_mrf = 0;
2881 inst->mlen = 1;
2882 }
2883
2884 void
2885 vec4_visitor::emit_ndc_computation()
2886 {
2887 /* Get the position */
2888 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2889
2890 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2891 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2892 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2893
2894 current_annotation = "NDC";
2895 dst_reg ndc_w = ndc;
2896 ndc_w.writemask = WRITEMASK_W;
2897 src_reg pos_w = pos;
2898 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2899 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2900
2901 dst_reg ndc_xyz = ndc;
2902 ndc_xyz.writemask = WRITEMASK_XYZ;
2903
2904 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2905 }
2906
2907 void
2908 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2909 {
2910 if (brw->gen < 6 &&
2911 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2912 key->userclip_active || brw->has_negative_rhw_bug)) {
2913 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2914 dst_reg header1_w = header1;
2915 header1_w.writemask = WRITEMASK_W;
2916
2917 emit(MOV(header1, 0u));
2918
2919 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2920 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2921
2922 current_annotation = "Point size";
2923 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2924 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2925 }
2926
2927 if (key->userclip_active) {
2928 current_annotation = "Clipping flags";
2929 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2930 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2931
2932 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2933 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2934 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2935
2936 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2937 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2938 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2939 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2940 }
2941
2942 /* i965 clipping workaround:
2943 * 1) Test for -ve rhw
2944 * 2) If set,
2945 * set ndc = (0,0,0,0)
2946 * set ucp[6] = 1
2947 *
2948 * Later, clipping will detect ucp[6] and ensure the primitive is
2949 * clipped against all fixed planes.
2950 */
2951 if (brw->has_negative_rhw_bug) {
2952 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2953 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2954 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2955 vec4_instruction *inst;
2956 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2957 inst->predicate = BRW_PREDICATE_NORMAL;
2958 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2959 inst->predicate = BRW_PREDICATE_NORMAL;
2960 }
2961
2962 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2963 } else if (brw->gen < 6) {
2964 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2965 } else {
2966 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2967 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2968 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2969 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2970 }
2971 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2972 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2973 src_reg(output_reg[VARYING_SLOT_LAYER])));
2974 }
2975 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2976 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2977 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2978 }
2979 }
2980 }
2981
2982 void
2983 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2984 {
2985 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2986 *
2987 * "If a linked set of shaders forming the vertex stage contains no
2988 * static write to gl_ClipVertex or gl_ClipDistance, but the
2989 * application has requested clipping against user clip planes through
2990 * the API, then the coordinate written to gl_Position is used for
2991 * comparison against the user clip planes."
2992 *
2993 * This function is only called if the shader didn't write to
2994 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2995 * if the user wrote to it; otherwise we use gl_Position.
2996 */
2997 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2998 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2999 clip_vertex = VARYING_SLOT_POS;
3000 }
3001
3002 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3003 ++i) {
3004 reg.writemask = 1 << i;
3005 emit(DP4(reg,
3006 src_reg(output_reg[clip_vertex]),
3007 src_reg(this->userplane[i + offset])));
3008 }
3009 }
3010
3011 void
3012 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3013 {
3014 assert (varying < VARYING_SLOT_MAX);
3015 reg.type = output_reg[varying].type;
3016 current_annotation = output_reg_annotation[varying];
3017 /* Copy the register, saturating if necessary */
3018 vec4_instruction *inst = emit(MOV(reg,
3019 src_reg(output_reg[varying])));
3020 if ((varying == VARYING_SLOT_COL0 ||
3021 varying == VARYING_SLOT_COL1 ||
3022 varying == VARYING_SLOT_BFC0 ||
3023 varying == VARYING_SLOT_BFC1) &&
3024 key->clamp_vertex_color) {
3025 inst->saturate = true;
3026 }
3027 }
3028
3029 void
3030 vec4_visitor::emit_urb_slot(int mrf, int varying)
3031 {
3032 struct brw_reg hw_reg = brw_message_reg(mrf);
3033 dst_reg reg = dst_reg(MRF, mrf);
3034 reg.type = BRW_REGISTER_TYPE_F;
3035
3036 switch (varying) {
3037 case VARYING_SLOT_PSIZ:
3038 /* PSIZ is always in slot 0, and is coupled with other flags. */
3039 current_annotation = "indices, point width, clip flags";
3040 emit_psiz_and_flags(hw_reg);
3041 break;
3042 case BRW_VARYING_SLOT_NDC:
3043 current_annotation = "NDC";
3044 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3045 break;
3046 case VARYING_SLOT_POS:
3047 current_annotation = "gl_Position";
3048 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3049 break;
3050 case VARYING_SLOT_EDGE:
3051 /* This is present when doing unfilled polygons. We're supposed to copy
3052 * the edge flag from the user-provided vertex array
3053 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3054 * of that attribute (starts as 1.0f). This is then used in clipping to
3055 * determine which edges should be drawn as wireframe.
3056 */
3057 current_annotation = "edge flag";
3058 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3059 glsl_type::float_type, WRITEMASK_XYZW))));
3060 break;
3061 case BRW_VARYING_SLOT_PAD:
3062 /* No need to write to this slot */
3063 break;
3064 default:
3065 emit_generic_urb_slot(reg, varying);
3066 break;
3067 }
3068 }
3069
3070 static int
3071 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3072 {
3073 if (brw->gen >= 6) {
3074 /* URB data written (does not include the message header reg) must
3075 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3076 * section 5.4.3.2.2: URB_INTERLEAVED.
3077 *
3078 * URB entries are allocated on a multiple of 1024 bits, so an
3079 * extra 128 bits written here to make the end align to 256 is
3080 * no problem.
3081 */
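/* Illustrative example: 1 header MRF plus 3 payload MRFs (mlen == 4)
 * is only 1.5 rows of payload, so mlen is bumped to 5 to round the
 * payload up to a full 2 rows (256-bit aligned).
 */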
3082 if ((mlen % 2) != 1)
3083 mlen++;
3084 }
3085
3086 return mlen;
3087 }
3088
3089
3090 /**
3091 * Generates the VUE payload plus the necessary URB write instructions to
3092 * output it.
3093 *
3094 * The VUE layout is documented in Volume 2a.
3095 */
3096 void
3097 vec4_visitor::emit_vertex()
3098 {
3099 /* MRF 0 is reserved for the debugger, so start with message header
3100 * in MRF 1.
3101 */
3102 int base_mrf = 1;
3103 int mrf = base_mrf;
3104 /* In the process of generating our URB write message contents, we
3105 * may need to unspill a register or load from an array. Those
3106 * reads would use MRFs 14-15.
3107 */
3108 int max_usable_mrf = 13;
3109
3110 /* The following assertion verifies that max_usable_mrf causes an
3111 * even-numbered amount of URB write data, which will meet gen6's
3112 * requirements for length alignment.
3113 */
3114 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3115
3116 /* First mrf is the g0-based message header containing URB handles and
3117 * such.
3118 */
3119 emit_urb_write_header(mrf++);
3120
3121 if (brw->gen < 6) {
3122 emit_ndc_computation();
3123 }
3124
3125 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3126 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3127 current_annotation = "user clip distances";
3128
3129 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3130 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3131
3132 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3133 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3134 }
3135
3136 /* We may need to split this up into several URB writes, so do them in a
3137 * loop.
3138 */
3139 int slot = 0;
3140 bool complete = false;
3141 do {
3142 /* URB offset is in URB row increments, and each of our MRFs is half of
3143 * one of those, since we're doing interleaved writes.
3144 */
3145 int offset = slot / 2;
3146
3147 mrf = base_mrf + 1;
3148 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3149 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3150
3151 /* If this was max_usable_mrf, we can't fit anything more into this
3152 * URB WRITE.
3153 */
3154 if (mrf > max_usable_mrf) {
3155 slot++;
3156 break;
3157 }
3158 }
3159
3160 complete = slot >= prog_data->vue_map.num_slots;
3161 current_annotation = "URB write";
3162 vec4_instruction *inst = emit_urb_write_opcode(complete);
3163 inst->base_mrf = base_mrf;
3164 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3165 inst->offset += offset;
3166 } while(!complete);
3167 }
3168
3169
3170 src_reg
3171 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3172 src_reg *reladdr, int reg_offset)
3173 {
3174 /* Because we store the values to scratch interleaved like our
3175 * vertex data, we need to scale the vec4 index by 2.
3176 */
3177 int message_header_scale = 2;
3178
3179 /* Pre-gen6, the message header uses byte offsets instead of vec4
3180 * (16-byte) offset units.
3181 */
3182 if (brw->gen < 6)
3183 message_header_scale *= 16;
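/* Illustrative values: a constant reg_offset of 3 becomes the
 * immediate 6 on gen6+ (interleaved vec4 rows) and 96 on older parts,
 * where the header expects a byte offset (3 * 2 * 16).
 */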
3184
3185 if (reladdr) {
3186 src_reg index = src_reg(this, glsl_type::int_type);
3187
3188 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3189 emit_before(inst, MUL(dst_reg(index),
3190 index, src_reg(message_header_scale)));
3191
3192 return index;
3193 } else {
3194 return src_reg(reg_offset * message_header_scale);
3195 }
3196 }
3197
3198 src_reg
3199 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3200 src_reg *reladdr, int reg_offset)
3201 {
3202 if (reladdr) {
3203 src_reg index = src_reg(this, glsl_type::int_type);
3204
3205 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3206
3207 /* Pre-gen6, the message header uses byte offsets instead of vec4
3208 * (16-byte) offset units.
3209 */
3210 if (brw->gen < 6) {
3211 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3212 }
3213
3214 return index;
3215 } else if (brw->gen >= 8) {
3216 /* Store the offset in a GRF so we can send-from-GRF. */
3217 src_reg offset = src_reg(this, glsl_type::int_type);
3218 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3219 return offset;
3220 } else {
3221 int message_header_scale = brw->gen < 6 ? 16 : 1;
3222 return src_reg(reg_offset * message_header_scale);
3223 }
3224 }
3225
3226 /**
3227 * Emits an instruction before @inst to load the value named by @orig_src
3228 * from scratch space at @base_offset to @temp.
3229 *
3230 * @base_offset is measured in 32-byte units (the size of a register).
3231 */
3232 void
3233 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3234 dst_reg temp, src_reg orig_src,
3235 int base_offset)
3236 {
3237 int reg_offset = base_offset + orig_src.reg_offset;
3238 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3239
3240 emit_before(inst, SCRATCH_READ(temp, index));
3241 }
3242
3243 /**
3244 * Emits an instruction after @inst to store the value to be written
3245 * to @orig_dst to scratch space at @base_offset, from @temp.
3246 *
3247 * @base_offset is measured in 32-byte units (the size of a register).
3248 */
3249 void
3250 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3251 {
3252 int reg_offset = base_offset + inst->dst.reg_offset;
3253 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3254
3255 /* Create a temporary register to store *inst's result in.
3256 *
3257 * We have to be careful in MOVing from our temporary result register in
3258 * the scratch write. If we swizzle from channels of the temporary that
3259 * weren't initialized, it will confuse live interval analysis, which will
3260 * make spilling fail to make progress.
3261 */
3262 src_reg temp = src_reg(this, glsl_type::vec4_type);
3263 temp.type = inst->dst.type;
3264 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3265 int swizzles[4];
3266 for (int i = 0; i < 4; i++)
3267 if (inst->dst.writemask & (1 << i))
3268 swizzles[i] = i;
3269 else
3270 swizzles[i] = first_writemask_chan;
3271 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3272 swizzles[2], swizzles[3]);
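/* Illustrative example: if inst only writes .xz, the temporary is
 * read back with swizzle xxzx, so the scratch write never sources the
 * uninitialized y/w channels.
 */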
3273
3274 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3275 inst->dst.writemask));
3276 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3277 write->predicate = inst->predicate;
3278 write->ir = inst->ir;
3279 write->annotation = inst->annotation;
3280 inst->insert_after(write);
3281
3282 inst->dst.file = temp.file;
3283 inst->dst.reg = temp.reg;
3284 inst->dst.reg_offset = temp.reg_offset;
3285 inst->dst.reladdr = NULL;
3286 }
3287
3288 /**
3289 * We can't generally support array access in GRF space, because a
3290 * single instruction's destination can only span 2 contiguous
3291 * registers. So, we send all GRF arrays that get variable index
3292 * access to scratch space.
3293 */
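/* Hypothetical before/after (not in the original source): a write
 * like "MOV grf5[reladdr], x" becomes "MOV tmp, x" plus a
 * SCRATCH_WRITE of tmp at grf5's scratch location, and any source
 * with a reladdr first gets a SCRATCH_READ into a fresh temporary.
 */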
3294 void
3295 vec4_visitor::move_grf_array_access_to_scratch()
3296 {
3297 int scratch_loc[this->virtual_grf_count];
3298
3299 for (int i = 0; i < this->virtual_grf_count; i++) {
3300 scratch_loc[i] = -1;
3301 }
3302
3303 /* First, calculate the set of virtual GRFs that need to be punted
3304 * to scratch due to having any array access on them, and where in
3305 * scratch.
3306 */
3307 foreach_in_list(vec4_instruction, inst, &instructions) {
3308 if (inst->dst.file == GRF && inst->dst.reladdr &&
3309 scratch_loc[inst->dst.reg] == -1) {
3310 scratch_loc[inst->dst.reg] = c->last_scratch;
3311 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3312 }
3313
3314 for (int i = 0 ; i < 3; i++) {
3315 src_reg *src = &inst->src[i];
3316
3317 if (src->file == GRF && src->reladdr &&
3318 scratch_loc[src->reg] == -1) {
3319 scratch_loc[src->reg] = c->last_scratch;
3320 c->last_scratch += this->virtual_grf_sizes[src->reg];
3321 }
3322 }
3323 }
3324
3325 /* Now, for anything that will be accessed through scratch, rewrite
3326 * it to load/store. Note that this is a _safe list walk, because
3327 * we may generate a new scratch_write instruction after the one
3328 * we're processing.
3329 */
3330 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3331 /* Set up the annotation tracking for newly generated instructions. */
3332 base_ir = inst->ir;
3333 current_annotation = inst->annotation;
3334
3335 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3336 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3337 }
3338
3339 for (int i = 0 ; i < 3; i++) {
3340 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3341 continue;
3342
3343 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3344
3345 emit_scratch_read(inst, temp, inst->src[i],
3346 scratch_loc[inst->src[i].reg]);
3347
3348 inst->src[i].file = temp.file;
3349 inst->src[i].reg = temp.reg;
3350 inst->src[i].reg_offset = temp.reg_offset;
3351 inst->src[i].reladdr = NULL;
3352 }
3353 }
3354 }
3355
3356 /**
3357 * Emits an instruction before @inst to load the value named by @orig_src
3358 * from the pull constant buffer (surface) at @base_offset to @temp.
3359 */
3360 void
3361 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3362 dst_reg temp, src_reg orig_src,
3363 int base_offset)
3364 {
3365 int reg_offset = base_offset + orig_src.reg_offset;
3366 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3367 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3368 vec4_instruction *load;
3369
3370 if (brw->gen >= 7) {
3371 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3372 grf_offset.type = offset.type;
3373 emit_before(inst, MOV(grf_offset, offset));
3374
3375 load = new(mem_ctx) vec4_instruction(this,
3376 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3377 temp, index, src_reg(grf_offset));
3378 } else {
3379 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3380 temp, index, offset);
3381 load->base_mrf = 14;
3382 load->mlen = 1;
3383 }
3384 emit_before(inst, load);
3385 }
3386
3387 /**
3388 * Implements array access of uniforms by inserting a
3389 * PULL_CONSTANT_LOAD instruction.
3390 *
3391 * Unlike temporary GRF array access (where we don't support it due to
3392 * the difficulty of doing relative addressing on instruction
3393 * destinations), we could potentially do array access of uniforms
3394 * that were loaded in GRF space as push constants. In real-world
3395 * usage we've seen, though, the arrays being used are always larger
3396 * than we could load as push constants, so just always move all
3397 * uniform array access out to a pull constant buffer.
3398 */
3399 void
3400 vec4_visitor::move_uniform_array_access_to_pull_constants()
3401 {
3402 int pull_constant_loc[this->uniforms];
3403
3404 for (int i = 0; i < this->uniforms; i++) {
3405 pull_constant_loc[i] = -1;
3406 }
3407
3408 /* Walk through and find array access of uniforms. Put a copy of that
3409 * uniform in the pull constant buffer.
3410 *
3411 * Note that we don't move constant-indexed accesses to arrays. No
3412 * testing has been done of the performance impact of this choice.
3413 */
3414 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3415 for (int i = 0 ; i < 3; i++) {
3416 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3417 continue;
3418
3419 int uniform = inst->src[i].reg;
3420
3421 /* If this array isn't already present in the pull constant buffer,
3422 * add it.
3423 */
3424 if (pull_constant_loc[uniform] == -1) {
3425 const gl_constant_value **values =
3426 &stage_prog_data->param[uniform * 4];
3427
3428 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3429
3430 assert(uniform < uniform_array_size);
3431 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3432 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3433 = values[j];
3434 }
3435 }
3436
3437 /* Set up the annotation tracking for newly generated instructions. */
3438 base_ir = inst->ir;
3439 current_annotation = inst->annotation;
3440
3441 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3442
3443 emit_pull_constant_load(inst, temp, inst->src[i],
3444 pull_constant_loc[uniform]);
3445
3446 inst->src[i].file = temp.file;
3447 inst->src[i].reg = temp.reg;
3448 inst->src[i].reg_offset = temp.reg_offset;
3449 inst->src[i].reladdr = NULL;
3450 }
3451 }
3452
3453 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3454 * no need to track them as larger-than-vec4 objects. This will be
3455 * relied on in cutting out unused uniform vectors from push
3456 * constants.
3457 */
3458 split_uniform_registers();
3459 }
3460
3461 void
3462 vec4_visitor::resolve_ud_negate(src_reg *reg)
3463 {
3464 if (reg->type != BRW_REGISTER_TYPE_UD ||
3465 !reg->negate)
3466 return;
3467
3468 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3469 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3470 *reg = temp;
3471 }
3472
3473 vec4_visitor::vec4_visitor(struct brw_context *brw,
3474 struct brw_vec4_compile *c,
3475 struct gl_program *prog,
3476 const struct brw_vec4_prog_key *key,
3477 struct brw_vec4_prog_data *prog_data,
3478 struct gl_shader_program *shader_prog,
3479 gl_shader_stage stage,
3480 void *mem_ctx,
3481 bool debug_flag,
3482 bool no_spills,
3483 shader_time_shader_type st_base,
3484 shader_time_shader_type st_written,
3485 shader_time_shader_type st_reset)
3486 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3487 c(c),
3488 key(key),
3489 prog_data(prog_data),
3490 sanity_param_count(0),
3491 fail_msg(NULL),
3492 first_non_payload_grf(0),
3493 need_all_constants_in_pull_buffer(false),
3494 debug_flag(debug_flag),
3495 no_spills(no_spills),
3496 st_base(st_base),
3497 st_written(st_written),
3498 st_reset(st_reset)
3499 {
3500 this->mem_ctx = mem_ctx;
3501 this->failed = false;
3502
3503 this->base_ir = NULL;
3504 this->current_annotation = NULL;
3505 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3506
3507 this->variable_ht = hash_table_ctor(0,
3508 hash_table_pointer_hash,
3509 hash_table_pointer_compare);
3510
3511 this->virtual_grf_start = NULL;
3512 this->virtual_grf_end = NULL;
3513 this->virtual_grf_sizes = NULL;
3514 this->virtual_grf_count = 0;
3515 this->virtual_grf_reg_map = NULL;
3516 this->virtual_grf_reg_count = 0;
3517 this->virtual_grf_array_size = 0;
3518 this->live_intervals_valid = false;
3519
3520 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3521
3522 this->uniforms = 0;
3523
3524 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3525 * at least one. See setup_uniforms() in brw_vec4.cpp.
3526 */
3527 this->uniform_array_size = 1;
3528 if (prog_data) {
3529 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3530 }
3531
3532 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3533 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3534 }
3535
3536 vec4_visitor::~vec4_visitor()
3537 {
3538 hash_table_dtor(this->variable_ht);
3539 }
3540
3541
3542 void
3543 vec4_visitor::fail(const char *format, ...)
3544 {
3545 va_list va;
3546 char *msg;
3547
3548 if (failed)
3549 return;
3550
3551 failed = true;
3552
3553 va_start(va, format);
3554 msg = ralloc_vasprintf(mem_ctx, format, va);
3555 va_end(va);
3556 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3557
3558 this->fail_msg = msg;
3559
3560 if (debug_flag) {
3561 fprintf(stderr, "%s", msg);
3562 }
3563 }
3564
3565 } /* namespace brw */