/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
#include "glsl/ir_uniform.h"
extern "C" {
#include "program/sampler.h"
}

namespace brw {

vec4_instruction::vec4_instruction(vec4_visitor *v,
                                   enum opcode opcode, const dst_reg &dst,
                                   const src_reg &src0, const src_reg &src1,
                                   const src_reg &src2)
{
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;
   this->saturate = false;
   this->force_writemask_all = false;
   this->no_dd_clear = false;
   this->no_dd_check = false;
   this->writes_accumulator = false;
   this->conditional_mod = BRW_CONDITIONAL_NONE;
   this->texture_offset = 0;
   this->target = 0;
   this->shadow_compare = false;
   this->ir = v->base_ir;
   this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
   this->header_present = false;
   this->mlen = 0;
   this->base_mrf = 0;
   this->offset = 0;
   this->annotation = v->current_annotation;
}

vec4_instruction *
vec4_visitor::emit(vec4_instruction *inst)
{
   this->instructions.push_tail(inst);

   return inst;
}

vec4_instruction *
vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
{
   new_inst->ir = inst->ir;
   new_inst->annotation = inst->annotation;

   inst->insert_before(new_inst);

   return inst;
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, dst_reg dst,
                   src_reg src0, src_reg src1, src_reg src2)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
                                             src0, src1, src2));
}


vec4_instruction *
vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, dst_reg dst)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
}

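/* The ALU* macros below stamp out thin factory methods for the common
 * one-, two-, and three-source opcodes.  For example, ALU2(ADD) expands to
 * roughly:
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, dst,
 *                                            src0, src1);
 *    }
 *
 * Note that these only allocate the instruction; callers hand the result
 * to emit(), as in emit(ADD(dst, a, b)).
 */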
#define ALU1(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
                                           src0);                       \
   }

#define ALU2(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
                                           src0, src1);                 \
   }

#define ALU2_ACC(op)                                                    \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      vec4_instruction *inst = new(mem_ctx) vec4_instruction(this,      \
                                  BRW_OPCODE_##op, dst, src0, src1);    \
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1, const src_reg &src2)           \
   {                                                                    \
      assert(brw->gen >= 6);                                            \
      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
                                           src0, src1, src2);           \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU1(F32TO16)
ALU1(F16TO32)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(DP3)
ALU2(DP4)
ALU2(DPH)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(MAC)

/** Gen4 predicated IF. */
vec4_instruction *
vec4_visitor::IF(enum brw_predicate predicate)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
   inst->predicate = predicate;

   return inst;
}

/** Gen6 IF with embedded comparison. */
vec4_instruction *
vec4_visitor::IF(src_reg src0, src_reg src1,
                 enum brw_conditional_mod condition)
{
   assert(brw->gen == 6);

   vec4_instruction *inst;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
                                        src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
vec4_instruction *
vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
                  enum brw_conditional_mod condition)
{
   vec4_instruction *inst;

   /* Original gen4 does type conversion to the destination type
    * before comparison, producing garbage results for floating
    * point comparisons.
    */
   if (brw->gen == 4) {
      dst.type = src0.type;
      if (dst.file == HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

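/* Scratch reads and writes become data-port messages at code-generation
 * time; base_mrf and mlen describe where the message payload starts and
 * how many MRF registers it occupies.
 */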
vec4_instruction *
vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
                                        dst, index);
   inst->base_mrf = 14;
   inst->mlen = 2;

   return inst;
}

vec4_instruction *
vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
                            const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
                                        dst, src, index);
   inst->base_mrf = 13;
   inst->mlen = 3;

   return inst;
}

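/* Emit a dot product; "elements" selects DP2/DP3/DP4 and must be in the
 * range [2, 4].
 */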
void
vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
{
   static enum opcode dot_opcodes[] = {
      BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
   };

   emit(dot_opcodes[elements - 2], dst, src0, src1);
}

src_reg
vec4_visitor::fix_3src_operand(src_reg src)
{
   /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
    * able to use vertical stride of zero to replicate the vec4 uniform, like
    *
    *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
    *
    * But you can't, since vertical stride is always four in three-source
    * instructions. Instead, insert a MOV instruction to do the replication so
    * that the three-source instruction can consume it.
    */

   /* The MOV is only needed if the source is a uniform or immediate. */
   if (src.file != UNIFORM && src.file != IMM)
      return src;

   if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(MOV(expanded, src));
   return src_reg(expanded);
}

src_reg
vec4_visitor::fix_math_operand(src_reg src)
{
   /* The gen6 math instruction ignores the source modifiers --
    * swizzle, abs, negate, and at least some parts of the register
    * region description.
    *
    * Rather than trying to enumerate all these cases, *always* expand the
    * operand to a temp GRF for gen6.
    *
    * For gen7, keep the operand as-is, except if immediate, which gen7 still
    * can't use.
    */

   if (brw->gen == 7 && src.file != IMM)
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(MOV(expanded, src));
   return src_reg(expanded);
}

void
vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
{
   src = fix_math_operand(src);

   if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
      /* The gen6 math instruction must be align1, so we can't do
       * writemasks.
       */
      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);

      emit(opcode, temp_dst, src);

      emit(MOV(dst, src_reg(temp_dst)));
   } else {
      emit(opcode, dst, src);
   }
}

void
vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
{
   vec4_instruction *inst = emit(opcode, dst, src);
   inst->base_mrf = 1;
   inst->mlen = 1;
}

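/* Emit a unary math instruction, dispatching on hardware generation:
 * gen8+ can take the operand as-is, gen6/7 need the operand fixups in
 * emit_math1_gen6(), and gen4/5 use a message with an MRF payload.
 */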
void
vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      unreachable("not reached: bad math opcode");
   }

   if (brw->gen >= 8) {
      emit(opcode, dst, src);
   } else if (brw->gen >= 6) {
      emit_math1_gen6(opcode, dst, src);
   } else {
      emit_math1_gen4(opcode, dst, src);
   }
}

void
vec4_visitor::emit_math2_gen6(enum opcode opcode,
                              dst_reg dst, src_reg src0, src_reg src1)
{
   src0 = fix_math_operand(src0);
   src1 = fix_math_operand(src1);

   if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
      /* The gen6 math instruction must be align1, so we can't do
       * writemasks.
       */
      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
      temp_dst.type = dst.type;

      emit(opcode, temp_dst, src0, src1);

      emit(MOV(dst, src_reg(temp_dst)));
   } else {
      emit(opcode, dst, src0, src1);
   }
}

void
vec4_visitor::emit_math2_gen4(enum opcode opcode,
                              dst_reg dst, src_reg src0, src_reg src1)
{
   vec4_instruction *inst = emit(opcode, dst, src0, src1);
   inst->base_mrf = 1;
   inst->mlen = 2;
}

void
vec4_visitor::emit_math(enum opcode opcode,
                        dst_reg dst, src_reg src0, src_reg src1)
{
   switch (opcode) {
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      break;
   default:
      unreachable("not reached: unsupported binary math opcode");
   }

   if (brw->gen >= 8) {
      emit(opcode, dst, src0, src1);
   } else if (brw->gen >= 6) {
      emit_math2_gen6(opcode, dst, src0, src1);
   } else {
      emit_math2_gen4(opcode, dst, src0, src1);
   }
}

void
vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
{
   if (brw->gen < 7) {
      unreachable("ir_unop_pack_half_2x16 should be lowered");
   }

   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    *
    * The above restriction implies that the f32to16 instruction must use
    * align1 mode, because only in align1 mode is it possible to specify
    * horizontal stride.  We choose here to defy the hardware docs and emit
    * align16 instructions.
    *
    * (I [chadv] did attempt to emit align1 instructions for VS f32to16
    * instructions. I was partially successful in that the code passed all
    * tests.  However, the code was dubiously correct and fragile, and the
    * tests were not harsh enough to probe that frailty. Not trusting the
    * code, I chose instead to remain in align16 mode in defiance of the hw
    * docs).
    *
    * I've [chadv] experimentally confirmed that, on gen7 hardware and the
    * simulator, emitting a f32to16 in align16 mode with UD as destination
    * data type is safe. The behavior differs from that specified in the PRM
    * in that the upper word of each destination channel is cleared to 0.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

#if 0
   /* Verify the undocumented behavior on which the following instructions
    * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
    * then the result of the bit-or instruction below will be incorrect.
    *
    * You should inspect the disasm output in order to verify that the MOV is
    * not optimized away.
    */
   emit(MOV(tmp_dst, src_reg(0x12345678u)));
#endif

   /* Give tmp the form below, where "." means untouched.
    *
    *     w z          y          x w z          y          x
    *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
    *
    * That the upper word of each write-channel be 0 is required for the
    * following bit-shift and bit-or instructions to work. Note that this
    * relies on the undocumented hardware behavior mentioned above.
    */
   tmp_dst.writemask = WRITEMASK_XY;
   emit(F32TO16(tmp_dst, src0));

   /* Give the write-channels of dst the form:
    *   0xhhhh0000
    */
   tmp_src.swizzle = BRW_SWIZZLE_YYYY;
   emit(SHL(dst, tmp_src, src_reg(16u)));

   /* Finally, give the write-channels of dst the form of packHalf2x16's
    * output:
    *   0xhhhhllll
    */
   tmp_src.swizzle = BRW_SWIZZLE_XXXX;
   emit(OR(dst, src_reg(dst), tmp_src));
}

void
vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
{
   if (brw->gen < 7) {
      unreachable("ir_unop_unpack_half_2x16 should be lowered");
   }

   assert(dst.type == BRW_REGISTER_TYPE_F);
   assert(src0.type == BRW_REGISTER_TYPE_UD);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the source data type must be Word (W). The destination type must be
    *   F (Float).
    *
    * To use W as the source data type, we must adjust horizontal strides,
    * which is only possible in align1 mode. All my [chadv] attempts at
    * emitting align1 instructions for unpackHalf2x16 failed to pass the
    * Piglit tests, so I gave up.
    *
    * I've verified that, on gen7 hardware and the simulator, it is safe to
    * emit f16to32 in align16 mode with UD as source data type.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

   tmp_dst.writemask = WRITEMASK_X;
   emit(AND(tmp_dst, src0, src_reg(0xffffu)));

   tmp_dst.writemask = WRITEMASK_Y;
   emit(SHR(tmp_dst, src0, src_reg(16u)));

   dst.writemask = WRITEMASK_XY;
   emit(F16TO32(dst, tmp_src));
}

void
vec4_visitor::visit_instructions(const exec_list *list)
{
   foreach_in_list(ir_instruction, ir, list) {
      base_ir = ir;
      ir->accept(this);
   }
}


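/* Return how many vec4 slots a value of the given GLSL type occupies in
 * the vec4 backend.  Scalars and vectors each take a whole vec4; matrices
 * take one vec4 per column.
 */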
static int
type_size(const struct glsl_type *type)
{
   unsigned int i;
   int size;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      if (type->is_matrix()) {
         return type->matrix_columns;
      } else {
         /* Regardless of size of vector, it gets a vec4. This is bad
          * packing for things like floats, but otherwise arrays become a
          * mess.  Hopefully a later pass over the code can pack scalars
          * down if appropriate.
          */
         return 1;
      }
   case GLSL_TYPE_ARRAY:
      assert(type->length > 0);
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up one slot in UNIFORMS[], but they're baked in
       * at link time.
       */
      return 1;
   case GLSL_TYPE_ATOMIC_UINT:
      return 0;
   case GLSL_TYPE_IMAGE:
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      unreachable("not reached");
   }

   return 0;
}

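/* Allocate "size" consecutive vec4 slots in the virtual register file,
 * growing the bookkeeping arrays geometrically as needed.  Returns the new
 * virtual GRF's index; virtual_grf_reg_map[] records its first slot.
 */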
int
vec4_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
      virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
                                     virtual_grf_array_size);
   }
   virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
   virtual_grf_reg_count += size;
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));

   if (type->is_array() || type->is_record()) {
      this->swizzle = BRW_SWIZZLE_NOOP;
   } else {
      this->swizzle = swizzle_for_size(type->vector_elements);
   }

   this->type = brw_type_for_base_type(type);
}

dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));

   if (type->is_array() || type->is_record()) {
      this->writemask = WRITEMASK_XYZW;
   } else {
      this->writemask = (1 << type->vector_elements) - 1;
   }

   this->type = brw_type_for_base_type(type);
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
vec4_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      gl_constant_value *components = storage->storage;
      unsigned vector_count = (MAX2(storage->array_elements, 1) *
                               storage->type->matrix_columns);

      for (unsigned s = 0; s < vector_count; s++) {
         assert(uniforms < uniform_array_size);
         uniform_vector_size[uniforms] = storage->type->vector_elements;

         int i;
         for (i = 0; i < uniform_vector_size[uniforms]; i++) {
            stage_prog_data->param[uniforms * 4 + i] = components;
            components++;
         }
         for (; i < 4; i++) {
            static gl_constant_value zero = { 0.0 };
            stage_prog_data->param[uniforms * 4 + i] = &zero;
         }

         uniforms++;
      }
   }
}

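/* Lay out the active user clip planes as a contiguous run of vec4 float
 * uniforms, recording each plane's location in userplane[] for later use.
 */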
void
vec4_visitor::setup_uniform_clipplane_values()
{
   gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);

   for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
      assert(this->uniforms < uniform_array_size);
      this->uniform_vector_size[this->uniforms] = 4;
      this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
      this->userplane[i].type = BRW_REGISTER_TYPE_F;
      for (int j = 0; j < 4; ++j) {
         stage_prog_data->param[this->uniforms * 4 + j] =
            (gl_constant_value *) &clip_planes[i][j];
      }
      ++this->uniforms;
   }
}

/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa,
       * but we'll get the same index back here.  We can reference
       * ParameterValues directly, since unlike brw_fs.cpp, we never
       * add new state references during compile.
       */
      int index = _mesa_add_state_reference(this->prog->Parameters,
                                            (gl_state_index *)slots[i].tokens);
      gl_constant_value *values =
         &this->prog->Parameters->ParameterValues[index][0];

      assert(this->uniforms < uniform_array_size);
      this->uniform_vector_size[this->uniforms] = 0;
      /* Add each of the unique swizzled channels of the element.
       * This will end up matching the size of the glsl_type of this field.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         last_swiz = swiz;

         stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
         assert(this->uniforms < uniform_array_size);
         if (swiz <= last_swiz)
            this->uniform_vector_size[this->uniforms]++;
      }
      this->uniforms++;
   }
}

dst_reg *
vec4_visitor::variable_storage(ir_variable *var)
{
   return (dst_reg *)hash_table_find(this->variable_ht, var);
}

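/* Evaluate a boolean rvalue and leave its value in the flag register,
 * returning through *predicate the predication mode a consumer should
 * use.  Comparisons and logic ops are folded straight into flag-setting
 * instructions rather than materializing a boolean temporary first.
 */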
void
vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
                                     enum brw_predicate *predicate)
{
   ir_expression *expr = ir->as_expression();

   *predicate = BRW_PREDICATE_NORMAL;

   if (expr) {
      src_reg op[2];
      vec4_instruction *inst;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         expr->operands[i]->accept(this);
         op[i] = this->result;

         resolve_ud_negate(&op[i]);
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         break;

      case ir_binop_logic_xor:
         inst = emit(XOR(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_logic_or:
         inst = emit(OR(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_logic_and:
         inst = emit(AND(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_unop_f2b:
         if (brw->gen >= 6) {
            emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
         } else {
            inst = emit(MOV(dst_null_f(), op[0]));
            inst->conditional_mod = BRW_CONDITIONAL_NZ;
         }
         break;

      case ir_unop_i2b:
         if (brw->gen >= 6) {
            emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         } else {
            inst = emit(MOV(dst_null_d(), op[0]));
            inst->conditional_mod = BRW_CONDITIONAL_NZ;
         }
         break;

      case ir_binop_all_equal:
         inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
         *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
         break;

      case ir_binop_any_nequal:
         inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
         *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
         break;

      case ir_unop_any:
         inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
         break;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_nequal:
         emit(CMP(dst_null_d(), op[0], op[1],
                  brw_conditional_for_comparison(expr->operation)));
         break;

      default:
         unreachable("not reached");
      }
      return;
   }

   ir->accept(this);

   resolve_ud_negate(&this->result);

   if (brw->gen >= 6) {
      vec4_instruction *inst = emit(AND(dst_null_d(),
                                        this->result, src_reg(1)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   } else {
      vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}

/**
 * Emit a gen6 IF statement with the comparison folded into the IF
 * instruction.
 */
void
vec4_visitor::emit_if_gen6(ir_if *ir)
{
   ir_expression *expr = ir->condition->as_expression();

   if (expr) {
      src_reg op[2];
      dst_reg temp;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         expr->operands[i]->accept(this);
         op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
         return;

      case ir_binop_logic_xor:
         emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
         return;

      case ir_binop_logic_or:
         temp = dst_reg(this, glsl_type::bool_type);
         emit(OR(temp, op[0], op[1]));
         emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
         return;

      case ir_binop_logic_and:
         temp = dst_reg(this, glsl_type::bool_type);
         emit(AND(temp, op[0], op[1]));
         emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
         return;

      case ir_unop_f2b:
         emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         return;

      case ir_unop_i2b:
         emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         return;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_nequal:
         emit(IF(op[0], op[1],
                 brw_conditional_for_comparison(expr->operation)));
         return;

      case ir_binop_all_equal:
         emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
         emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
         return;

      case ir_binop_any_nequal:
         emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
         emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
         return;

      case ir_unop_any:
         emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
         return;

      default:
         unreachable("not reached");
      }
      return;
   }

   ir->condition->accept(this);

   emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
}

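/* Allocate backing storage for a variable declaration, keyed by mode:
 * vertex attributes and shader outputs map to fixed locations, temporaries
 * get fresh GRFs, and uniforms go through the setup_*_uniform_values
 * helpers above.
 */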
void
vec4_visitor::visit(ir_variable *ir)
{
   dst_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   switch (ir->data.mode) {
   case ir_var_shader_in:
      reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
      break;

   case ir_var_shader_out:
      reg = new(mem_ctx) dst_reg(this, ir->type);

      for (int i = 0; i < type_size(ir->type); i++) {
         output_reg[ir->data.location + i] = *reg;
         output_reg[ir->data.location + i].reg_offset = i;
         output_reg[ir->data.location + i].type =
            brw_type_for_base_type(ir->type->get_scalar_type());
         output_reg_annotation[ir->data.location + i] = ir->name;
      }
      break;

   case ir_var_auto:
   case ir_var_temporary:
      reg = new(mem_ctx) dst_reg(this, ir->type);
      break;

   case ir_var_uniform:
      reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);

      /* Thanks to the lower_ubo_reference pass, we will see only
       * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
       * variables, so no need for them to be in variable_ht.
       *
       * Atomic counters take no uniform storage, no need to do
       * anything here.
       */
      if (ir->is_in_uniform_block() || ir->type->contains_atomic())
         return;

      /* Track how big the whole uniform variable is, in case we need to put a
       * copy of its data into pull constants for array access.
       */
      assert(this->uniforms < uniform_array_size);
      this->uniform_size[this->uniforms] = type_size(ir->type);

      if (!strncmp(ir->name, "gl_", 3)) {
         setup_builtin_uniform_values(ir);
      } else {
         setup_uniform_values(ir);
      }
      break;

   case ir_var_system_value:
      reg = make_reg_for_system_value(ir);
      break;

   default:
      unreachable("not reached");
   }

   reg->type = brw_type_for_base_type(ir->type);
   hash_table_insert(this->variable_ht, reg, ir);
}

void
vec4_visitor::visit(ir_loop *ir)
{
   /* We don't want debugging output to print the whole body of the
    * loop as the annotation.
    */
   this->base_ir = NULL;

   emit(BRW_OPCODE_DO);

   visit_instructions(&ir->body_instructions);

   emit(BRW_OPCODE_WHILE);
}

void
vec4_visitor::visit(ir_loop_jump *ir)
{
   switch (ir->mode) {
   case ir_loop_jump::jump_break:
      emit(BRW_OPCODE_BREAK);
      break;
   case ir_loop_jump::jump_continue:
      emit(BRW_OPCODE_CONTINUE);
      break;
   }
}


void
vec4_visitor::visit(ir_function_signature *)
{
   unreachable("not reached");
}

void
vec4_visitor::visit(ir_function *ir)
{
   /* Ignore function bodies other than main() -- we shouldn't see calls to
    * them since they should all be inlined.
    */
   if (strcmp(ir->name, "main") == 0) {
      const ir_function_signature *sig;
      exec_list empty;

      sig = ir->matching_signature(NULL, &empty, false);

      assert(sig);

      visit_instructions(&sig->body);
   }
}

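/* If this expression is just a saturate of another rvalue, emit it as a
 * single saturating MOV instead of computing the value and clamping it
 * afterward.
 */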
bool
vec4_visitor::try_emit_sat(ir_expression *ir)
{
   ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
   if (!sat_src)
      return false;

   sat_src->accept(this);
   src_reg src = this->result;

   this->result = src_reg(this, ir->type);
   vec4_instruction *inst;
   inst = emit(MOV(dst_reg(this->result), src));
   inst->saturate = true;

   return true;
}

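/* Try to fuse a float add-of-multiply into a single MAD.  Fails when
 * neither operand of the add is a multiply, the type isn't float, or the
 * hardware (gen < 6) lacks 3-source instructions.
 */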
bool
vec4_visitor::try_emit_mad(ir_expression *ir)
{
   /* 3-src instructions were introduced in gen6. */
   if (brw->gen < 6)
      return false;

   /* MAD can only handle floating-point data. */
   if (ir->type->base_type != GLSL_TYPE_FLOAT)
      return false;

   ir_rvalue *nonmul = ir->operands[1];
   ir_expression *mul = ir->operands[0]->as_expression();

   if (!mul || mul->operation != ir_binop_mul) {
      nonmul = ir->operands[0];
      mul = ir->operands[1]->as_expression();

      if (!mul || mul->operation != ir_binop_mul)
         return false;
   }

   nonmul->accept(this);
   src_reg src0 = fix_3src_operand(this->result);

   mul->operands[0]->accept(this);
   src_reg src1 = fix_3src_operand(this->result);

   mul->operands[1]->accept(this);
   src_reg src2 = fix_3src_operand(this->result);

   this->result = src_reg(this, ir->type);
   emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);

   return true;
}

bool
vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
{
   /* This optimization relies on CMP setting the destination to 0 when
    * false.  Early hardware only sets the least significant bit, and
    * leaves the other bits undefined.  So we can't use it.
    */
   if (brw->gen < 6)
      return false;

   ir_expression *const cmp = ir->operands[0]->as_expression();

   if (cmp == NULL)
      return false;

   switch (cmp->operation) {
   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_nequal:
      break;

   default:
      return false;
   }

   cmp->operands[0]->accept(this);
   const src_reg cmp_src0 = this->result;

   cmp->operands[1]->accept(this);
   const src_reg cmp_src1 = this->result;

   this->result = src_reg(this, ir->type);

   emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
            brw_conditional_for_comparison(cmp->operation)));

   /* If the comparison is false, this->result will just happen to be zero.
    */
   vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
                                       this->result, src_reg(1.0f));
   inst->predicate = BRW_PREDICATE_NORMAL;
   inst->predicate_inverse = true;

   return true;
}

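/* Emit a MIN or MAX as a conditional SEL.  Gen6+ supports a conditional
 * modifier on SEL directly; earlier hardware needs an explicit CMP to set
 * the flag register first.
 */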
void
vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
                          src_reg src0, src_reg src1)
{
   vec4_instruction *inst;

   if (brw->gen >= 6) {
      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
      inst->conditional_mod = conditionalmod;
   } else {
      emit(CMP(dst, src0, src1, conditionalmod));

      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
      inst->predicate = BRW_PREDICATE_NORMAL;
   }
}

void
vec4_visitor::emit_lrp(const dst_reg &dst,
                       const src_reg &x, const src_reg &y, const src_reg &a)
{
   if (brw->gen >= 6) {
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      emit(LRP(dst,
               fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
   } else {
      /* Earlier generations don't support three source operations, so we
       * need to emit x*(1-a) + y*a.
       */
      dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
      dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
      dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
      y_times_a.writemask = dst.writemask;
      one_minus_a.writemask = dst.writemask;
      x_times_one_minus_a.writemask = dst.writemask;

      emit(MUL(y_times_a, y, a));
      emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
      emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
      emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
   }
}

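/* Generate code for an expression tree.  Operands are evaluated first
 * (matrix operations must already have been lowered to vector ones), then
 * the result is computed into a fresh temporary whose writemask is limited
 * to the channels the expression's type actually uses.
 */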
void
vec4_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   src_reg op[Elements(ir->operands)];
   src_reg result_src;
   dst_reg result_dst;
   vec4_instruction *inst;

   if (try_emit_sat(ir))
      return;

   if (ir->operation == ir_binop_add) {
      if (try_emit_mad(ir))
         return;
   }

   if (ir->operation == ir_unop_b2f) {
      if (try_emit_b2f_of_compare(ir))
         return;
   }

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      this->result.file = BAD_FILE;
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
         fprintf(stderr, "Failed to get tree for expression operand:\n");
         ir->operands[operand]->fprint(stderr);
         exit(1);
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
   }

   int vector_elements = ir->operands[0]->type->vector_elements;
   if (ir->operands[1]) {
      vector_elements = MAX2(vector_elements,
                             ir->operands[1]->type->vector_elements);
   }

   this->result.file = BAD_FILE;

   /* Storage for our result.  Ideally for an assignment we'd be using
    * the actual storage for the result here, instead.
    */
   result_src = src_reg(this, ir->type);
   /* convenience for the emit functions below. */
   result_dst = dst_reg(result_src);
   /* If nothing special happens, this is the result. */
   this->result = result_src;
   /* Limit writes to the channels that will be used by result_src later.
    * This does limit this temp's use as a temporary for multi-instruction
    * sequences.
    */
   result_dst.writemask = (1 << ir->type->vector_elements) - 1;

   switch (ir->operation) {
   case ir_unop_logic_not:
      if (ctx->Const.UniformBooleanTrue != 1) {
         emit(NOT(result_dst, op[0]));
      } else {
         emit(XOR(result_dst, op[0], src_reg(1)));
      }
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      emit(MOV(result_dst, op[0]));
      break;
   case ir_unop_abs:
      op[0].abs = true;
      op[0].negate = false;
      emit(MOV(result_dst, op[0]));
      break;

   case ir_unop_sign:
      if (ir->type->is_float()) {
         /* AND(val, 0x80000000) gives the sign bit.
          *
          * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
          * zero.
          */
         emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));

         op[0].type = BRW_REGISTER_TYPE_UD;
         result_dst.type = BRW_REGISTER_TYPE_UD;
         emit(AND(result_dst, op[0], src_reg(0x80000000u)));

         inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
         inst->predicate = BRW_PREDICATE_NORMAL;

         this->result.type = BRW_REGISTER_TYPE_F;
      } else {
         /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
          *               -> non-negative val generates 0x00000000.
          * Predicated OR sets 1 if val is positive.
          */
         emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));

         emit(ASR(result_dst, op[0], src_reg(31)));

         inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }
      break;

   case ir_unop_rcp:
      emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
      break;
   case ir_unop_log2:
      emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      unreachable("not reached: should be handled by ir_explog_to_explog2");
   case ir_unop_sin:
   case ir_unop_sin_reduced:
      emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
      break;
   case ir_unop_cos:
   case ir_unop_cos_reduced:
      emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
      break;

   case ir_unop_dFdx:
   case ir_unop_dFdx_coarse:
   case ir_unop_dFdx_fine:
   case ir_unop_dFdy:
   case ir_unop_dFdy_coarse:
   case ir_unop_dFdy_fine:
      unreachable("derivatives not valid in vertex shader");

   case ir_unop_bitfield_reverse:
      emit(BFREV(result_dst, op[0]));
      break;
   case ir_unop_bit_count:
      emit(CBIT(result_dst, op[0]));
      break;
   case ir_unop_find_msb: {
      src_reg temp = src_reg(this, glsl_type::uint_type);

      inst = emit(FBH(dst_reg(temp), op[0]));
      inst->dst.writemask = WRITEMASK_XYZW;

      /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
       * from the LSB side.  If FBH didn't return an error (0xFFFFFFFF), then
       * subtract the result from 31 to convert the MSB count into an LSB count.
       */

      /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
      temp.swizzle = BRW_SWIZZLE_NOOP;
      emit(MOV(result_dst, temp));

      src_reg src_tmp = src_reg(result_dst);
      emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));

      src_tmp.negate = true;
      inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   }
   case ir_unop_find_lsb:
      emit(FBL(result_dst, op[0]));
      break;

   case ir_unop_noise:
      unreachable("not reached: should be handled by lower_noise");

   case ir_binop_add:
      emit(ADD(result_dst, op[0], op[1]));
      break;
   case ir_binop_sub:
      unreachable("not reached: should be handled by ir_sub_to_add_neg");

   case ir_binop_mul:
      if (brw->gen < 8 && ir->type->is_integer()) {
         /* For integer multiplication, the MUL uses the low 16 bits of one
          * of the operands (src0 through SNB, src1 on IVB and later).  The
          * MACH accumulates the contribution of the upper 16 bits of that
          * operand.  If we can determine that one of the args is in the low
          * 16 bits, though, we can just emit a single MUL.
          */
         if (ir->operands[0]->is_uint16_constant()) {
            if (brw->gen < 7)
               emit(MUL(result_dst, op[0], op[1]));
            else
               emit(MUL(result_dst, op[1], op[0]));
         } else if (ir->operands[1]->is_uint16_constant()) {
            if (brw->gen < 7)
               emit(MUL(result_dst, op[1], op[0]));
            else
               emit(MUL(result_dst, op[0], op[1]));
         } else {
            struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);

            emit(MUL(acc, op[0], op[1]));
            emit(MACH(dst_null_d(), op[0], op[1]));
            emit(MOV(result_dst, src_reg(acc)));
         }
      } else {
         emit(MUL(result_dst, op[0], op[1]));
      }
      break;
   case ir_binop_imul_high: {
      struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);

      emit(MUL(acc, op[0], op[1]));
      emit(MACH(result_dst, op[0], op[1]));
      break;
   }
   case ir_binop_div:
      /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
      assert(ir->type->is_integer());
      emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
      break;
   case ir_binop_carry: {
      struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);

      emit(ADDC(dst_null_ud(), op[0], op[1]));
      emit(MOV(result_dst, src_reg(acc)));
      break;
   }
   case ir_binop_borrow: {
      struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);

      emit(SUBB(dst_null_ud(), op[0], op[1]));
      emit(MOV(result_dst, src_reg(acc)));
      break;
   }
   case ir_binop_mod:
      /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
      assert(ir->type->is_integer());
      emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_nequal: {
      emit(CMP(result_dst, op[0], op[1],
               brw_conditional_for_comparison(ir->operation)));
      if (ctx->Const.UniformBooleanTrue == 1) {
         emit(AND(result_dst, result_src, src_reg(1)));
      }
      break;
   }

   case ir_binop_all_equal:
      /* "==" operator producing a scalar boolean. */
      if (ir->operands[0]->type->is_vector() ||
          ir->operands[1]->type->is_vector()) {
         emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
         emit(MOV(result_dst, src_reg(0)));
         inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
         inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
      } else {
         emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
         if (ctx->Const.UniformBooleanTrue == 1) {
            emit(AND(result_dst, result_src, src_reg(1)));
         }
      }
      break;
   case ir_binop_any_nequal:
      /* "!=" operator producing a scalar boolean. */
      if (ir->operands[0]->type->is_vector() ||
          ir->operands[1]->type->is_vector()) {
         emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));

         emit(MOV(result_dst, src_reg(0)));
         inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
         inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
      } else {
         emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
         if (ctx->Const.UniformBooleanTrue == 1) {
            emit(AND(result_dst, result_src, src_reg(1)));
         }
      }
      break;

   case ir_unop_any:
      emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
      emit(MOV(result_dst, src_reg(0)));

      inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
      break;

   case ir_binop_logic_xor:
      emit(XOR(result_dst, op[0], op[1]));
      break;

   case ir_binop_logic_or:
      emit(OR(result_dst, op[0], op[1]));
      break;

   case ir_binop_logic_and:
      emit(AND(result_dst, op[0], op[1]));
      break;

   case ir_binop_dot:
      assert(ir->operands[0]->type->is_vector());
      assert(ir->operands[0]->type == ir->operands[1]->type);
      emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
      break;

   case ir_unop_sqrt:
      emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
      break;
   case ir_unop_rsq:
      emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
      break;

   case ir_unop_bitcast_i2f:
   case ir_unop_bitcast_u2f:
      this->result = op[0];
      this->result.type = BRW_REGISTER_TYPE_F;
      break;

   case ir_unop_bitcast_f2i:
      this->result = op[0];
      this->result.type = BRW_REGISTER_TYPE_D;
      break;

   case ir_unop_bitcast_f2u:
      this->result = op[0];
      this->result.type = BRW_REGISTER_TYPE_UD;
      break;

   case ir_unop_i2f:
   case ir_unop_i2u:
   case ir_unop_u2i:
   case ir_unop_u2f:
   case ir_unop_f2i:
   case ir_unop_f2u:
      emit(MOV(result_dst, op[0]));
      break;
   case ir_unop_b2i:
      if (ctx->Const.UniformBooleanTrue != 1) {
         emit(AND(result_dst, op[0], src_reg(1)));
      } else {
         emit(MOV(result_dst, op[0]));
      }
      break;
   case ir_unop_b2f:
      if (ctx->Const.UniformBooleanTrue != 1) {
         op[0].type = BRW_REGISTER_TYPE_UD;
         result_dst.type = BRW_REGISTER_TYPE_UD;
         emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
         result_dst.type = BRW_REGISTER_TYPE_F;
      } else {
         emit(MOV(result_dst, op[0]));
      }
      break;
   case ir_unop_f2b:
   case ir_unop_i2b:
      emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
      if (ctx->Const.UniformBooleanTrue == 1) {
         emit(AND(result_dst, result_src, src_reg(1)));
      }
      break;

   case ir_unop_trunc:
      emit(RNDZ(result_dst, op[0]));
      break;
   case ir_unop_ceil:
      op[0].negate = !op[0].negate;
      inst = emit(RNDD(result_dst, op[0]));
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(RNDD(result_dst, op[0]));
      break;
   case ir_unop_fract:
      inst = emit(FRC(result_dst, op[0]));
      break;
   case ir_unop_round_even:
      emit(RNDE(result_dst, op[0]));
      break;

   case ir_binop_min:
      emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
      break;
   case ir_binop_max:
      emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
      break;

   case ir_binop_pow:
      emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
      break;

   case ir_unop_bit_not:
      inst = emit(NOT(result_dst, op[0]));
      break;
   case ir_binop_bit_and:
      inst = emit(AND(result_dst, op[0], op[1]));
      break;
   case ir_binop_bit_xor:
      inst = emit(XOR(result_dst, op[0], op[1]));
      break;
   case ir_binop_bit_or:
      inst = emit(OR(result_dst, op[0], op[1]));
      break;

   case ir_binop_lshift:
      inst = emit(SHL(result_dst, op[0], op[1]));
      break;

   case ir_binop_rshift:
      if (ir->type->base_type == GLSL_TYPE_INT)
         inst = emit(ASR(result_dst, op[0], op[1]));
      else
         inst = emit(SHR(result_dst, op[0], op[1]));
      break;

   case ir_binop_bfm:
      emit(BFI1(result_dst, op[0], op[1]));
      break;

   case ir_binop_ubo_load: {
      ir_constant *const_uniform_block = ir->operands[0]->as_constant();
      ir_constant *const_offset_ir = ir->operands[1]->as_constant();
      unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
      src_reg offset;

      /* Now, load the vector from that offset. */
      assert(ir->type->is_vector() || ir->type->is_scalar());

      src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
      packed_consts.type = result.type;
      src_reg surf_index;

      if (const_uniform_block) {
         /* The block index is a constant, so just emit the binding table entry
          * as an immediate.
          */
         surf_index = src_reg(prog_data->base.binding_table.ubo_start +
                              const_uniform_block->value.u[0]);
      } else {
         /* The block index is not a constant.  Evaluate the index expression
          * per-channel and add the base UBO index; the generator will select
          * a value from any live channel.
          */
         surf_index = src_reg(this, glsl_type::uint_type);
         emit(ADD(dst_reg(surf_index), op[0],
                  src_reg(prog_data->base.binding_table.ubo_start)));

         /* Assume this may touch any UBO.  It would be nice to provide
          * a tighter bound, but the array information is already lowered away.
          */
         brw_mark_surface_used(&prog_data->base,
                               prog_data->base.binding_table.ubo_start +
                               shader_prog->NumUniformBlocks - 1);
      }

      if (const_offset_ir) {
         if (brw->gen >= 8) {
            /* Store the offset in a GRF so we can send-from-GRF. */
            offset = src_reg(this, glsl_type::int_type);
            emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
         } else {
            /* Immediates are fine on older generations since they'll be moved
             * to a (potentially fake) MRF at the generator level.
             */
            offset = src_reg(const_offset / 16);
         }
      } else {
         offset = src_reg(this, glsl_type::uint_type);
         emit(SHR(dst_reg(offset), op[1], src_reg(4)));
      }

      if (brw->gen >= 7) {
         dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
         grf_offset.type = offset.type;

         emit(MOV(grf_offset, offset));

         emit(new(mem_ctx) vec4_instruction(this,
                                            VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
                                            dst_reg(packed_consts),
                                            surf_index,
                                            src_reg(grf_offset)));
      } else {
         vec4_instruction *pull =
            emit(new(mem_ctx) vec4_instruction(this,
                                               VS_OPCODE_PULL_CONSTANT_LOAD,
                                               dst_reg(packed_consts),
                                               surf_index,
                                               offset));
         pull->base_mrf = 14;
         pull->mlen = 1;
      }

      packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
      packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
                                            const_offset % 16 / 4,
                                            const_offset % 16 / 4,
                                            const_offset % 16 / 4);

      /* UBO bools are any nonzero int.  We need to convert them to use the
       * value of true stored in ctx->Const.UniformBooleanTrue.
       */
      if (ir->type->base_type == GLSL_TYPE_BOOL) {
         emit(CMP(result_dst, packed_consts, src_reg(0u),
                  BRW_CONDITIONAL_NZ));
         if (ctx->Const.UniformBooleanTrue == 1) {
            emit(AND(result_dst, result, src_reg(1)));
         }
      } else {
         emit(MOV(result_dst, packed_consts));
      }
      break;
   }

   case ir_binop_vector_extract:
      unreachable("should have been lowered by vec_index_to_cond_assign");

   case ir_triop_fma:
      op[0] = fix_3src_operand(op[0]);
      op[1] = fix_3src_operand(op[1]);
      op[2] = fix_3src_operand(op[2]);
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      emit(MAD(result_dst, op[2], op[1], op[0]));
      break;

   case ir_triop_lrp:
      emit_lrp(result_dst, op[0], op[1], op[2]);
      break;

   case ir_triop_csel:
      emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
      inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;

   case ir_triop_bfi:
      op[0] = fix_3src_operand(op[0]);
      op[1] = fix_3src_operand(op[1]);
      op[2] = fix_3src_operand(op[2]);
      emit(BFI2(result_dst, op[0], op[1], op[2]));
      break;

   case ir_triop_bitfield_extract:
      op[0] = fix_3src_operand(op[0]);
      op[1] = fix_3src_operand(op[1]);
      op[2] = fix_3src_operand(op[2]);
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      emit(BFE(result_dst, op[2], op[1], op[0]));
      break;

   case ir_triop_vector_insert:
      unreachable("should have been lowered by lower_vector_insert");

   case ir_quadop_bitfield_insert:
      unreachable("not reached: should be handled by "
                  "bitfield_insert_to_bfm_bfi\n");

   case ir_quadop_vector:
      unreachable("not reached: should be handled by lower_quadop_vector");

   case ir_unop_pack_half_2x16:
      emit_pack_half_2x16(result_dst, op[0]);
      break;
   case ir_unop_unpack_half_2x16:
      emit_unpack_half_2x16(result_dst, op[0]);
      break;
   case ir_unop_pack_snorm_2x16:
   case ir_unop_pack_snorm_4x8:
   case ir_unop_pack_unorm_2x16:
   case ir_unop_pack_unorm_4x8:
   case ir_unop_unpack_snorm_2x16:
   case ir_unop_unpack_snorm_4x8:
   case ir_unop_unpack_unorm_2x16:
   case ir_unop_unpack_unorm_4x8:
      unreachable("not reached: should be handled by lower_packing_builtins");
   case ir_unop_unpack_half_2x16_split_x:
   case ir_unop_unpack_half_2x16_split_y:
   case ir_binop_pack_half_2x16_split:
   case ir_unop_interpolate_at_centroid:
   case ir_binop_interpolate_at_sample:
   case ir_binop_interpolate_at_offset:
      unreachable("not reached: should not occur in vertex shader");
   case ir_binop_ldexp:
      unreachable("not reached: should be handled by ldexp_to_arith()");
   }
}


void
vec4_visitor::visit(ir_swizzle *ir)
{
   src_reg src;
   int i = 0;
   int swizzle[4];

   /* Note that this is only swizzles in expressions, not those on the left
    * hand side of an assignment, which do write masking.  See ir_assignment
    * for that.
    */

   ir->val->accept(this);
   src = this->result;
   assert(src.file != BAD_FILE);

   for (i = 0; i < ir->type->vector_elements; i++) {
      switch (i) {
      case 0:
         swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
         break;
      case 1:
         swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
         break;
      case 2:
         swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
         break;
      case 3:
         swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
         break;
      }
   }
   for (; i < 4; i++) {
      /* Replicate the last channel out. */
      swizzle[i] = swizzle[ir->type->vector_elements - 1];
   }

   src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);

   this->result = src;
}

void
vec4_visitor::visit(ir_dereference_variable *ir)
{
   const struct glsl_type *type = ir->type;
   dst_reg *reg = variable_storage(ir->var);

   if (!reg) {
      fail("Failed to find variable storage for %s\n", ir->var->name);
      this->result = src_reg(brw_null_reg());
      return;
   }

   this->result = src_reg(*reg);

   /* System values get their swizzle from the dst_reg writemask */
   if (ir->var->data.mode == ir_var_system_value)
      return;

   if (type->is_scalar() || type->is_vector() || type->is_matrix())
      this->result.swizzle = swizzle_for_size(type->vector_elements);
}


int
vec4_visitor::compute_array_stride(ir_dereference_array *ir)
{
   /* Under normal circumstances array elements are stored consecutively, so
    * the stride is equal to the size of the array element.
    */
   return type_size(ir->type);
}


void
vec4_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *constant_index;
   src_reg src;
   int array_stride = compute_array_stride(ir);

   constant_index = ir->array_index->constant_expression_value();

   ir->array->accept(this);
   src = this->result;

   if (constant_index) {
      src.reg_offset += constant_index->value.i[0] * array_stride;
   } else {
      /* Variable index array dereference.  It eats the "vec4" of the
       * base of the array and an index that offsets the Mesa register
       * index.
       */
      ir->array_index->accept(this);

      src_reg index_reg;

      if (array_stride == 1) {
         index_reg = this->result;
      } else {
         index_reg = src_reg(this, glsl_type::int_type);

         emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
      }

      if (src.reladdr) {
         src_reg temp = src_reg(this, glsl_type::int_type);

         emit(ADD(dst_reg(temp), *src.reladdr, index_reg));

         index_reg = temp;
      }

      src.reladdr = ralloc(mem_ctx, src_reg);
      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
   }

   /* If the type is smaller than a vec4, replicate the last channel out. */
   if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
      src.swizzle = swizzle_for_size(ir->type->vector_elements);
   else
      src.swizzle = BRW_SWIZZLE_NOOP;
   src.type = brw_type_for_base_type(ir->type);

   this->result = src;
}

void
vec4_visitor::visit(ir_dereference_record *ir)
{
   unsigned int i;
   const glsl_type *struct_type = ir->record->type;
   int offset = 0;

   ir->record->accept(this);

   for (i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
         break;
      offset += type_size(struct_type->fields.structure[i].type);
   }

   /* If the type is smaller than a vec4, replicate the last channel out. */
   if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
      this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
   else
      this->result.swizzle = BRW_SWIZZLE_NOOP;
   this->result.type = brw_type_for_base_type(ir->type);

   this->result.reg_offset += offset;
}

/**
 * We want to be careful in assignment setup to hit the actual storage
 * instead of potentially using a temporary like we might with the
 * ir_dereference handler.
 */
static dst_reg
get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
{
   /* The LHS must be a dereference.  If the LHS is a variable indexed array
    * access of a vector, it must be separated into a series of conditional
    * moves before reaching this point (see ir_vec_index_to_cond_assign).
    */
   assert(ir->as_dereference());
   ir_dereference_array *deref_array = ir->as_dereference_array();
   if (deref_array) {
      assert(!deref_array->array->type->is_vector());
   }

   /* Use the rvalue deref handler for the most part.  We'll ignore
    * swizzles in it and write swizzles using writemask, though.
    */
   ir->accept(v);
   return dst_reg(v->result);
}

void
vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
                              const struct glsl_type *type,
                              enum brw_predicate predicate)
{
   if (type->base_type == GLSL_TYPE_STRUCT) {
      for (unsigned int i = 0; i < type->length; i++) {
         emit_block_move(dst, src, type->fields.structure[i].type, predicate);
      }
      return;
   }

   if (type->is_array()) {
      for (unsigned int i = 0; i < type->length; i++) {
         emit_block_move(dst, src, type->fields.array, predicate);
      }
      return;
   }

   if (type->is_matrix()) {
      const struct glsl_type *vec_type;

      vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                         type->vector_elements, 1);

      for (int i = 0; i < type->matrix_columns; i++) {
         emit_block_move(dst, src, vec_type, predicate);
      }
      return;
   }

   assert(type->is_scalar() || type->is_vector());

   dst->type = brw_type_for_base_type(type);
   src->type = dst->type;

   dst->writemask = (1 << type->vector_elements) - 1;

   src->swizzle = swizzle_for_size(type->vector_elements);

   vec4_instruction *inst = emit(MOV(*dst, *src));
   inst->predicate = predicate;

   dst->reg_offset++;
   src->reg_offset++;
}
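
/* Illustrative sketch (not in the original source): for a mat2 source, the
 * matrix branch above recurses once per column with a vec2 vec_type, so the
 * emitted sequence is two predicated MOVs with writemask .xy, each one
 * advancing dst->reg_offset and src->reg_offset by a full vec4 register.
 */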


/* If the RHS processing resulted in an instruction generating a
 * temporary value, and it would be easy to rewrite the instruction to
 * generate its result right into the LHS instead, do so.  This ends
 * up reliably removing instructions where it can be tricky to do so
 * later without real UD chain information.
 */
bool
vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
                                     dst_reg dst,
                                     src_reg src,
                                     vec4_instruction *pre_rhs_inst,
                                     vec4_instruction *last_rhs_inst)
{
   /* This could be supported, but it would take more smarts. */
   if (ir->condition)
      return false;

   if (pre_rhs_inst == last_rhs_inst)
      return false; /* No instructions generated to work with. */

   /* Make sure the last instruction generated our source reg. */
   if (src.file != GRF ||
       src.file != last_rhs_inst->dst.file ||
       src.reg != last_rhs_inst->dst.reg ||
       src.reg_offset != last_rhs_inst->dst.reg_offset ||
       src.reladdr ||
       src.abs ||
       src.negate ||
       last_rhs_inst->predicate != BRW_PREDICATE_NONE)
      return false;

   /* Check that the last instruction fully initialized the channels
    * we want to use, in the order we want to use them.  We could
    * potentially reswizzle the operands of many instructions so that
    * we could handle out of order channels, but don't yet.
    */

   for (unsigned i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i)) {
         if (!(last_rhs_inst->dst.writemask & (1 << i)))
            return false;

         if (BRW_GET_SWZ(src.swizzle, i) != i)
            return false;
      }
   }

   /* Success!  Rewrite the instruction. */
   last_rhs_inst->dst.file = dst.file;
   last_rhs_inst->dst.reg = dst.reg;
   last_rhs_inst->dst.reg_offset = dst.reg_offset;
   last_rhs_inst->dst.reladdr = dst.reladdr;
   last_rhs_inst->dst.writemask &= dst.writemask;

   return true;
}
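
/* Worked example (illustrative): for GLSL `v.xy = a + b;` with vec2
 * operands, the RHS produces `ADD tmp, a, b` followed by what would be
 * `MOV v.xy, tmp`.  Since the ADD wrote exactly the channels the MOV reads,
 * in order, the rewrite retargets it to `ADD v.xy, a, b` and the copy is
 * never emitted.
 */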

void
vec4_visitor::visit(ir_assignment *ir)
{
   dst_reg dst = get_assignment_lhs(ir->lhs, this);
   enum brw_predicate predicate = BRW_PREDICATE_NONE;

   if (!ir->lhs->type->is_scalar() &&
       !ir->lhs->type->is_vector()) {
      ir->rhs->accept(this);
      src_reg src = this->result;

      if (ir->condition) {
         emit_bool_to_cond_code(ir->condition, &predicate);
      }

      /* emit_block_move doesn't account for swizzles in the source register.
       * This should be ok, since the source register is a structure or an
       * array, and those can't be swizzled.  But double-check to be sure.
       */
      assert(src.swizzle ==
             (ir->rhs->type->is_matrix()
              ? swizzle_for_size(ir->rhs->type->vector_elements)
              : BRW_SWIZZLE_NOOP));

      emit_block_move(&dst, &src, ir->rhs->type, predicate);
      return;
   }

   /* Now we're down to just a scalar/vector with writemasks. */
   int i;

   vec4_instruction *pre_rhs_inst, *last_rhs_inst;
   pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();

   ir->rhs->accept(this);

   last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();

   src_reg src = this->result;

   int swizzles[4];
   int first_enabled_chan = 0;
   int src_chan = 0;

   assert(ir->lhs->type->is_vector() ||
          ir->lhs->type->is_scalar());
   dst.writemask = ir->write_mask;

   for (int i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i)) {
         first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
         break;
      }
   }

   /* Swizzle a small RHS vector into the channels being written.
    *
    * glsl ir treats write_mask as dictating how many channels are
    * present on the RHS while in our instructions we need to make
    * those channels appear in the slots of the vec4 they're written to.
    */
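   /* Worked example (illustrative): for `v.xz = u.yw;` write_mask is .xz and
    * the RHS swizzle holds y and w in its first two slots.  The loop below
    * produces swizzles[] = {y, y, w, y}: enabled channels consume RHS slots
    * in order, and disabled channels just replicate the first enabled one.
    */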
   for (int i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i))
         swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
      else
         swizzles[i] = first_enabled_chan;
   }
   src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
                              swizzles[2], swizzles[3]);

   if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
      return;
   }

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition, &predicate);
   }

   for (i = 0; i < type_size(ir->lhs->type); i++) {
      vec4_instruction *inst = emit(MOV(dst, src));
      inst->predicate = predicate;

      dst.reg_offset++;
      src.reg_offset++;
   }
}

void
vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
{
   if (ir->type->base_type == GLSL_TYPE_STRUCT) {
      foreach_in_list(ir_constant, field_value, &ir->components) {
         emit_constant_values(dst, field_value);
      }
      return;
   }

   if (ir->type->is_array()) {
      for (unsigned int i = 0; i < ir->type->length; i++) {
         emit_constant_values(dst, ir->array_elements[i]);
      }
      return;
   }

   if (ir->type->is_matrix()) {
      for (int i = 0; i < ir->type->matrix_columns; i++) {
         float *vec = &ir->value.f[i * ir->type->vector_elements];

         for (int j = 0; j < ir->type->vector_elements; j++) {
            dst->writemask = 1 << j;
            dst->type = BRW_REGISTER_TYPE_F;

            emit(MOV(*dst, src_reg(vec[j])));
         }
         dst->reg_offset++;
      }
      return;
   }

   int remaining_writemask = (1 << ir->type->vector_elements) - 1;

   for (int i = 0; i < ir->type->vector_elements; i++) {
      if (!(remaining_writemask & (1 << i)))
         continue;

      dst->writemask = 1 << i;
      dst->type = brw_type_for_base_type(ir->type);

      /* Find other components that match the one we're about to
       * write.  Emits fewer instructions for things like vec4(0.5,
       * 1.5, 1.5, 1.5).
       */
      for (int j = i + 1; j < ir->type->vector_elements; j++) {
         if (ir->type->base_type == GLSL_TYPE_BOOL) {
            if (ir->value.b[i] == ir->value.b[j])
               dst->writemask |= (1 << j);
         } else {
            /* u, i, and f storage all line up, so no need for a
             * switch case for comparing each type.
             */
            if (ir->value.u[i] == ir->value.u[j])
               dst->writemask |= (1 << j);
         }
      }

      switch (ir->type->base_type) {
      case GLSL_TYPE_FLOAT:
         emit(MOV(*dst, src_reg(ir->value.f[i])));
         break;
      case GLSL_TYPE_INT:
         emit(MOV(*dst, src_reg(ir->value.i[i])));
         break;
      case GLSL_TYPE_UINT:
         emit(MOV(*dst, src_reg(ir->value.u[i])));
         break;
      case GLSL_TYPE_BOOL:
         emit(MOV(*dst,
                  src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
                                              : 0)));
         break;
      default:
         unreachable("Non-float/uint/int/bool constant");
      }

      remaining_writemask &= ~dst->writemask;
   }
   dst->reg_offset++;
}
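
/* Illustrative sketch of the compaction mentioned above: for
 * vec4(0.5, 1.5, 1.5, 1.5), the inner scan folds the three matching
 * components into one writemask, so only two MOVs are emitted -- one with
 * writemask .x for 0.5 and one with .yzw for 1.5 -- instead of four
 * single-channel moves.
 */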

void
vec4_visitor::visit(ir_constant *ir)
{
   dst_reg dst = dst_reg(this, ir->type);
   this->result = src_reg(dst);

   emit_constant_values(&dst, ir);
}

void
vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
{
   ir_dereference *deref = static_cast<ir_dereference *>(
      ir->actual_parameters.get_head());
   ir_variable *location = deref->variable_referenced();
   unsigned surf_index = (prog_data->base.binding_table.abo_start +
                          location->data.atomic.buffer_index);

   /* Calculate the surface offset */
   src_reg offset(this, glsl_type::uint_type);
   ir_dereference_array *deref_array = deref->as_dereference_array();
   if (deref_array) {
      deref_array->array_index->accept(this);

      src_reg tmp(this, glsl_type::uint_type);
      emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
      emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
   } else {
      offset = location->data.atomic.offset;
   }

   /* Emit the appropriate machine instruction */
   const char *callee = ir->callee->function_name();
   dst_reg dst = get_assignment_lhs(ir->return_deref, this);

   if (!strcmp("__intrinsic_atomic_read", callee)) {
      emit_untyped_surface_read(surf_index, dst, offset);

   } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
      emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
                          src_reg(), src_reg());

   } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
      emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
                          src_reg(), src_reg());
   }
}

void
vec4_visitor::visit(ir_call *ir)
{
   const char *callee = ir->callee->function_name();

   if (!strcmp("__intrinsic_atomic_read", callee) ||
       !strcmp("__intrinsic_atomic_increment", callee) ||
       !strcmp("__intrinsic_atomic_predecrement", callee)) {
      visit_atomic_counter_intrinsic(ir);
   } else {
      unreachable("Unsupported intrinsic.");
   }
}

src_reg
vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
{
   vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
   inst->base_mrf = 2;
   inst->mlen = 1;
   inst->dst = dst_reg(this, glsl_type::uvec4_type);
   inst->dst.writemask = WRITEMASK_XYZW;

   inst->src[1] = sampler;

   /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
   int param_base = inst->base_mrf;
   int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
   int zero_mask = 0xf & ~coord_mask;

   emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
            coordinate));

   emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
            src_reg(0)));

   emit(inst);
   return src_reg(inst->dst);
}

static bool
is_high_sampler(struct brw_context *brw, src_reg sampler)
{
   if (brw->gen < 8 && !brw->is_haswell)
      return false;

   return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
}

void
vec4_visitor::visit(ir_texture *ir)
{
   uint32_t sampler =
      _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);

   ir_rvalue *nonconst_sampler_index =
      _mesa_get_sampler_array_nonconst_index(ir->sampler);

   /* Handle non-constant sampler array indexing */
   src_reg sampler_reg;
   if (nonconst_sampler_index) {
      /* The highest sampler which may be used by this operation is
       * the last element of the array.  Mark it here, because the generator
       * doesn't have enough information to determine the bound.
       */
      uint32_t array_size = ir->sampler->as_dereference_array()
         ->array->type->array_size();

      uint32_t max_used = sampler + array_size - 1;
      if (ir->op == ir_tg4 && brw->gen < 8) {
         max_used += prog_data->base.binding_table.gather_texture_start;
      } else {
         max_used += prog_data->base.binding_table.texture_start;
      }

      brw_mark_surface_used(&prog_data->base, max_used);

      /* Emit code to evaluate the actual indexing expression */
      nonconst_sampler_index->accept(this);
      dst_reg temp(this, glsl_type::uint_type);
      emit(ADD(temp, this->result, src_reg(sampler)))
         ->force_writemask_all = true;
      sampler_reg = src_reg(temp);
   } else {
      /* Single sampler, or constant array index; the indexing expression
       * is just an immediate.
       */
      sampler_reg = src_reg(sampler);
   }

   /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
    * emitting anything other than setting up the constant result.
    */
   if (ir->op == ir_tg4) {
      ir_constant *chan = ir->lod_info.component->as_constant();
      int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
      if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
         dst_reg result(this, ir->type);
         this->result = src_reg(result);
         emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
         return;
      }
   }

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   /* Should be lowered */
   assert(!ir->offset || !ir->offset->type->is_array());

   /* Generate code to compute all the subexpression trees.  This has to be
    * done before loading any values into MRFs for the sampler message since
    * generating these values may involve SEND messages that need the MRFs.
    */
   src_reg coordinate;
   if (ir->coordinate) {
      ir->coordinate->accept(this);
      coordinate = this->result;
   }

   src_reg shadow_comparitor;
   if (ir->shadow_comparitor) {
      ir->shadow_comparitor->accept(this);
      shadow_comparitor = this->result;
   }

   bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
   src_reg offset_value;
   if (has_nonconstant_offset) {
      ir->offset->accept(this);
      offset_value = src_reg(this->result);
   }

   const glsl_type *lod_type = NULL, *sample_index_type = NULL;
   src_reg lod, dPdx, dPdy, sample_index, mcs;
   switch (ir->op) {
   case ir_tex:
      lod = src_reg(0.0f);
      lod_type = glsl_type::float_type;
      break;
   case ir_txf:
   case ir_txl:
   case ir_txs:
      ir->lod_info.lod->accept(this);
      lod = this->result;
      lod_type = ir->lod_info.lod->type;
      break;
   case ir_query_levels:
      lod = src_reg(0);
      lod_type = glsl_type::int_type;
      break;
   case ir_txf_ms:
      ir->lod_info.sample_index->accept(this);
      sample_index = this->result;
      sample_index_type = ir->lod_info.sample_index->type;

      if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
         mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
      else
         mcs = src_reg(0u);
      break;
   case ir_txd:
      ir->lod_info.grad.dPdx->accept(this);
      dPdx = this->result;

      ir->lod_info.grad.dPdy->accept(this);
      dPdy = this->result;

      lod_type = ir->lod_info.grad.dPdx->type;
      break;
   case ir_txb:
   case ir_lod:
   case ir_tg4:
      break;
   }

   enum opcode opcode;
   switch (ir->op) {
   case ir_tex: opcode = SHADER_OPCODE_TXL; break;
   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
   case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
   case ir_tg4: opcode = has_nonconstant_offset
                         ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
   case ir_txb:
      unreachable("TXB is not valid for vertex shaders.");
   case ir_lod:
      unreachable("LOD is not valid for vertex shaders.");
   default:
      unreachable("Unrecognized tex op");
   }

   vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);

   if (ir->offset != NULL && ir->op != ir_txf)
      inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());

   /* Stuff the channel select bits in the top of the texture offset */
   if (ir->op == ir_tg4)
      inst->texture_offset |= gather_channel(ir, sampler) << 16;

   /* The message header is necessary for:
    * - Gen4 (always)
    * - Texel offsets
    * - Gather channel selection
    * - Sampler indices too large to fit in a 4-bit value.
    */
   inst->header_present =
      brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
      is_high_sampler(brw, sampler_reg);
   inst->base_mrf = 2;
   inst->mlen = inst->header_present + 1; /* always at least one */
   inst->dst = dst_reg(this, ir->type);
   inst->dst.writemask = WRITEMASK_XYZW;
   inst->shadow_compare = ir->shadow_comparitor != NULL;

   inst->src[1] = sampler_reg;

   /* MRF for the first parameter */
   int param_base = inst->base_mrf + inst->header_present;

   if (ir->op == ir_txs || ir->op == ir_query_levels) {
      int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
      emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
   } else {
      /* Load the coordinate */
      /* FINISHME: gl_clamp_mask and saturate */
      int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
      int zero_mask = 0xf & ~coord_mask;

      emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
               coordinate));

      if (zero_mask != 0) {
         emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
                  src_reg(0)));
      }
      /* Load the shadow comparitor */
      if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
         emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
                          WRITEMASK_X),
                  shadow_comparitor));
         inst->mlen++;
      }

      /* Load the LOD info */
      if (ir->op == ir_tex || ir->op == ir_txl) {
         int mrf, writemask;
         if (brw->gen >= 5) {
            mrf = param_base + 1;
            if (ir->shadow_comparitor) {
               writemask = WRITEMASK_Y;
               /* mlen already incremented */
            } else {
               writemask = WRITEMASK_X;
               inst->mlen++;
            }
         } else /* brw->gen == 4 */ {
            mrf = param_base;
            writemask = WRITEMASK_W;
         }
         emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
      } else if (ir->op == ir_txf) {
         emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
      } else if (ir->op == ir_txf_ms) {
         emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
                  sample_index));
         if (brw->gen >= 7) {
            /* MCS data is in the first channel of `mcs`, but we need to get
             * it into the .y channel of the second vec4 of params, so
             * replicate .x across the whole vec4 and then mask off
             * everything except .y.
             */
            mcs.swizzle = BRW_SWIZZLE_XXXX;
            emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type,
                             WRITEMASK_Y),
                     mcs));
         }
         inst->mlen++;
      } else if (ir->op == ir_txd) {
         const glsl_type *type = lod_type;

         if (brw->gen >= 5) {
            dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
            inst->mlen++;

            if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
               dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
               dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
               inst->mlen++;

               if (ir->shadow_comparitor) {
                  emit(MOV(dst_reg(MRF, param_base + 2,
                                   ir->shadow_comparitor->type, WRITEMASK_Z),
                           shadow_comparitor));
               }
            }
         } else /* brw->gen == 4 */ {
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
            emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
            inst->mlen += 2;
         }
      } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
         if (ir->shadow_comparitor) {
            emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
                     shadow_comparitor));
         }

         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
                  offset_value));
         inst->mlen++;
      }
   }

   emit(inst);

   /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
    * spec requires layers.
    */
   if (ir->op == ir_txs) {
      glsl_type const *type = ir->sampler->type;
      if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
          type->sampler_array) {
         emit_math(SHADER_OPCODE_INT_QUOTIENT,
                   writemask(inst->dst, WRITEMASK_Z),
                   src_reg(inst->dst), src_reg(6));
      }
   }

   if (brw->gen == 6 && ir->op == ir_tg4) {
      emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
   }

   swizzle_result(ir, src_reg(inst->dst), sampler);
}

/**
 * Apply workarounds for Gen6 gather with UINT/SINT
 */
void
vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
{
   if (!wa)
      return;

   int width = (wa & WA_8BIT) ? 8 : 16;
   dst_reg dst_f = dst;
   dst_f.type = BRW_REGISTER_TYPE_F;

   /* Convert from UNORM to UINT */
   emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
   emit(MOV(dst, src_reg(dst_f)));

   if (wa & WA_SIGN) {
      /* Reinterpret the UINT value as a signed INT value by
       * shifting the sign bit into place, then shifting back
       * preserving sign.
       */
      emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
      emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
   }
}
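
/* Worked example (illustrative): with an 8-bit format, width is 8, so a
 * gathered UNORM value of 1.0 becomes 255 after the MUL/MOV conversion.
 * In the signed case, SHL by 24 moves bit 7 into the sign position
 * (0xff000000) and the arithmetic ASR by 24 sign-extends it back down,
 * yielding -1 as expected for SINT data.
 */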

/**
 * Set up the gather channel based on the swizzle, for gather4.
 */
uint32_t
vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
{
   ir_constant *chan = ir->lod_info.component->as_constant();
   int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
   switch (swiz) {
   case SWIZZLE_X: return 0;
   case SWIZZLE_Y:
      /* gather4 sampler is broken for green channel on RG32F --
       * we must ask for blue instead.
       */
      if (key->tex.gather_channel_quirk_mask & (1<<sampler))
         return 2;
      return 1;
   case SWIZZLE_Z: return 2;
   case SWIZZLE_W: return 3;
   default:
      unreachable("Not reached"); /* zero, one swizzles handled already */
   }
}

void
vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
{
   int s = key->tex.swizzles[sampler];

   this->result = src_reg(this, ir->type);
   dst_reg swizzled_result(this->result);

   if (ir->op == ir_query_levels) {
      /* # levels is in .w */
      orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
      emit(MOV(swizzled_result, orig_val));
      return;
   }

   if (ir->op == ir_txs || ir->type == glsl_type::float_type
       || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
      emit(MOV(swizzled_result, orig_val));
      return;
   }

   int zero_mask = 0, one_mask = 0, copy_mask = 0;
   int swizzle[4] = {0};

   for (int i = 0; i < 4; i++) {
      switch (GET_SWZ(s, i)) {
      case SWIZZLE_ZERO:
         zero_mask |= (1 << i);
         break;
      case SWIZZLE_ONE:
         one_mask |= (1 << i);
         break;
      default:
         copy_mask |= (1 << i);
         swizzle[i] = GET_SWZ(s, i);
         break;
      }
   }

   if (copy_mask) {
      orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
      swizzled_result.writemask = copy_mask;
      emit(MOV(swizzled_result, orig_val));
   }

   if (zero_mask) {
      swizzled_result.writemask = zero_mask;
      emit(MOV(swizzled_result, src_reg(0.0f)));
   }

   if (one_mask) {
      swizzled_result.writemask = one_mask;
      emit(MOV(swizzled_result, src_reg(1.0f)));
   }
}
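
/* Worked example (illustrative): a texture swizzle of (R, ZERO, ONE, G)
 * decomposes into copy_mask .xw with source swizzle .xxxy (unused slots
 * default to x), zero_mask .y, and one_mask .z, so at most three MOVs are
 * emitted: the channel copy, a 0.0f fill, and a 1.0f fill.
 */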

void
vec4_visitor::visit(ir_return *)
{
   unreachable("not reached");
}

void
vec4_visitor::visit(ir_discard *)
{
   unreachable("not reached");
}

void
vec4_visitor::visit(ir_if *ir)
{
   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   if (brw->gen == 6) {
      emit_if_gen6(ir);
   } else {
      enum brw_predicate predicate;
      emit_bool_to_cond_code(ir->condition, &predicate);
      emit(IF(predicate));
   }

   visit_instructions(&ir->then_instructions);

   if (!ir->else_instructions.is_empty()) {
      this->base_ir = ir->condition;
      emit(BRW_OPCODE_ELSE);

      visit_instructions(&ir->else_instructions);
   }

   this->base_ir = ir->condition;
   emit(BRW_OPCODE_ENDIF);
}

void
vec4_visitor::visit(ir_emit_vertex *)
{
   unreachable("not reached");
}

void
vec4_visitor::visit(ir_end_primitive *)
{
   unreachable("not reached");
}

void
vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
                                  dst_reg dst, src_reg offset,
                                  src_reg src0, src_reg src1)
{
   unsigned mlen = 0;

   /* Set the atomic operation offset. */
   emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
   mlen++;

   /* Set the atomic operation arguments. */
   if (src0.file != BAD_FILE) {
      emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
      mlen++;
   }

   if (src1.file != BAD_FILE) {
      emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
      mlen++;
   }

   /* Emit the instruction.  Note that this maps to the normal SIMD8
    * untyped atomic message on Ivy Bridge, but that's OK because
    * unused channels will be masked out.
    */
   vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
                                 src_reg(atomic_op), src_reg(surf_index));
   inst->base_mrf = 0;
   inst->mlen = mlen;
}

void
vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
                                        src_reg offset)
{
   /* Set the surface read offset. */
   emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));

   /* Emit the instruction.  Note that this maps to the normal SIMD8
    * untyped surface read message, but that's OK because unused
    * channels will be masked out.
    */
   vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
                                 dst, src_reg(surf_index));
   inst->base_mrf = 0;
   inst->mlen = 1;
}

void
vec4_visitor::emit_ndc_computation()
{
   /* Get the position */
   src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);

   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
   output_reg[BRW_VARYING_SLOT_NDC] = ndc;

   current_annotation = "NDC";
   dst_reg ndc_w = ndc;
   ndc_w.writemask = WRITEMASK_W;
   src_reg pos_w = pos;
   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);

   dst_reg ndc_xyz = ndc;
   ndc_xyz.writemask = WRITEMASK_XYZ;

   emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
}
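
/* A sketch of the sequence this emits (illustrative):
 *
 *    RCP ndc.w,   pos.wwww    // 1/w via the hardware math unit
 *    MUL ndc.xyz, pos, ndc.w  // (x/w, y/w, z/w)
 *
 * so the NDC register ends up holding (x/w, y/w, z/w, 1/w), as the comment
 * above describes.
 */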

void
vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
{
   if (brw->gen < 6 &&
       ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
        key->userclip_active || brw->has_negative_rhw_bug)) {
      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
      dst_reg header1_w = header1;
      header1_w.writemask = WRITEMASK_W;

      emit(MOV(header1, 0u));

      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);

         current_annotation = "Point size";
         emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
         emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
      }

      if (key->userclip_active) {
         current_annotation = "Clipping flags";
         dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
         dst_reg flags1 = dst_reg(this, glsl_type::uint_type);

         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));

         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
         emit(SHL(flags1, src_reg(flags1), src_reg(4)));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (brw->has_negative_rhw_bug) {
         src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
         ndc_w.swizzle = BRW_SWIZZLE_WWWW;
         emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
         vec4_instruction *inst;
         inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
         inst->predicate = BRW_PREDICATE_NORMAL;
         inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }

      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
   } else if (brw->gen < 6) {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
   } else {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         emit(MOV(brw_writemask(reg, WRITEMASK_W),
                  src_reg(output_reg[VARYING_SLOT_PSIZ])));
      }
      if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
         emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
                  src_reg(output_reg[VARYING_SLOT_LAYER])));
      }
      if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
         emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
                  src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
      }
   }
}

void
vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
{
   /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
    *
    *     "If a linked set of shaders forming the vertex stage contains no
    *     static write to gl_ClipVertex or gl_ClipDistance, but the
    *     application has requested clipping against user clip planes through
    *     the API, then the coordinate written to gl_Position is used for
    *     comparison against the user clip planes."
    *
    * This function is only called if the shader didn't write to
    * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
    * if the user wrote to it; otherwise we use gl_Position.
    */
   gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
   if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
      clip_vertex = VARYING_SLOT_POS;
   }

   for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
        ++i) {
      reg.writemask = 1 << i;
      emit(DP4(reg,
               src_reg(output_reg[clip_vertex]),
               src_reg(this->userplane[i + offset])));
   }
}

void
vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
{
   assert(varying < VARYING_SLOT_MAX);
   reg.type = output_reg[varying].type;
   current_annotation = output_reg_annotation[varying];
   /* Copy the register, saturating if necessary */
   vec4_instruction *inst = emit(MOV(reg,
                                     src_reg(output_reg[varying])));
   if ((varying == VARYING_SLOT_COL0 ||
        varying == VARYING_SLOT_COL1 ||
        varying == VARYING_SLOT_BFC0 ||
        varying == VARYING_SLOT_BFC1) &&
       key->clamp_vertex_color) {
      inst->saturate = true;
   }
}

void
vec4_visitor::emit_urb_slot(int mrf, int varying)
{
   struct brw_reg hw_reg = brw_message_reg(mrf);
   dst_reg reg = dst_reg(MRF, mrf);
   reg.type = BRW_REGISTER_TYPE_F;

   switch (varying) {
   case VARYING_SLOT_PSIZ:
      /* PSIZ is always in slot 0, and is coupled with other flags. */
      current_annotation = "indices, point width, clip flags";
      emit_psiz_and_flags(hw_reg);
      break;
   case BRW_VARYING_SLOT_NDC:
      current_annotation = "NDC";
      emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
      break;
   case VARYING_SLOT_POS:
      current_annotation = "gl_Position";
      emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
      break;
   case VARYING_SLOT_EDGE:
      /* This is present when doing unfilled polygons.  We're supposed to copy
       * the edge flag from the user-provided vertex array
       * (glEdgeFlagPointer), or otherwise we'll copy from the current value
       * of that attribute (starts as 1.0f).  This is then used in clipping to
       * determine which edges should be drawn as wireframe.
       */
      current_annotation = "edge flag";
      emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
                                    glsl_type::float_type, WRITEMASK_XYZW))));
      break;
   case BRW_VARYING_SLOT_PAD:
      /* No need to write to this slot */
      break;
   default:
      emit_generic_urb_slot(reg, varying);
      break;
   }
}

static int
align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
{
   if (brw->gen >= 6) {
      /* URB data written (does not include the message header reg) must
       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
       * section 5.4.3.2.2: URB_INTERLEAVED.
       *
       * URB entries are allocated on a multiple of 1024 bits, so an
       * extra 128 bits written here to make the end align to 256 is
       * no problem.
       */
      if ((mlen % 2) != 1)
         mlen++;
   }

   return mlen;
}
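
/* Worked example (illustrative): mlen includes the one header register, so
 * the URB data payload is mlen - 1 registers and must round up to an even
 * count, i.e. mlen must be odd.  An mlen of 4 (header + 3 data regs)
 * therefore becomes 5, padding the data to 4 registers, while an
 * already-odd mlen passes through unchanged.
 */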


/**
 * Generates the VUE payload plus the necessary URB write instructions to
 * output it.
 *
 * The VUE layout is documented in Volume 2a.
 */
void
vec4_visitor::emit_vertex()
{
   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;
   int mrf = base_mrf;
   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array.  Those
    * reads would use MRFs 14-15.
    */
   int max_usable_mrf = 13;

   /* The following assertion verifies that max_usable_mrf causes an
    * even-numbered amount of URB write data, which will meet gen6's
    * requirements for length alignment.
    */
   assert((max_usable_mrf - base_mrf) % 2 == 0);

   /* First mrf is the g0-based message header containing URB handles and
    * such.
    */
   emit_urb_write_header(mrf++);

   if (brw->gen < 6) {
      emit_ndc_computation();
   }

   /* Lower legacy ff and ClipVertex clipping to clip distances */
   if (key->userclip_active && !prog->UsesClipDistanceOut) {
      current_annotation = "user clip distances";

      output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
      output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);

      emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
      emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
   }

   /* We may need to split this up into several URB writes, so do them in a
    * loop.
    */
   int slot = 0;
   bool complete = false;
   do {
      /* URB offset is in URB row increments, and each of our MRFs is half of
       * one of those, since we're doing interleaved writes.
       */
      int offset = slot / 2;

      mrf = base_mrf + 1;
      for (; slot < prog_data->vue_map.num_slots; ++slot) {
         emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);

         /* If this was max_usable_mrf, we can't fit anything more into this
          * URB WRITE.
          */
         if (mrf > max_usable_mrf) {
            slot++;
            break;
         }
      }

      complete = slot >= prog_data->vue_map.num_slots;
      current_annotation = "URB write";
      vec4_instruction *inst = emit_urb_write_opcode(complete);
      inst->base_mrf = base_mrf;
      inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
      inst->offset += offset;
   } while (!complete);
}


src_reg
vec4_visitor::get_scratch_offset(vec4_instruction *inst,
                                 src_reg *reladdr, int reg_offset)
{
   /* Because we store the values to scratch interleaved like our
    * vertex data, we need to scale the vec4 index by 2.
    */
   int message_header_scale = 2;

   /* Pre-gen6, the message header uses byte offsets instead of vec4
    * (16-byte) offset units.
    */
   if (brw->gen < 6)
      message_header_scale *= 16;

   if (reladdr) {
      src_reg index = src_reg(this, glsl_type::int_type);

      emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
      emit_before(inst, MUL(dst_reg(index),
                            index, src_reg(message_header_scale)));

      return index;
   } else {
      return src_reg(reg_offset * message_header_scale);
   }
}
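
/* Worked example (illustrative): for vec4 slot 3 with no reladdr, gen6+
 * returns the immediate 6 (3 * 2 interleaved rows), while pre-gen6 returns
 * 96 (3 * 2 * 16 bytes).  With a reladdr, the same scaling happens at
 * runtime via the emitted ADD and MUL.
 */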

src_reg
vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
                                       src_reg *reladdr, int reg_offset)
{
   if (reladdr) {
      src_reg index = src_reg(this, glsl_type::int_type);

      emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));

      /* Pre-gen6, the message header uses byte offsets instead of vec4
       * (16-byte) offset units.
       */
      if (brw->gen < 6) {
         emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
      }

      return index;
   } else if (brw->gen >= 8) {
      /* Store the offset in a GRF so we can send-from-GRF. */
      src_reg offset = src_reg(this, glsl_type::int_type);
      emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
      return offset;
   } else {
      int message_header_scale = brw->gen < 6 ? 16 : 1;
      return src_reg(reg_offset * message_header_scale);
   }
}

/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from scratch space at @base_offset to @temp.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_read(vec4_instruction *inst,
                                dst_reg temp, src_reg orig_src,
                                int base_offset)
{
   int reg_offset = base_offset + orig_src.reg_offset;
   src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);

   emit_before(inst, SCRATCH_READ(temp, index));
}

/**
 * Emits an instruction after @inst to store the value to be written
 * to @orig_dst to scratch space at @base_offset, from @temp.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
{
   int reg_offset = base_offset + inst->dst.reg_offset;
   src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);

   /* Create a temporary register to store *inst's result in.
    *
    * We have to be careful in MOVing from our temporary result register in
    * the scratch write.  If we swizzle from channels of the temporary that
    * weren't initialized, it will confuse live interval analysis, which will
    * make spilling fail to make progress.
    */
   src_reg temp = src_reg(this, glsl_type::vec4_type);
   temp.type = inst->dst.type;
   int first_writemask_chan = ffs(inst->dst.writemask) - 1;
   int swizzles[4];
   for (int i = 0; i < 4; i++)
      if (inst->dst.writemask & (1 << i))
         swizzles[i] = i;
      else
         swizzles[i] = first_writemask_chan;
   temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
                               swizzles[2], swizzles[3]);

   dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
                                       inst->dst.writemask));
   vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
   write->predicate = inst->predicate;
   write->ir = inst->ir;
   write->annotation = inst->annotation;
   inst->insert_after(write);

   inst->dst.file = temp.file;
   inst->dst.reg = temp.reg;
   inst->dst.reg_offset = temp.reg_offset;
   inst->dst.reladdr = NULL;
}
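
/* Worked example (illustrative): if the spilled instruction wrote .yw, then
 * first_writemask_chan is 1 and swizzles[] becomes {1, 1, 1, 3}, so the
 * SCRATCH_WRITE reads only the initialized channels (as .yyyw) and never
 * touches the undefined x and z channels of the temporary.
 */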

/**
 * We can't generally support array access in GRF space, because a
 * single instruction's destination can only span 2 contiguous
 * registers.  So, we send all GRF arrays that get variable index
 * access to scratch space.
 */
void
vec4_visitor::move_grf_array_access_to_scratch()
{
   int scratch_loc[this->virtual_grf_count];

   for (int i = 0; i < this->virtual_grf_count; i++) {
      scratch_loc[i] = -1;
   }

   /* First, calculate the set of virtual GRFs that need to be punted
    * to scratch due to having any array access on them, and where in
    * scratch.
    */
   foreach_in_list(vec4_instruction, inst, &instructions) {
      if (inst->dst.file == GRF && inst->dst.reladdr &&
          scratch_loc[inst->dst.reg] == -1) {
         scratch_loc[inst->dst.reg] = c->last_scratch;
         c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
      }

      for (int i = 0; i < 3; i++) {
         src_reg *src = &inst->src[i];

         if (src->file == GRF && src->reladdr &&
             scratch_loc[src->reg] == -1) {
            scratch_loc[src->reg] = c->last_scratch;
            c->last_scratch += this->virtual_grf_sizes[src->reg];
         }
      }
   }

   /* Now, for anything that will be accessed through scratch, rewrite
    * it to load/store.  Note that this is a _safe list walk, because
    * we may generate a new scratch_write instruction after the one
    * we're processing.
    */
   foreach_in_list_safe(vec4_instruction, inst, &instructions) {
      /* Set up the annotation tracking for new generated instructions. */
      base_ir = inst->ir;
      current_annotation = inst->annotation;

      if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
         emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
      }

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
            continue;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_scratch_read(inst, temp, inst->src[i],
                           scratch_loc[inst->src[i].reg]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from the pull constant buffer (surface) at @base_offset to @temp.
 */
void
vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
                                      dst_reg temp, src_reg orig_src,
                                      int base_offset)
{
   int reg_offset = base_offset + orig_src.reg_offset;
   src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
   src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
   vec4_instruction *load;

   if (brw->gen >= 7) {
      dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
      grf_offset.type = offset.type;
      emit_before(inst, MOV(grf_offset, offset));

      load = new(mem_ctx) vec4_instruction(this,
                                           VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
                                           temp, index, src_reg(grf_offset));
   } else {
      load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
                                           temp, index, offset);
      load->base_mrf = 14;
      load->mlen = 1;
   }
   emit_before(inst, load);
}

/**
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
vec4_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[this->uniforms];

   for (int i = 0; i < this->uniforms; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_in_list_safe(vec4_instruction, inst, &instructions) {
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const gl_constant_value **values =
               &stage_prog_data->param[uniform * 4];

            pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;

            assert(uniform < uniform_array_size);
            for (int j = 0; j < uniform_size[uniform] * 4; j++) {
               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
                  = values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_pull_constant_load(inst, temp, inst->src[i],
                                 pull_constant_loc[uniform]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }

   /* Now there are no accesses of the UNIFORM file with a reladdr, so
    * no need to track them as larger-than-vec4 objects.  This will be
    * relied on in cutting out unused uniform vectors from push
    * constants.
    */
   split_uniform_registers();
}

void
vec4_visitor::resolve_ud_negate(src_reg *reg)
{
   if (reg->type != BRW_REGISTER_TYPE_UD ||
       !reg->negate)
      return;

   src_reg temp = src_reg(this, glsl_type::uvec4_type);
   emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
   *reg = temp;
}

vec4_visitor::vec4_visitor(struct brw_context *brw,
                           struct brw_vec4_compile *c,
                           struct gl_program *prog,
                           const struct brw_vec4_prog_key *key,
                           struct brw_vec4_prog_data *prog_data,
                           struct gl_shader_program *shader_prog,
                           gl_shader_stage stage,
                           void *mem_ctx,
                           bool debug_flag,
                           bool no_spills,
                           shader_time_shader_type st_base,
                           shader_time_shader_type st_written,
                           shader_time_shader_type st_reset)
   : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
     c(c),
     key(key),
     prog_data(prog_data),
     sanity_param_count(0),
     fail_msg(NULL),
     first_non_payload_grf(0),
     need_all_constants_in_pull_buffer(false),
     debug_flag(debug_flag),
     no_spills(no_spills),
     st_base(st_base),
     st_written(st_written),
     st_reset(st_reset)
{
   this->mem_ctx = mem_ctx;
   this->failed = false;

   this->base_ir = NULL;
   this->current_annotation = NULL;
   memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));

   this->variable_ht = hash_table_ctor(0,
                                       hash_table_pointer_hash,
                                       hash_table_pointer_compare);

   this->virtual_grf_start = NULL;
   this->virtual_grf_end = NULL;
   this->virtual_grf_sizes = NULL;
   this->virtual_grf_count = 0;
   this->virtual_grf_reg_map = NULL;
   this->virtual_grf_reg_count = 0;
   this->virtual_grf_array_size = 0;
   this->live_intervals_valid = false;

   this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;

   this->uniforms = 0;

   /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
    * at least one.  See setup_uniforms() in brw_vec4.cpp.
    */
   this->uniform_array_size = 1;
   if (prog_data) {
      this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
   }

   this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
   this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
}

vec4_visitor::~vec4_visitor()
{
   hash_table_dtor(this->variable_ht);
}


void
vec4_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (debug_flag) {
      fprintf(stderr, "%s", msg);
   }
}

} /* namespace brw */