b46879b7beba628ddbfef39859bf1151c1fad7fd
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 extern "C" {
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(vec4_visitor *v,
34 enum opcode opcode, const dst_reg &dst,
35 const src_reg &src0, const src_reg &src1,
36 const src_reg &src2)
37 {
38 this->opcode = opcode;
39 this->dst = dst;
40 this->src[0] = src0;
41 this->src[1] = src1;
42 this->src[2] = src2;
43 this->saturate = false;
44 this->force_writemask_all = false;
45 this->no_dd_clear = false;
46 this->no_dd_check = false;
47 this->writes_accumulator = false;
48 this->conditional_mod = BRW_CONDITIONAL_NONE;
49 this->texture_offset = 0;
50 this->target = 0;
51 this->shadow_compare = false;
52 this->ir = v->base_ir;
53 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
54 this->header_present = false;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = v->current_annotation;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 this->instructions.push_tail(inst);
65
66 return inst;
67 }
68
69 vec4_instruction *
70 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
71 vec4_instruction *new_inst)
72 {
73 new_inst->ir = inst->ir;
74 new_inst->annotation = inst->annotation;
75
76 inst->insert_before(block, new_inst);
77
78 return inst;
79 }
80
81 vec4_instruction *
82 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
83 src_reg src0, src_reg src1, src_reg src2)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
86 src0, src1, src2));
87 }
88
89
90 vec4_instruction *
91 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
92 {
93 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
94 }
95
96 vec4_instruction *
97 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
98 {
99 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
100 }
101
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
104 {
105 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode)
110 {
111 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
112 }
113
114 #define ALU1(op) \
115 vec4_instruction * \
116 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
117 { \
118 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
119 src0); \
120 }
121
122 #define ALU2(op) \
123 vec4_instruction * \
124 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
125 const src_reg &src1) \
126 { \
127 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
128 src0, src1); \
129 }
130
131 #define ALU2_ACC(op) \
132 vec4_instruction * \
133 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
134 const src_reg &src1) \
135 { \
136 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
137 BRW_OPCODE_##op, dst, src0, src1); \
138 inst->writes_accumulator = true; \
139 return inst; \
140 }
141
142 #define ALU3(op) \
143 vec4_instruction * \
144 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
145 const src_reg &src1, const src_reg &src2) \
146 { \
147 assert(brw->gen >= 6); \
148 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
149 src0, src1, src2); \
150 }
151
152 ALU1(NOT)
153 ALU1(MOV)
154 ALU1(FRC)
155 ALU1(RNDD)
156 ALU1(RNDE)
157 ALU1(RNDZ)
158 ALU1(F32TO16)
159 ALU1(F16TO32)
160 ALU2(ADD)
161 ALU2(MUL)
162 ALU2_ACC(MACH)
163 ALU2(AND)
164 ALU2(OR)
165 ALU2(XOR)
166 ALU2(DP3)
167 ALU2(DP4)
168 ALU2(DPH)
169 ALU2(SHL)
170 ALU2(SHR)
171 ALU2(ASR)
172 ALU3(LRP)
173 ALU1(BFREV)
174 ALU3(BFE)
175 ALU2(BFI1)
176 ALU3(BFI2)
177 ALU1(FBH)
178 ALU1(FBL)
179 ALU1(CBIT)
180 ALU3(MAD)
181 ALU2_ACC(ADDC)
182 ALU2_ACC(SUBB)
183 ALU2(MAC)
184
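/* For reference, ALU2(ADD) above expands to an emitter helper of this
 * shape (a paraphrase of the macro expansion, not additional code):
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, dst,
 *                                            src0, src1);
 *    }
 *
 * These helpers only construct the instruction; callers still wrap them in
 * emit(), e.g. emit(ADD(dst, a, b)), to append them to the instruction list.
 */
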
185 /** Gen4 predicated IF. */
186 vec4_instruction *
187 vec4_visitor::IF(enum brw_predicate predicate)
188 {
189 vec4_instruction *inst;
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
192 inst->predicate = predicate;
193
194 return inst;
195 }
196
197 /** Gen6 IF with embedded comparison. */
198 vec4_instruction *
199 vec4_visitor::IF(src_reg src0, src_reg src1,
200 enum brw_conditional_mod condition)
201 {
202 assert(brw->gen == 6);
203
204 vec4_instruction *inst;
205
206 resolve_ud_negate(&src0);
207 resolve_ud_negate(&src1);
208
209 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
210 src0, src1);
211 inst->conditional_mod = condition;
212
213 return inst;
214 }
215
216 /**
217 * CMP: Sets the low bit of the destination channels with the result
218 * of the comparison, while the upper bits are undefined, and updates
219 * the flag register with the packed 16 bits of the result.
220 */
221 vec4_instruction *
222 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
223 enum brw_conditional_mod condition)
224 {
225 vec4_instruction *inst;
226
227 /* The original gen4 hardware does type conversion to the destination
228 * type before comparison, producing garbage results for floating-point
229 * comparisons.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 vec4_instruction *
247 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
248 {
249 vec4_instruction *inst;
250
251 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
252 dst, index);
253 inst->base_mrf = 14;
254 inst->mlen = 2;
255
256 return inst;
257 }
258
259 vec4_instruction *
260 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
261 const src_reg &index)
262 {
263 vec4_instruction *inst;
264
265 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
266 dst, src, index);
267 inst->base_mrf = 13;
268 inst->mlen = 3;
269
270 return inst;
271 }
272
273 void
274 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
275 {
276 static enum opcode dot_opcodes[] = {
277 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
278 };
279
280 emit(dot_opcodes[elements - 2], dst, src0, src1);
281 }
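
/* Usage sketch (operand names are illustrative): for a GLSL dot() of two
 * vec3 values the visitor ends up calling
 *
 *    emit_dp(result_dst, op[0], op[1], 3);
 *
 * which picks BRW_OPCODE_DP3 from the table above; elements == 2 and 4
 * select DP2 and DP4 respectively.
 */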
282
283 src_reg
284 vec4_visitor::fix_3src_operand(src_reg src)
285 {
286 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
287 * able to use vertical stride of zero to replicate the vec4 uniform, like
288 *
289 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
290 *
291 * But you can't, since vertical stride is always four in three-source
292 * instructions. Instead, insert a MOV instruction to do the replication so
293 * that the three-source instruction can consume it.
294 */
295
296 /* The MOV is only needed if the source is a uniform or immediate. */
297 if (src.file != UNIFORM && src.file != IMM)
298 return src;
299
300 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(MOV(expanded, src));
306 return src_reg(expanded);
307 }
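
/* Typical use, as in try_emit_mad() below: each operand of a three-source
 * instruction is run through this fixup first, e.g.
 *
 *    src_reg src0 = fix_3src_operand(this->result);
 *    ...
 *    emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
 *
 * so that a uniform or immediate argument is replicated into a temporary
 * GRF that the MAD can legally read.
 */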
308
309 src_reg
310 vec4_visitor::fix_math_operand(src_reg src)
311 {
312 /* The gen6 math instruction ignores the source modifiers --
313 * swizzle, abs, negate, and at least some parts of the register
314 * region description.
315 *
316 * Rather than trying to enumerate all these cases, *always* expand the
317 * operand to a temp GRF for gen6.
318 *
319 * For gen7, keep the operand as-is, except if immediate, which gen7 still
320 * can't use.
321 */
322
323 if (brw->gen == 7 && src.file != IMM)
324 return src;
325
326 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
327 expanded.type = src.type;
328 emit(MOV(expanded, src));
329 return src_reg(expanded);
330 }
331
332 void
333 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
334 {
335 src = fix_math_operand(src);
336
337 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
338 /* The gen6 math instruction must be align1, so we can't do
339 * writemasks.
340 */
341 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
342
343 emit(opcode, temp_dst, src);
344
345 emit(MOV(dst, src_reg(temp_dst)));
346 } else {
347 emit(opcode, dst, src);
348 }
349 }
350
351 void
352 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
353 {
354 vec4_instruction *inst = emit(opcode, dst, src);
355 inst->base_mrf = 1;
356 inst->mlen = 1;
357 }
358
359 void
360 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
361 {
362 switch (opcode) {
363 case SHADER_OPCODE_RCP:
364 case SHADER_OPCODE_RSQ:
365 case SHADER_OPCODE_SQRT:
366 case SHADER_OPCODE_EXP2:
367 case SHADER_OPCODE_LOG2:
368 case SHADER_OPCODE_SIN:
369 case SHADER_OPCODE_COS:
370 break;
371 default:
372 unreachable("not reached: bad math opcode");
373 }
374
375 if (brw->gen >= 8) {
376 emit(opcode, dst, src);
377 } else if (brw->gen >= 6) {
378 emit_math1_gen6(opcode, dst, src);
379 } else {
380 emit_math1_gen4(opcode, dst, src);
381 }
382 }
383
384 void
385 vec4_visitor::emit_math2_gen6(enum opcode opcode,
386 dst_reg dst, src_reg src0, src_reg src1)
387 {
388 src0 = fix_math_operand(src0);
389 src1 = fix_math_operand(src1);
390
391 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
392 /* The gen6 math instruction must be align1, so we can't do
393 * writemasks.
394 */
395 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
396 temp_dst.type = dst.type;
397
398 emit(opcode, temp_dst, src0, src1);
399
400 emit(MOV(dst, src_reg(temp_dst)));
401 } else {
402 emit(opcode, dst, src0, src1);
403 }
404 }
405
406 void
407 vec4_visitor::emit_math2_gen4(enum opcode opcode,
408 dst_reg dst, src_reg src0, src_reg src1)
409 {
410 vec4_instruction *inst = emit(opcode, dst, src0, src1);
411 inst->base_mrf = 1;
412 inst->mlen = 2;
413 }
414
415 void
416 vec4_visitor::emit_math(enum opcode opcode,
417 dst_reg dst, src_reg src0, src_reg src1)
418 {
419 switch (opcode) {
420 case SHADER_OPCODE_POW:
421 case SHADER_OPCODE_INT_QUOTIENT:
422 case SHADER_OPCODE_INT_REMAINDER:
423 break;
424 default:
425 unreachable("not reached: unsupported binary math opcode");
426 }
427
428 if (brw->gen >= 8) {
429 emit(opcode, dst, src0, src1);
430 } else if (brw->gen >= 6) {
431 emit_math2_gen6(opcode, dst, src0, src1);
432 } else {
433 emit_math2_gen4(opcode, dst, src0, src1);
434 }
435 }
436
437 void
438 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (brw->gen < 7) {
441 unreachable("ir_unop_pack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_UD);
445 assert(src0.type == BRW_REGISTER_TYPE_F);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the destination data type must be Word (W).
451 *
452 * The destination must be DWord-aligned and specify a horizontal stride
453 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
454 * each destination channel and the upper word is not modified.
455 *
456 * The above restriction implies that the f32to16 instruction must use
457 * align1 mode, because only in align1 mode is it possible to specify
458 * horizontal stride. We choose here to defy the hardware docs and emit
459 * align16 instructions.
460 *
461 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
462 * instructions. I was partially successful in that the code passed all
463 * tests. However, the code was dubiously correct and fragile, and the
464 * tests were not harsh enough to probe that frailty. Not trusting the
465 * code, I chose instead to remain in align16 mode in defiance of the hw
466 * docs).
467 *
468 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
469 * simulator, emitting a f32to16 in align16 mode with UD as destination
470 * data type is safe. The behavior differs from that specified in the PRM
471 * in that the upper word of each destination channel is cleared to 0.
472 */
473
474 dst_reg tmp_dst(this, glsl_type::uvec2_type);
475 src_reg tmp_src(tmp_dst);
476
477 #if 0
478 /* Verify the undocumented behavior on which the following instructions
479 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
480 * then the result of the bit-or instruction below will be incorrect.
481 *
482 * You should inspect the disasm output in order to verify that the MOV is
483 * not optimized away.
484 */
485 emit(MOV(tmp_dst, src_reg(0x12345678u)));
486 #endif
487
488 /* Give tmp the form below, where "." means untouched.
489 *
490 * w z y x w z y x
491 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
492 *
493 * That the upper word of each write-channel be 0 is required for the
494 * following bit-shift and bit-or instructions to work. Note that this
495 * relies on the undocumented hardware behavior mentioned above.
496 */
497 tmp_dst.writemask = WRITEMASK_XY;
498 emit(F32TO16(tmp_dst, src0));
499
500 /* Give the write-channels of dst the form:
501 * 0xhhhh0000
502 */
503 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
504 emit(SHL(dst, tmp_src, src_reg(16u)));
505
506 /* Finally, give the write-channels of dst the form of packHalf2x16's
507 * output:
508 * 0xhhhhllll
509 */
510 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
511 emit(OR(dst, src_reg(dst), tmp_src));
512 }
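
/* Putting the three instructions above together, every enabled channel of
 * dst ends up with (rounding details aside):
 *
 *    lo  = f32to16(src0.x)        // tmp.x = 0x0000llll
 *    hi  = f32to16(src0.y)        // tmp.y = 0x0000hhhh
 *    dst = (hi << 16) | lo        // 0xhhhhllll, i.e. packHalf2x16()
 *
 * The SHL reads tmp through a YYYY swizzle and the OR through an XXXX
 * swizzle, which is how the shuffle is expressed while staying in align16
 * mode.
 */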
513
514 void
515 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
516 {
517 if (brw->gen < 7) {
518 unreachable("ir_unop_unpack_half_2x16 should be lowered");
519 }
520
521 assert(dst.type == BRW_REGISTER_TYPE_F);
522 assert(src0.type == BRW_REGISTER_TYPE_UD);
523
524 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
525 *
526 * Because this instruction does not have a 16-bit floating-point type,
527 * the source data type must be Word (W). The destination type must be
528 * F (Float).
529 *
530 * To use W as the source data type, we must adjust horizontal strides,
531 * which is only possible in align1 mode. All my [chadv] attempts at
532 * emitting align1 instructions for unpackHalf2x16 failed to pass the
533 * Piglit tests, so I gave up.
534 *
535 * I've verified that, on gen7 hardware and the simulator, it is safe to
536 * emit f16to32 in align16 mode with UD as source data type.
537 */
538
539 dst_reg tmp_dst(this, glsl_type::uvec2_type);
540 src_reg tmp_src(tmp_dst);
541
542 tmp_dst.writemask = WRITEMASK_X;
543 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
544
545 tmp_dst.writemask = WRITEMASK_Y;
546 emit(SHR(tmp_dst, src0, src_reg(16u)));
547
548 dst.writemask = WRITEMASK_XY;
549 emit(F16TO32(dst, tmp_src));
550 }
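
/* In other words, for the two result channels this computes
 *
 *    dst.x = f16to32(src0 & 0xffff)    // low half, staged in tmp.x
 *    dst.y = f16to32(src0 >> 16)       // high half, staged in tmp.y
 *
 * which is the inverse of the packing sequence in emit_pack_half_2x16()
 * above.
 */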
551
552 void
553 vec4_visitor::visit_instructions(const exec_list *list)
554 {
555 foreach_in_list(ir_instruction, ir, list) {
556 base_ir = ir;
557 ir->accept(this);
558 }
559 }
560
561
562 static int
563 type_size(const struct glsl_type *type)
564 {
565 unsigned int i;
566 int size;
567
568 switch (type->base_type) {
569 case GLSL_TYPE_UINT:
570 case GLSL_TYPE_INT:
571 case GLSL_TYPE_FLOAT:
572 case GLSL_TYPE_BOOL:
573 if (type->is_matrix()) {
574 return type->matrix_columns;
575 } else {
576 /* Regardless of the size of the vector, it gets a vec4. This is bad
577 * packing for things like floats, but otherwise arrays become a
578 * mess. Hopefully a later pass over the code can pack scalars
579 * down if appropriate.
580 */
581 return 1;
582 }
583 case GLSL_TYPE_ARRAY:
584 assert(type->length > 0);
585 return type_size(type->fields.array) * type->length;
586 case GLSL_TYPE_STRUCT:
587 size = 0;
588 for (i = 0; i < type->length; i++) {
589 size += type_size(type->fields.structure[i].type);
590 }
591 return size;
592 case GLSL_TYPE_SAMPLER:
593 /* Samplers take up no register space, since they're baked in at
594 * link time.
595 */
596 return 0;
597 case GLSL_TYPE_ATOMIC_UINT:
598 return 0;
599 case GLSL_TYPE_IMAGE:
600 case GLSL_TYPE_VOID:
601 case GLSL_TYPE_ERROR:
602 case GLSL_TYPE_INTERFACE:
603 unreachable("not reached");
604 }
605
606 return 0;
607 }
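
/* Some consequences of the sizing rules above, in vec4 register units:
 * a float, vec2 or vec4 each take 1 slot; a mat3 takes 3 (one per column);
 * float[4] takes 4; and struct { vec3 a; float b; } takes 2. Scalars and
 * small vectors are thus padded out to a full vec4 slot, as the comment
 * above notes.
 */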
608
609 int
610 vec4_visitor::virtual_grf_alloc(int size)
611 {
612 if (virtual_grf_array_size <= virtual_grf_count) {
613 if (virtual_grf_array_size == 0)
614 virtual_grf_array_size = 16;
615 else
616 virtual_grf_array_size *= 2;
617 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
618 virtual_grf_array_size);
619 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
620 virtual_grf_array_size);
621 }
622 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
623 virtual_grf_reg_count += size;
624 virtual_grf_sizes[virtual_grf_count] = size;
625 return virtual_grf_count++;
626 }
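
/* Bookkeeping example: after virtual_grf_alloc(1) followed by
 * virtual_grf_alloc(4) (say, a vec4 temporary and then a mat4), the arrays
 * hold
 *
 *    virtual_grf_sizes   = { 1, 4 }
 *    virtual_grf_reg_map = { 0, 1 }   // first flat register of each GRF
 *    virtual_grf_reg_count = 5
 *
 * and the two calls return virtual GRF numbers 0 and 1.
 */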
627
628 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
629 {
630 init();
631
632 this->file = GRF;
633 this->reg = v->virtual_grf_alloc(type_size(type));
634
635 if (type->is_array() || type->is_record()) {
636 this->swizzle = BRW_SWIZZLE_NOOP;
637 } else {
638 this->swizzle = swizzle_for_size(type->vector_elements);
639 }
640
641 this->type = brw_type_for_base_type(type);
642 }
643
644 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
645 {
646 assert(size > 0);
647
648 init();
649
650 this->file = GRF;
651 this->reg = v->virtual_grf_alloc(type_size(type) * size);
652
653 this->swizzle = BRW_SWIZZLE_NOOP;
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
659 {
660 init();
661
662 this->file = GRF;
663 this->reg = v->virtual_grf_alloc(type_size(type));
664
665 if (type->is_array() || type->is_record()) {
666 this->writemask = WRITEMASK_XYZW;
667 } else {
668 this->writemask = (1 << type->vector_elements) - 1;
669 }
670
671 this->type = brw_type_for_base_type(type);
672 }
673
674 /* Our support for uniforms is piggy-backed on the struct
675 * gl_fragment_program, because that's where the values actually
676 * get stored, rather than in some global gl_shader_program uniform
677 * store.
678 */
679 void
680 vec4_visitor::setup_uniform_values(ir_variable *ir)
681 {
682 int namelen = strlen(ir->name);
683
684 /* The data for our (non-builtin) uniforms is stored in a series of
685 * gl_uniform_driver_storage structs for each subcomponent that
686 * glGetUniformLocation() could name. We know it's been set up in the same
687 * order we'd walk the type, so walk the list of storage and find anything
688 * with our name, or the prefix of a component that starts with our name.
689 */
690 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
691 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
692
693 if (strncmp(ir->name, storage->name, namelen) != 0 ||
694 (storage->name[namelen] != 0 &&
695 storage->name[namelen] != '.' &&
696 storage->name[namelen] != '[')) {
697 continue;
698 }
699
700 gl_constant_value *components = storage->storage;
701 unsigned vector_count = (MAX2(storage->array_elements, 1) *
702 storage->type->matrix_columns);
703
704 for (unsigned s = 0; s < vector_count; s++) {
705 assert(uniforms < uniform_array_size);
706 uniform_vector_size[uniforms] = storage->type->vector_elements;
707
708 int i;
709 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
710 stage_prog_data->param[uniforms * 4 + i] = components;
711 components++;
712 }
713 for (; i < 4; i++) {
714 static gl_constant_value zero = { 0.0 };
715 stage_prog_data->param[uniforms * 4 + i] = &zero;
716 }
717
718 uniforms++;
719 }
720 }
721 }
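
/* A concrete (hypothetical) example: a user uniform declared "mat2 m[3]"
 * has vector_count = 3 * 2 = 6, so it fills six consecutive uniform slots.
 * Each slot records vector_elements = 2 live components, and the remaining
 * param[] entries for that slot point at the shared zero constant, so every
 * uniform slot always spans a full vec4 worth of param pointers.
 */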
722
723 void
724 vec4_visitor::setup_uniform_clipplane_values()
725 {
726 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
727
728 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
729 assert(this->uniforms < uniform_array_size);
730 this->uniform_vector_size[this->uniforms] = 4;
731 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
732 this->userplane[i].type = BRW_REGISTER_TYPE_F;
733 for (int j = 0; j < 4; ++j) {
734 stage_prog_data->param[this->uniforms * 4 + j] =
735 (gl_constant_value *) &clip_planes[i][j];
736 }
737 ++this->uniforms;
738 }
739 }
740
741 /* Our support for builtin uniforms is even scarier than non-builtin.
742 * It sits on top of the PROG_STATE_VAR parameters that are
743 * automatically updated from GL context state.
744 */
745 void
746 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
747 {
748 const ir_state_slot *const slots = ir->get_state_slots();
749 assert(slots != NULL);
750
751 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
752 /* This state reference has already been setup by ir_to_mesa,
753 * but we'll get the same index back here. We can reference
754 * ParameterValues directly, since unlike brw_fs.cpp, we never
755 * add new state references during compile.
756 */
757 int index = _mesa_add_state_reference(this->prog->Parameters,
758 (gl_state_index *)slots[i].tokens);
759 gl_constant_value *values =
760 &this->prog->Parameters->ParameterValues[index][0];
761
762 assert(this->uniforms < uniform_array_size);
763 this->uniform_vector_size[this->uniforms] = 0;
764 /* Add each of the unique swizzled channels of the element.
765 * This will end up matching the size of the glsl_type of this field.
766 */
767 int last_swiz = -1;
768 for (unsigned int j = 0; j < 4; j++) {
769 int swiz = GET_SWZ(slots[i].swizzle, j);
770 last_swiz = swiz;
771
772 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
773 assert(this->uniforms < uniform_array_size);
774 if (swiz <= last_swiz)
775 this->uniform_vector_size[this->uniforms]++;
776 }
777 this->uniforms++;
778 }
779 }
780
781 dst_reg *
782 vec4_visitor::variable_storage(ir_variable *var)
783 {
784 return (dst_reg *)hash_table_find(this->variable_ht, var);
785 }
786
787 void
788 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
789 enum brw_predicate *predicate)
790 {
791 ir_expression *expr = ir->as_expression();
792
793 *predicate = BRW_PREDICATE_NORMAL;
794
795 if (expr && expr->operation != ir_binop_ubo_load) {
796 src_reg op[3];
797 vec4_instruction *inst;
798
799 assert(expr->get_num_operands() <= 3);
800 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
801 expr->operands[i]->accept(this);
802 op[i] = this->result;
803
804 resolve_ud_negate(&op[i]);
805 }
806
807 switch (expr->operation) {
808 case ir_unop_logic_not:
809 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
810 inst->conditional_mod = BRW_CONDITIONAL_Z;
811 break;
812
813 case ir_binop_logic_xor:
814 inst = emit(XOR(dst_null_d(), op[0], op[1]));
815 inst->conditional_mod = BRW_CONDITIONAL_NZ;
816 break;
817
818 case ir_binop_logic_or:
819 inst = emit(OR(dst_null_d(), op[0], op[1]));
820 inst->conditional_mod = BRW_CONDITIONAL_NZ;
821 break;
822
823 case ir_binop_logic_and:
824 inst = emit(AND(dst_null_d(), op[0], op[1]));
825 inst->conditional_mod = BRW_CONDITIONAL_NZ;
826 break;
827
828 case ir_unop_f2b:
829 if (brw->gen >= 6) {
830 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
831 } else {
832 inst = emit(MOV(dst_null_f(), op[0]));
833 inst->conditional_mod = BRW_CONDITIONAL_NZ;
834 }
835 break;
836
837 case ir_unop_i2b:
838 if (brw->gen >= 6) {
839 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
840 } else {
841 inst = emit(MOV(dst_null_d(), op[0]));
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 }
844 break;
845
846 case ir_binop_all_equal:
847 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
848 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
849 break;
850
851 case ir_binop_any_nequal:
852 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
853 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
854 break;
855
856 case ir_unop_any:
857 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
858 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
859 break;
860
861 case ir_binop_greater:
862 case ir_binop_gequal:
863 case ir_binop_less:
864 case ir_binop_lequal:
865 case ir_binop_equal:
866 case ir_binop_nequal:
867 emit(CMP(dst_null_d(), op[0], op[1],
868 brw_conditional_for_comparison(expr->operation)));
869 break;
870
871 case ir_triop_csel: {
872 /* Expand the boolean condition into the flag register. */
873 inst = emit(MOV(dst_null_d(), op[0]));
874 inst->conditional_mod = BRW_CONDITIONAL_NZ;
875
876 /* Select which boolean to return. */
877 dst_reg temp(this, expr->operands[1]->type);
878 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
879 inst->predicate = BRW_PREDICATE_NORMAL;
880
881 /* Expand the result to a condition code. */
882 inst = emit(MOV(dst_null_d(), src_reg(temp)));
883 inst->conditional_mod = BRW_CONDITIONAL_NZ;
884 break;
885 }
886
887 default:
888 unreachable("not reached");
889 }
890 return;
891 }
892
893 ir->accept(this);
894
895 resolve_ud_negate(&this->result);
896
897 if (brw->gen >= 6) {
898 vec4_instruction *inst = emit(AND(dst_null_d(),
899 this->result, src_reg(1)));
900 inst->conditional_mod = BRW_CONDITIONAL_NZ;
901 } else {
902 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
903 inst->conditional_mod = BRW_CONDITIONAL_NZ;
904 }
905 }
906
907 /**
908 * Emit a gen6 IF statement with the comparison folded into the IF
909 * instruction.
910 */
911 void
912 vec4_visitor::emit_if_gen6(ir_if *ir)
913 {
914 ir_expression *expr = ir->condition->as_expression();
915
916 if (expr && expr->operation != ir_binop_ubo_load) {
917 src_reg op[3];
918 dst_reg temp;
919
920 assert(expr->get_num_operands() <= 3);
921 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
922 expr->operands[i]->accept(this);
923 op[i] = this->result;
924 }
925
926 switch (expr->operation) {
927 case ir_unop_logic_not:
928 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
929 return;
930
931 case ir_binop_logic_xor:
932 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
933 return;
934
935 case ir_binop_logic_or:
936 temp = dst_reg(this, glsl_type::bool_type);
937 emit(OR(temp, op[0], op[1]));
938 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
939 return;
940
941 case ir_binop_logic_and:
942 temp = dst_reg(this, glsl_type::bool_type);
943 emit(AND(temp, op[0], op[1]));
944 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
945 return;
946
947 case ir_unop_f2b:
948 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
949 return;
950
951 case ir_unop_i2b:
952 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
953 return;
954
955 case ir_binop_greater:
956 case ir_binop_gequal:
957 case ir_binop_less:
958 case ir_binop_lequal:
959 case ir_binop_equal:
960 case ir_binop_nequal:
961 emit(IF(op[0], op[1],
962 brw_conditional_for_comparison(expr->operation)));
963 return;
964
965 case ir_binop_all_equal:
966 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
967 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
968 return;
969
970 case ir_binop_any_nequal:
971 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
972 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
973 return;
974
975 case ir_unop_any:
976 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
977 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
978 return;
979
980 case ir_triop_csel: {
981 /* Expand the boolean condition into the flag register. */
982 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
983 inst->conditional_mod = BRW_CONDITIONAL_NZ;
984
985 /* Select which boolean to return. */
986 dst_reg temp(this, expr->operands[1]->type);
987 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
988 inst->predicate = BRW_PREDICATE_NORMAL;
989
990 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
991 return;
992 }
993
994 default:
995 unreachable("not reached");
996 }
997 return;
998 }
999
1000 ir->condition->accept(this);
1001
1002 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1003 }
1004
1005 void
1006 vec4_visitor::visit(ir_variable *ir)
1007 {
1008 dst_reg *reg = NULL;
1009
1010 if (variable_storage(ir))
1011 return;
1012
1013 switch (ir->data.mode) {
1014 case ir_var_shader_in:
1015 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1016 break;
1017
1018 case ir_var_shader_out:
1019 reg = new(mem_ctx) dst_reg(this, ir->type);
1020
1021 for (int i = 0; i < type_size(ir->type); i++) {
1022 output_reg[ir->data.location + i] = *reg;
1023 output_reg[ir->data.location + i].reg_offset = i;
1024 output_reg[ir->data.location + i].type =
1025 brw_type_for_base_type(ir->type->get_scalar_type());
1026 output_reg_annotation[ir->data.location + i] = ir->name;
1027 }
1028 break;
1029
1030 case ir_var_auto:
1031 case ir_var_temporary:
1032 reg = new(mem_ctx) dst_reg(this, ir->type);
1033 break;
1034
1035 case ir_var_uniform:
1036 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1037
1038 /* Thanks to the lower_ubo_reference pass, we will see only
1039 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1040 * variables, so no need for them to be in variable_ht.
1041 *
1042 * Some uniforms, such as samplers and atomic counters, have no actual
1043 * storage, so we should ignore them.
1044 */
1045 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1046 return;
1047
1048 /* Track how big the whole uniform variable is, in case we need to put a
1049 * copy of its data into pull constants for array access.
1050 */
1051 assert(this->uniforms < uniform_array_size);
1052 this->uniform_size[this->uniforms] = type_size(ir->type);
1053
1054 if (!strncmp(ir->name, "gl_", 3)) {
1055 setup_builtin_uniform_values(ir);
1056 } else {
1057 setup_uniform_values(ir);
1058 }
1059 break;
1060
1061 case ir_var_system_value:
1062 reg = make_reg_for_system_value(ir);
1063 break;
1064
1065 default:
1066 unreachable("not reached");
1067 }
1068
1069 reg->type = brw_type_for_base_type(ir->type);
1070 hash_table_insert(this->variable_ht, reg, ir);
1071 }
1072
1073 void
1074 vec4_visitor::visit(ir_loop *ir)
1075 {
1076 /* We don't want debugging output to print the whole body of the
1077 * loop as the annotation.
1078 */
1079 this->base_ir = NULL;
1080
1081 emit(BRW_OPCODE_DO);
1082
1083 visit_instructions(&ir->body_instructions);
1084
1085 emit(BRW_OPCODE_WHILE);
1086 }
1087
1088 void
1089 vec4_visitor::visit(ir_loop_jump *ir)
1090 {
1091 switch (ir->mode) {
1092 case ir_loop_jump::jump_break:
1093 emit(BRW_OPCODE_BREAK);
1094 break;
1095 case ir_loop_jump::jump_continue:
1096 emit(BRW_OPCODE_CONTINUE);
1097 break;
1098 }
1099 }
1100
1101
1102 void
1103 vec4_visitor::visit(ir_function_signature *)
1104 {
1105 unreachable("not reached");
1106 }
1107
1108 void
1109 vec4_visitor::visit(ir_function *ir)
1110 {
1111 /* Ignore function bodies other than main() -- we shouldn't see calls to
1112 * them since they should all be inlined.
1113 */
1114 if (strcmp(ir->name, "main") == 0) {
1115 const ir_function_signature *sig;
1116 exec_list empty;
1117
1118 sig = ir->matching_signature(NULL, &empty, false);
1119
1120 assert(sig);
1121
1122 visit_instructions(&sig->body);
1123 }
1124 }
1125
1126 bool
1127 vec4_visitor::try_emit_mad(ir_expression *ir)
1128 {
1129 /* 3-src instructions were introduced in gen6. */
1130 if (brw->gen < 6)
1131 return false;
1132
1133 /* MAD can only handle floating-point data. */
1134 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1135 return false;
1136
1137 ir_rvalue *nonmul = ir->operands[1];
1138 ir_expression *mul = ir->operands[0]->as_expression();
1139
1140 if (!mul || mul->operation != ir_binop_mul) {
1141 nonmul = ir->operands[0];
1142 mul = ir->operands[1]->as_expression();
1143
1144 if (!mul || mul->operation != ir_binop_mul)
1145 return false;
1146 }
1147
1148 nonmul->accept(this);
1149 src_reg src0 = fix_3src_operand(this->result);
1150
1151 mul->operands[0]->accept(this);
1152 src_reg src1 = fix_3src_operand(this->result);
1153
1154 mul->operands[1]->accept(this);
1155 src_reg src2 = fix_3src_operand(this->result);
1156
1157 this->result = src_reg(this, ir->type);
1158 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1159
1160 return true;
1161 }
1162
1163 bool
1164 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1165 {
1166 /* This optimization relies on CMP setting the destination to 0 when
1167 * false. Early hardware only sets the least significant bit, and
1168 * leaves the other bits undefined. So we can't use it.
1169 */
1170 if (brw->gen < 6)
1171 return false;
1172
1173 ir_expression *const cmp = ir->operands[0]->as_expression();
1174
1175 if (cmp == NULL)
1176 return false;
1177
1178 switch (cmp->operation) {
1179 case ir_binop_less:
1180 case ir_binop_greater:
1181 case ir_binop_lequal:
1182 case ir_binop_gequal:
1183 case ir_binop_equal:
1184 case ir_binop_nequal:
1185 break;
1186
1187 default:
1188 return false;
1189 }
1190
1191 cmp->operands[0]->accept(this);
1192 const src_reg cmp_src0 = this->result;
1193
1194 cmp->operands[1]->accept(this);
1195 const src_reg cmp_src1 = this->result;
1196
1197 this->result = src_reg(this, ir->type);
1198
1199 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1200 brw_conditional_for_comparison(cmp->operation)));
1201
1202 /* If the comparison is false, this->result will just happen to be zero.
1203 */
1204 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1205 this->result, src_reg(1.0f));
1206 inst->predicate = BRW_PREDICATE_NORMAL;
1207 inst->predicate_inverse = true;
1208
1209 return true;
1210 }
1211
1212 void
1213 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1214 src_reg src0, src_reg src1)
1215 {
1216 vec4_instruction *inst;
1217
1218 if (brw->gen >= 6) {
1219 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1220 inst->conditional_mod = conditionalmod;
1221 } else {
1222 emit(CMP(dst, src0, src1, conditionalmod));
1223
1224 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1225 inst->predicate = BRW_PREDICATE_NORMAL;
1226 }
1227 }
1228
1229 void
1230 vec4_visitor::emit_lrp(const dst_reg &dst,
1231 const src_reg &x, const src_reg &y, const src_reg &a)
1232 {
1233 if (brw->gen >= 6) {
1234 /* Note that the instruction's argument order is reversed from GLSL
1235 * and the IR.
1236 */
1237 emit(LRP(dst,
1238 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1239 } else {
1240 /* Earlier generations don't support three source operations, so we
1241 * need to emit x*(1-a) + y*a.
1242 */
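/* Concretely, mix(x, y, a) = x * (1 - a) + y * a, so the sequence below
 * computes
 *
 *    y_times_a           = y * a
 *    one_minus_a         = 1 - a          (ADD of -a and 1.0f)
 *    x_times_one_minus_a = x * (1 - a)
 *    dst                 = x * (1 - a) + y * a
 */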
1243 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1244 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1245 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1246 y_times_a.writemask = dst.writemask;
1247 one_minus_a.writemask = dst.writemask;
1248 x_times_one_minus_a.writemask = dst.writemask;
1249
1250 emit(MUL(y_times_a, y, a));
1251 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1252 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1253 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1254 }
1255 }
1256
1257 void
1258 vec4_visitor::visit(ir_expression *ir)
1259 {
1260 unsigned int operand;
1261 src_reg op[Elements(ir->operands)];
1262 vec4_instruction *inst;
1263
1264 if (ir->operation == ir_binop_add) {
1265 if (try_emit_mad(ir))
1266 return;
1267 }
1268
1269 if (ir->operation == ir_unop_b2f) {
1270 if (try_emit_b2f_of_compare(ir))
1271 return;
1272 }
1273
1274 /* Storage for our result. Ideally for an assignment we'd be using
1275 * the actual storage for the result here, instead.
1276 */
1277 dst_reg result_dst(this, ir->type);
1278 src_reg result_src(result_dst);
1279
1280 if (ir->operation == ir_triop_csel) {
1281 ir->operands[1]->accept(this);
1282 op[1] = this->result;
1283 ir->operands[2]->accept(this);
1284 op[2] = this->result;
1285
1286 enum brw_predicate predicate;
1287 emit_bool_to_cond_code(ir->operands[0], &predicate);
1288 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1289 inst->predicate = predicate;
1290 this->result = result_src;
1291 return;
1292 }
1293
1294 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1295 this->result.file = BAD_FILE;
1296 ir->operands[operand]->accept(this);
1297 if (this->result.file == BAD_FILE) {
1298 fprintf(stderr, "Failed to get tree for expression operand:\n");
1299 ir->operands[operand]->fprint(stderr);
1300 exit(1);
1301 }
1302 op[operand] = this->result;
1303
1304 /* Matrix expression operands should have been broken down to vector
1305 * operations already.
1306 */
1307 assert(!ir->operands[operand]->type->is_matrix());
1308 }
1309
1310 /* If nothing special happens, this is the result. */
1311 this->result = result_src;
1312
1313 switch (ir->operation) {
1314 case ir_unop_logic_not:
1315 if (ctx->Const.UniformBooleanTrue != 1) {
1316 emit(NOT(result_dst, op[0]));
1317 } else {
1318 emit(XOR(result_dst, op[0], src_reg(1u)));
1319 }
1320 break;
1321 case ir_unop_neg:
1322 op[0].negate = !op[0].negate;
1323 emit(MOV(result_dst, op[0]));
1324 break;
1325 case ir_unop_abs:
1326 op[0].abs = true;
1327 op[0].negate = false;
1328 emit(MOV(result_dst, op[0]));
1329 break;
1330
1331 case ir_unop_sign:
1332 if (ir->type->is_float()) {
1333 /* AND(val, 0x80000000) gives the sign bit.
1334 *
1335 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1336 * zero.
1337 */
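/* Worked example (IEEE-754 single precision): for val = -2.5f
 * (0xc0200000) the AND keeps 0x80000000, and the predicated OR with
 * 0x3f800000 yields 0xbf800000 = -1.0f; for val = +2.5f it yields +1.0f.
 * When val == 0 the CMP's NZ condition is false, the OR is skipped, and
 * the result stays (signed) zero.
 */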
1338 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1339
1340 op[0].type = BRW_REGISTER_TYPE_UD;
1341 result_dst.type = BRW_REGISTER_TYPE_UD;
1342 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1343
1344 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1345 inst->predicate = BRW_PREDICATE_NORMAL;
1346
1347 this->result.type = BRW_REGISTER_TYPE_F;
1348 } else {
1349 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1350 * -> non-negative val generates 0x00000000.
1351 * Predicated OR sets 1 if val is positive.
1352 */
1353 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1354
1355 emit(ASR(result_dst, op[0], src_reg(31)));
1356
1357 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1358 inst->predicate = BRW_PREDICATE_NORMAL;
1359 }
1360 break;
1361
1362 case ir_unop_rcp:
1363 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1364 break;
1365
1366 case ir_unop_exp2:
1367 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1368 break;
1369 case ir_unop_log2:
1370 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1371 break;
1372 case ir_unop_exp:
1373 case ir_unop_log:
1374 unreachable("not reached: should be handled by ir_explog_to_explog2");
1375 case ir_unop_sin:
1376 case ir_unop_sin_reduced:
1377 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1378 break;
1379 case ir_unop_cos:
1380 case ir_unop_cos_reduced:
1381 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1382 break;
1383
1384 case ir_unop_dFdx:
1385 case ir_unop_dFdx_coarse:
1386 case ir_unop_dFdx_fine:
1387 case ir_unop_dFdy:
1388 case ir_unop_dFdy_coarse:
1389 case ir_unop_dFdy_fine:
1390 unreachable("derivatives not valid in vertex shader");
1391
1392 case ir_unop_bitfield_reverse:
1393 emit(BFREV(result_dst, op[0]));
1394 break;
1395 case ir_unop_bit_count:
1396 emit(CBIT(result_dst, op[0]));
1397 break;
1398 case ir_unop_find_msb: {
1399 src_reg temp = src_reg(this, glsl_type::uint_type);
1400
1401 inst = emit(FBH(dst_reg(temp), op[0]));
1402 inst->dst.writemask = WRITEMASK_XYZW;
1403
1404 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1405 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1406 * subtract the result from 31 to convert the MSB count into an LSB count.
1407 */
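/* For example, findMSB(0x00000100u) should be 8: FBH reports 23 (counting
 * from the MSB side), and 31 - 23 = 8. The predicated ADD below performs
 * that subtraction as (-result) + 31, and is skipped when FBH returned the
 * all-ones "no bit found" value.
 */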
1408
1409 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1410 temp.swizzle = BRW_SWIZZLE_NOOP;
1411 emit(MOV(result_dst, temp));
1412
1413 src_reg src_tmp = src_reg(result_dst);
1414 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1415
1416 src_tmp.negate = true;
1417 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1418 inst->predicate = BRW_PREDICATE_NORMAL;
1419 break;
1420 }
1421 case ir_unop_find_lsb:
1422 emit(FBL(result_dst, op[0]));
1423 break;
1424 case ir_unop_saturate:
1425 inst = emit(MOV(result_dst, op[0]));
1426 inst->saturate = true;
1427 break;
1428
1429 case ir_unop_noise:
1430 unreachable("not reached: should be handled by lower_noise");
1431
1432 case ir_binop_add:
1433 emit(ADD(result_dst, op[0], op[1]));
1434 break;
1435 case ir_binop_sub:
1436 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1437
1438 case ir_binop_mul:
1439 if (brw->gen < 8 && ir->type->is_integer()) {
1440 /* For integer multiplication, the MUL uses the low 16 bits of one of
1441 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1442 * accumulates the contribution of the upper 16 bits of that
1443 * operand. If we can determine that one of the args is in the low
1444 * 16 bits, though, we can just emit a single MUL.
1445 */
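/* Roughly, writing that operand as a = (a_hi << 16) | a_lo, the product is
 * a * b = a_lo * b + ((a_hi * b) << 16); the MUL supplies the first term
 * and the MACH the second. When the operand is a constant that fits in 16
 * bits, a_hi is zero and the single MUL below is sufficient.
 */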
1446 if (ir->operands[0]->is_uint16_constant()) {
1447 if (brw->gen < 7)
1448 emit(MUL(result_dst, op[0], op[1]));
1449 else
1450 emit(MUL(result_dst, op[1], op[0]));
1451 } else if (ir->operands[1]->is_uint16_constant()) {
1452 if (brw->gen < 7)
1453 emit(MUL(result_dst, op[1], op[0]));
1454 else
1455 emit(MUL(result_dst, op[0], op[1]));
1456 } else {
1457 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1458
1459 emit(MUL(acc, op[0], op[1]));
1460 emit(MACH(dst_null_d(), op[0], op[1]));
1461 emit(MOV(result_dst, src_reg(acc)));
1462 }
1463 } else {
1464 emit(MUL(result_dst, op[0], op[1]));
1465 }
1466 break;
1467 case ir_binop_imul_high: {
1468 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1469
1470 emit(MUL(acc, op[0], op[1]));
1471 emit(MACH(result_dst, op[0], op[1]));
1472 break;
1473 }
1474 case ir_binop_div:
1475 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1476 assert(ir->type->is_integer());
1477 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1478 break;
1479 case ir_binop_carry: {
1480 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1481
1482 emit(ADDC(dst_null_ud(), op[0], op[1]));
1483 emit(MOV(result_dst, src_reg(acc)));
1484 break;
1485 }
1486 case ir_binop_borrow: {
1487 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1488
1489 emit(SUBB(dst_null_ud(), op[0], op[1]));
1490 emit(MOV(result_dst, src_reg(acc)));
1491 break;
1492 }
1493 case ir_binop_mod:
1494 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1495 assert(ir->type->is_integer());
1496 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1497 break;
1498
1499 case ir_binop_less:
1500 case ir_binop_greater:
1501 case ir_binop_lequal:
1502 case ir_binop_gequal:
1503 case ir_binop_equal:
1504 case ir_binop_nequal: {
1505 emit(CMP(result_dst, op[0], op[1],
1506 brw_conditional_for_comparison(ir->operation)));
1507 if (ctx->Const.UniformBooleanTrue == 1) {
1508 emit(AND(result_dst, result_src, src_reg(1u)));
1509 }
1510 break;
1511 }
1512
1513 case ir_binop_all_equal:
1514 /* "==" operator producing a scalar boolean. */
1515 if (ir->operands[0]->type->is_vector() ||
1516 ir->operands[1]->type->is_vector()) {
1517 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1518 emit(MOV(result_dst, src_reg(0)));
1519 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1520 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1521 } else {
1522 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1523 if (ctx->Const.UniformBooleanTrue == 1) {
1524 emit(AND(result_dst, result_src, src_reg(1u)));
1525 }
1526 }
1527 break;
1528 case ir_binop_any_nequal:
1529 /* "!=" operator producing a scalar boolean. */
1530 if (ir->operands[0]->type->is_vector() ||
1531 ir->operands[1]->type->is_vector()) {
1532 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1533
1534 emit(MOV(result_dst, src_reg(0)));
1535 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1536 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1537 } else {
1538 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1539 if (ctx->Const.UniformBooleanTrue == 1) {
1540 emit(AND(result_dst, result_src, src_reg(1u)));
1541 }
1542 }
1543 break;
1544
1545 case ir_unop_any:
1546 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1547 emit(MOV(result_dst, src_reg(0)));
1548
1549 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1550 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1551 break;
1552
1553 case ir_binop_logic_xor:
1554 emit(XOR(result_dst, op[0], op[1]));
1555 break;
1556
1557 case ir_binop_logic_or:
1558 emit(OR(result_dst, op[0], op[1]));
1559 break;
1560
1561 case ir_binop_logic_and:
1562 emit(AND(result_dst, op[0], op[1]));
1563 break;
1564
1565 case ir_binop_dot:
1566 assert(ir->operands[0]->type->is_vector());
1567 assert(ir->operands[0]->type == ir->operands[1]->type);
1568 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1569 break;
1570
1571 case ir_unop_sqrt:
1572 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1573 break;
1574 case ir_unop_rsq:
1575 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1576 break;
1577
1578 case ir_unop_bitcast_i2f:
1579 case ir_unop_bitcast_u2f:
1580 this->result = op[0];
1581 this->result.type = BRW_REGISTER_TYPE_F;
1582 break;
1583
1584 case ir_unop_bitcast_f2i:
1585 this->result = op[0];
1586 this->result.type = BRW_REGISTER_TYPE_D;
1587 break;
1588
1589 case ir_unop_bitcast_f2u:
1590 this->result = op[0];
1591 this->result.type = BRW_REGISTER_TYPE_UD;
1592 break;
1593
1594 case ir_unop_i2f:
1595 case ir_unop_i2u:
1596 case ir_unop_u2i:
1597 case ir_unop_u2f:
1598 case ir_unop_f2i:
1599 case ir_unop_f2u:
1600 emit(MOV(result_dst, op[0]));
1601 break;
1602 case ir_unop_b2i:
1603 if (ctx->Const.UniformBooleanTrue != 1) {
1604 emit(AND(result_dst, op[0], src_reg(1u)));
1605 } else {
1606 emit(MOV(result_dst, op[0]));
1607 }
1608 break;
1609 case ir_unop_b2f:
1610 if (ctx->Const.UniformBooleanTrue != 1) {
1611 op[0].type = BRW_REGISTER_TYPE_UD;
1612 result_dst.type = BRW_REGISTER_TYPE_UD;
1613 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1614 result_dst.type = BRW_REGISTER_TYPE_F;
1615 } else {
1616 emit(MOV(result_dst, op[0]));
1617 }
1618 break;
1619 case ir_unop_f2b:
1620 case ir_unop_i2b:
1621 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1622 if (ctx->Const.UniformBooleanTrue == 1) {
1623 emit(AND(result_dst, result_src, src_reg(1u)));
1624 }
1625 break;
1626
1627 case ir_unop_trunc:
1628 emit(RNDZ(result_dst, op[0]));
1629 break;
1630 case ir_unop_ceil:
1631 op[0].negate = !op[0].negate;
1632 inst = emit(RNDD(result_dst, op[0]));
1633 this->result.negate = true;
1634 break;
1635 case ir_unop_floor:
1636 inst = emit(RNDD(result_dst, op[0]));
1637 break;
1638 case ir_unop_fract:
1639 inst = emit(FRC(result_dst, op[0]));
1640 break;
1641 case ir_unop_round_even:
1642 emit(RNDE(result_dst, op[0]));
1643 break;
1644
1645 case ir_binop_min:
1646 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1647 break;
1648 case ir_binop_max:
1649 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1650 break;
1651
1652 case ir_binop_pow:
1653 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1654 break;
1655
1656 case ir_unop_bit_not:
1657 inst = emit(NOT(result_dst, op[0]));
1658 break;
1659 case ir_binop_bit_and:
1660 inst = emit(AND(result_dst, op[0], op[1]));
1661 break;
1662 case ir_binop_bit_xor:
1663 inst = emit(XOR(result_dst, op[0], op[1]));
1664 break;
1665 case ir_binop_bit_or:
1666 inst = emit(OR(result_dst, op[0], op[1]));
1667 break;
1668
1669 case ir_binop_lshift:
1670 inst = emit(SHL(result_dst, op[0], op[1]));
1671 break;
1672
1673 case ir_binop_rshift:
1674 if (ir->type->base_type == GLSL_TYPE_INT)
1675 inst = emit(ASR(result_dst, op[0], op[1]));
1676 else
1677 inst = emit(SHR(result_dst, op[0], op[1]));
1678 break;
1679
1680 case ir_binop_bfm:
1681 emit(BFI1(result_dst, op[0], op[1]));
1682 break;
1683
1684 case ir_binop_ubo_load: {
1685 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1686 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1687 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1688 src_reg offset;
1689
1690 /* Now, load the vector from that offset. */
1691 assert(ir->type->is_vector() || ir->type->is_scalar());
1692
1693 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1694 packed_consts.type = result.type;
1695 src_reg surf_index;
1696
1697 if (const_uniform_block) {
1698 /* The block index is a constant, so just emit the binding table entry
1699 * as an immediate.
1700 */
1701 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1702 const_uniform_block->value.u[0]);
1703 } else {
1704 /* The block index is not a constant. Evaluate the index expression
1705 * per-channel and add the base UBO index; the generator will select
1706 * a value from any live channel.
1707 */
1708 surf_index = src_reg(this, glsl_type::uint_type);
1709 emit(ADD(dst_reg(surf_index), op[0],
1710 src_reg(prog_data->base.binding_table.ubo_start)));
1711
1712 /* Assume this may touch any UBO. It would be nice to provide
1713 * a tighter bound, but the array information is already lowered away.
1714 */
1715 brw_mark_surface_used(&prog_data->base,
1716 prog_data->base.binding_table.ubo_start +
1717 shader_prog->NumUniformBlocks - 1);
1718 }
1719
1720 if (const_offset_ir) {
1721 if (brw->gen >= 8) {
1722 /* Store the offset in a GRF so we can send-from-GRF. */
1723 offset = src_reg(this, glsl_type::int_type);
1724 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1725 } else {
1726 /* Immediates are fine on older generations since they'll be moved
1727 * to a (potentially fake) MRF at the generator level.
1728 */
1729 offset = src_reg(const_offset / 16);
1730 }
1731 } else {
1732 offset = src_reg(this, glsl_type::uint_type);
1733 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1734 }
1735
1736 if (brw->gen >= 7) {
1737 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1738 grf_offset.type = offset.type;
1739
1740 emit(MOV(grf_offset, offset));
1741
1742 emit(new(mem_ctx) vec4_instruction(this,
1743 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1744 dst_reg(packed_consts),
1745 surf_index,
1746 src_reg(grf_offset)));
1747 } else {
1748 vec4_instruction *pull =
1749 emit(new(mem_ctx) vec4_instruction(this,
1750 VS_OPCODE_PULL_CONSTANT_LOAD,
1751 dst_reg(packed_consts),
1752 surf_index,
1753 offset));
1754 pull->base_mrf = 14;
1755 pull->mlen = 1;
1756 }
1757
1758 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1759 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1760 const_offset % 16 / 4,
1761 const_offset % 16 / 4,
1762 const_offset % 16 / 4);
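/* For example, a float UBO member at byte offset 20 lives in DWord 1 of
 * its 16-byte block: swizzle_for_size(1) gives XXXX, and adding
 * BRW_SWIZZLE4(1, 1, 1, 1) turns that into YYYY, so every channel reads
 * the second component of the fetched vec4.
 */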
1763
1764 /* UBO bools are any nonzero int. We need to convert them to use the
1765 * value of true stored in ctx->Const.UniformBooleanTrue.
1766 */
1767 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1768 emit(CMP(result_dst, packed_consts, src_reg(0u),
1769 BRW_CONDITIONAL_NZ));
1770 if (ctx->Const.UniformBooleanTrue == 1) {
1771 emit(AND(result_dst, result, src_reg(1u)));
1772 }
1773 } else {
1774 emit(MOV(result_dst, packed_consts));
1775 }
1776 break;
1777 }
1778
1779 case ir_binop_vector_extract:
1780 unreachable("should have been lowered by vec_index_to_cond_assign");
1781
1782 case ir_triop_fma:
1783 op[0] = fix_3src_operand(op[0]);
1784 op[1] = fix_3src_operand(op[1]);
1785 op[2] = fix_3src_operand(op[2]);
1786 /* Note that the instruction's argument order is reversed from GLSL
1787 * and the IR.
1788 */
1789 emit(MAD(result_dst, op[2], op[1], op[0]));
1790 break;
1791
1792 case ir_triop_lrp:
1793 emit_lrp(result_dst, op[0], op[1], op[2]);
1794 break;
1795
1796 case ir_triop_csel:
1797 unreachable("already handled above");
1798 break;
1799
1800 case ir_triop_bfi:
1801 op[0] = fix_3src_operand(op[0]);
1802 op[1] = fix_3src_operand(op[1]);
1803 op[2] = fix_3src_operand(op[2]);
1804 emit(BFI2(result_dst, op[0], op[1], op[2]));
1805 break;
1806
1807 case ir_triop_bitfield_extract:
1808 op[0] = fix_3src_operand(op[0]);
1809 op[1] = fix_3src_operand(op[1]);
1810 op[2] = fix_3src_operand(op[2]);
1811 /* Note that the instruction's argument order is reversed from GLSL
1812 * and the IR.
1813 */
1814 emit(BFE(result_dst, op[2], op[1], op[0]));
1815 break;
1816
1817 case ir_triop_vector_insert:
1818 unreachable("should have been lowered by lower_vector_insert");
1819
1820 case ir_quadop_bitfield_insert:
1821 unreachable("not reached: should be handled by "
1822 "bitfield_insert_to_bfm_bfi\n");
1823
1824 case ir_quadop_vector:
1825 unreachable("not reached: should be handled by lower_quadop_vector");
1826
1827 case ir_unop_pack_half_2x16:
1828 emit_pack_half_2x16(result_dst, op[0]);
1829 break;
1830 case ir_unop_unpack_half_2x16:
1831 emit_unpack_half_2x16(result_dst, op[0]);
1832 break;
1833 case ir_unop_pack_snorm_2x16:
1834 case ir_unop_pack_snorm_4x8:
1835 case ir_unop_pack_unorm_2x16:
1836 case ir_unop_pack_unorm_4x8:
1837 case ir_unop_unpack_snorm_2x16:
1838 case ir_unop_unpack_snorm_4x8:
1839 case ir_unop_unpack_unorm_2x16:
1840 case ir_unop_unpack_unorm_4x8:
1841 unreachable("not reached: should be handled by lower_packing_builtins");
1842 case ir_unop_unpack_half_2x16_split_x:
1843 case ir_unop_unpack_half_2x16_split_y:
1844 case ir_binop_pack_half_2x16_split:
1845 case ir_unop_interpolate_at_centroid:
1846 case ir_binop_interpolate_at_sample:
1847 case ir_binop_interpolate_at_offset:
1848 unreachable("not reached: should not occur in vertex shader");
1849 case ir_binop_ldexp:
1850 unreachable("not reached: should be handled by ldexp_to_arith()");
1851 }
1852 }
1853
1854
1855 void
1856 vec4_visitor::visit(ir_swizzle *ir)
1857 {
1858 src_reg src;
1859 int i = 0;
1860 int swizzle[4];
1861
1862 /* Note that this is only swizzles in expressions, not those on the left
1863 * hand side of an assignment, which do write masking. See ir_assignment
1864 * for that.
1865 */
1866
1867 ir->val->accept(this);
1868 src = this->result;
1869 assert(src.file != BAD_FILE);
1870
1871 for (i = 0; i < ir->type->vector_elements; i++) {
1872 switch (i) {
1873 case 0:
1874 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1875 break;
1876 case 1:
1877 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1878 break;
1879 case 2:
1880 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1881 break;
1882 case 3:
1883 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1884 break;
1885 }
1886 }
1887 for (; i < 4; i++) {
1888 /* Replicate the last channel out. */
1889 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1890 }
1891
1892 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1893
1894 this->result = src;
1895 }
1896
1897 void
1898 vec4_visitor::visit(ir_dereference_variable *ir)
1899 {
1900 const struct glsl_type *type = ir->type;
1901 dst_reg *reg = variable_storage(ir->var);
1902
1903 if (!reg) {
1904 fail("Failed to find variable storage for %s\n", ir->var->name);
1905 this->result = src_reg(brw_null_reg());
1906 return;
1907 }
1908
1909 this->result = src_reg(*reg);
1910
1911 /* System values get their swizzle from the dst_reg writemask */
1912 if (ir->var->data.mode == ir_var_system_value)
1913 return;
1914
1915 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1916 this->result.swizzle = swizzle_for_size(type->vector_elements);
1917 }
1918
1919
1920 int
1921 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1922 {
1923 /* Under normal circumstances array elements are stored consecutively, so
1924 * the stride is equal to the size of the array element.
1925 */
1926 return type_size(ir->type);
1927 }
1928
1929
1930 void
1931 vec4_visitor::visit(ir_dereference_array *ir)
1932 {
1933 ir_constant *constant_index;
1934 src_reg src;
1935 int array_stride = compute_array_stride(ir);
1936
1937 constant_index = ir->array_index->constant_expression_value();
1938
1939 ir->array->accept(this);
1940 src = this->result;
1941
1942 if (constant_index) {
1943 src.reg_offset += constant_index->value.i[0] * array_stride;
1944 } else {
1945 /* Variable index array dereference. It eats the "vec4" of the
1946 * base of the array and an index that offsets the Mesa register
1947 * index.
1948 */
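/* For example, indexing an array of mat4s has an array_stride of 4, so
 * index i contributes 4 * i vec4 registers to the reladdr below.
 */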
1949 ir->array_index->accept(this);
1950
1951 src_reg index_reg;
1952
1953 if (array_stride == 1) {
1954 index_reg = this->result;
1955 } else {
1956 index_reg = src_reg(this, glsl_type::int_type);
1957
1958 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1959 }
1960
1961 if (src.reladdr) {
1962 src_reg temp = src_reg(this, glsl_type::int_type);
1963
1964 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1965
1966 index_reg = temp;
1967 }
1968
1969 src.reladdr = ralloc(mem_ctx, src_reg);
1970 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1971 }
1972
1973 /* If the type is smaller than a vec4, replicate the last channel out. */
1974 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1975 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1976 else
1977 src.swizzle = BRW_SWIZZLE_NOOP;
1978 src.type = brw_type_for_base_type(ir->type);
1979
1980 this->result = src;
1981 }
1982
1983 void
1984 vec4_visitor::visit(ir_dereference_record *ir)
1985 {
1986 unsigned int i;
1987 const glsl_type *struct_type = ir->record->type;
1988 int offset = 0;
1989
1990 ir->record->accept(this);
1991
1992 for (i = 0; i < struct_type->length; i++) {
1993 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1994 break;
1995 offset += type_size(struct_type->fields.structure[i].type);
1996 }
1997
1998 /* If the type is smaller than a vec4, replicate the last channel out. */
1999 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2000 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2001 else
2002 this->result.swizzle = BRW_SWIZZLE_NOOP;
2003 this->result.type = brw_type_for_base_type(ir->type);
2004
2005 this->result.reg_offset += offset;
2006 }
2007
2008 /**
2009 * We want to be careful in assignment setup to hit the actual storage
2010 * instead of potentially using a temporary like we might with the
2011 * ir_dereference handler.
2012 */
2013 static dst_reg
2014 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2015 {
2016 /* The LHS must be a dereference. If the LHS is a variable indexed array
2017 * access of a vector, it must be separated into a series of conditional moves
2018 * before reaching this point (see ir_vec_index_to_cond_assign).
2019 */
2020 assert(ir->as_dereference());
2021 ir_dereference_array *deref_array = ir->as_dereference_array();
2022 if (deref_array) {
2023 assert(!deref_array->array->type->is_vector());
2024 }
2025
2026 /* Use the rvalue deref handler for the most part. We'll ignore
2027 * swizzles in it and write swizzles using writemask, though.
2028 */
2029 ir->accept(v);
2030 return dst_reg(v->result);
2031 }
2032
2033 void
2034 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2035 const struct glsl_type *type,
2036 enum brw_predicate predicate)
2037 {
2038 if (type->base_type == GLSL_TYPE_STRUCT) {
2039 for (unsigned int i = 0; i < type->length; i++) {
2040 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2041 }
2042 return;
2043 }
2044
2045 if (type->is_array()) {
2046 for (unsigned int i = 0; i < type->length; i++) {
2047 emit_block_move(dst, src, type->fields.array, predicate);
2048 }
2049 return;
2050 }
2051
2052 if (type->is_matrix()) {
2053 const struct glsl_type *vec_type;
2054
2055 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2056 type->vector_elements, 1);
2057
2058 for (int i = 0; i < type->matrix_columns; i++) {
2059 emit_block_move(dst, src, vec_type, predicate);
2060 }
2061 return;
2062 }
2063
2064 assert(type->is_scalar() || type->is_vector());
2065
2066 dst->type = brw_type_for_base_type(type);
2067 src->type = dst->type;
2068
2069 dst->writemask = (1 << type->vector_elements) - 1;
2070
2071 src->swizzle = swizzle_for_size(type->vector_elements);
2072
2073 vec4_instruction *inst = emit(MOV(*dst, *src));
2074 inst->predicate = predicate;
2075
2076 dst->reg_offset++;
2077 src->reg_offset++;
2078 }
2079
2080
2081 /* If the RHS processing resulted in an instruction generating a
2082 * temporary value, and it would be easy to rewrite the instruction to
2083 * generate its result right into the LHS instead, do so. This ends
2084 * up reliably removing instructions where it can be tricky to do so
2085 * later without real UD chain information.
2086 */
2087 bool
2088 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2089 dst_reg dst,
2090 src_reg src,
2091 vec4_instruction *pre_rhs_inst,
2092 vec4_instruction *last_rhs_inst)
2093 {
2094 /* This could be supported, but it would take more smarts. */
2095 if (ir->condition)
2096 return false;
2097
2098 if (pre_rhs_inst == last_rhs_inst)
2099 return false; /* No instructions generated to work with. */
2100
2101 /* Make sure the last instruction generated our source reg. */
2102 if (src.file != GRF ||
2103 src.file != last_rhs_inst->dst.file ||
2104 src.reg != last_rhs_inst->dst.reg ||
2105 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2106 src.reladdr ||
2107 src.abs ||
2108 src.negate ||
2109 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2110 return false;
2111
2112 /* Check that the last instruction fully initialized the channels
2113 * we want to use, in the order we want to use them. We could
2114 * potentially reswizzle the operands of many instructions so that
2115 * we could handle out of order channels, but don't yet.
2116 */
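/* For example, if the RHS result would be read back as .yx we bail out,
 * since destination channel 0 would need source channel 1.
 */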
2117
2118 for (unsigned i = 0; i < 4; i++) {
2119 if (dst.writemask & (1 << i)) {
2120 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2121 return false;
2122
2123 if (BRW_GET_SWZ(src.swizzle, i) != i)
2124 return false;
2125 }
2126 }
2127
2128 /* Success! Rewrite the instruction. */
2129 last_rhs_inst->dst.file = dst.file;
2130 last_rhs_inst->dst.reg = dst.reg;
2131 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2132 last_rhs_inst->dst.reladdr = dst.reladdr;
2133 last_rhs_inst->dst.writemask &= dst.writemask;
2134
2135 return true;
2136 }
2137
2138 void
2139 vec4_visitor::visit(ir_assignment *ir)
2140 {
2141 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2142 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2143
2144 if (!ir->lhs->type->is_scalar() &&
2145 !ir->lhs->type->is_vector()) {
2146 ir->rhs->accept(this);
2147 src_reg src = this->result;
2148
2149 if (ir->condition) {
2150 emit_bool_to_cond_code(ir->condition, &predicate);
2151 }
2152
2153 /* emit_block_move doesn't account for swizzles in the source register.
2154 * This should be ok, since the source register is a structure or an
2155 * array, and those can't be swizzled. But double-check to be sure.
2156 */
2157 assert(src.swizzle ==
2158 (ir->rhs->type->is_matrix()
2159 ? swizzle_for_size(ir->rhs->type->vector_elements)
2160 : BRW_SWIZZLE_NOOP));
2161
2162 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2163 return;
2164 }
2165
2166 /* Now we're down to just a scalar/vector with writemasks. */
2167 int i;
2168
2169 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2170 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2171
2172 ir->rhs->accept(this);
2173
2174 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2175
2176 src_reg src = this->result;
2177
2178 int swizzles[4];
2179 int first_enabled_chan = 0;
2180 int src_chan = 0;
2181
2182 assert(ir->lhs->type->is_vector() ||
2183 ir->lhs->type->is_scalar());
2184 dst.writemask = ir->write_mask;
2185
2186 for (int i = 0; i < 4; i++) {
2187 if (dst.writemask & (1 << i)) {
2188 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2189 break;
2190 }
2191 }
2192
2193 /* Swizzle a small RHS vector into the channels being written.
2194 *
2195 * GLSL IR treats write_mask as dictating how many channels are
2196 * present on the RHS, while in our instructions we need those
2197 * channels to appear in the slots of the vec4 they're written to.
2198 */
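/* For example, for "v.zw = foo.xy" the RHS arrives swizzled .xyyy and is
 * remapped to .yyxy, so channel z reads x and channel w reads y.
 */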
2199 for (int i = 0; i < 4; i++) {
2200 if (dst.writemask & (1 << i))
2201 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2202 else
2203 swizzles[i] = first_enabled_chan;
2204 }
2205 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2206 swizzles[2], swizzles[3]);
2207
2208 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2209 return;
2210 }
2211
2212 if (ir->condition) {
2213 emit_bool_to_cond_code(ir->condition, &predicate);
2214 }
2215
2216 for (i = 0; i < type_size(ir->lhs->type); i++) {
2217 vec4_instruction *inst = emit(MOV(dst, src));
2218 inst->predicate = predicate;
2219
2220 dst.reg_offset++;
2221 src.reg_offset++;
2222 }
2223 }
2224
2225 void
2226 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2227 {
2228 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2229 foreach_in_list(ir_constant, field_value, &ir->components) {
2230 emit_constant_values(dst, field_value);
2231 }
2232 return;
2233 }
2234
2235 if (ir->type->is_array()) {
2236 for (unsigned int i = 0; i < ir->type->length; i++) {
2237 emit_constant_values(dst, ir->array_elements[i]);
2238 }
2239 return;
2240 }
2241
2242 if (ir->type->is_matrix()) {
2243 for (int i = 0; i < ir->type->matrix_columns; i++) {
2244 float *vec = &ir->value.f[i * ir->type->vector_elements];
2245
2246 for (int j = 0; j < ir->type->vector_elements; j++) {
2247 dst->writemask = 1 << j;
2248 dst->type = BRW_REGISTER_TYPE_F;
2249
2250 emit(MOV(*dst, src_reg(vec[j])));
2251 }
2252 dst->reg_offset++;
2253 }
2254 return;
2255 }
2256
2257 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2258
2259 for (int i = 0; i < ir->type->vector_elements; i++) {
2260 if (!(remaining_writemask & (1 << i)))
2261 continue;
2262
2263 dst->writemask = 1 << i;
2264 dst->type = brw_type_for_base_type(ir->type);
2265
2266 /* Find other components that match the one we're about to
2267 * write. Emits fewer instructions for things like vec4(0.5,
2268 * 1.5, 1.5, 1.5).
2269 */
2270 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2271 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2272 if (ir->value.b[i] == ir->value.b[j])
2273 dst->writemask |= (1 << j);
2274 } else {
2275 /* u, i, and f storage all line up, so no need for a
2276 * switch case for comparing each type.
2277 */
2278 if (ir->value.u[i] == ir->value.u[j])
2279 dst->writemask |= (1 << j);
2280 }
2281 }
2282
2283 switch (ir->type->base_type) {
2284 case GLSL_TYPE_FLOAT:
2285 emit(MOV(*dst, src_reg(ir->value.f[i])));
2286 break;
2287 case GLSL_TYPE_INT:
2288 emit(MOV(*dst, src_reg(ir->value.i[i])));
2289 break;
2290 case GLSL_TYPE_UINT:
2291 emit(MOV(*dst, src_reg(ir->value.u[i])));
2292 break;
2293 case GLSL_TYPE_BOOL:
2294 emit(MOV(*dst,
2295 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2296 : 0u)));
2297 break;
2298 default:
2299 unreachable("Non-float/uint/int/bool constant");
2300 }
2301
2302 remaining_writemask &= ~dst->writemask;
2303 }
2304 dst->reg_offset++;
2305 }
2306
2307 void
2308 vec4_visitor::visit(ir_constant *ir)
2309 {
2310 dst_reg dst = dst_reg(this, ir->type);
2311 this->result = src_reg(dst);
2312
2313 emit_constant_values(&dst, ir);
2314 }
2315
2316 void
2317 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2318 {
2319 ir_dereference *deref = static_cast<ir_dereference *>(
2320 ir->actual_parameters.get_head());
2321 ir_variable *location = deref->variable_referenced();
2322 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2323 location->data.binding);
2324
2325 /* Calculate the surface offset */
2326 src_reg offset(this, glsl_type::uint_type);
2327 ir_dereference_array *deref_array = deref->as_dereference_array();
2328 if (deref_array) {
2329 deref_array->array_index->accept(this);
2330
2331 src_reg tmp(this, glsl_type::uint_type);
2332 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2333 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2334 } else {
2335 offset = location->data.atomic.offset;
2336 }
2337
2338 /* Emit the appropriate machine instruction */
2339 const char *callee = ir->callee->function_name();
2340 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2341
2342 if (!strcmp("__intrinsic_atomic_read", callee)) {
2343 emit_untyped_surface_read(surf_index, dst, offset);
2344
2345 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2346 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2347 src_reg(), src_reg());
2348
2349 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2350 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2351 src_reg(), src_reg());
2352 }
2353 }
2354
2355 void
2356 vec4_visitor::visit(ir_call *ir)
2357 {
2358 const char *callee = ir->callee->function_name();
2359
2360 if (!strcmp("__intrinsic_atomic_read", callee) ||
2361 !strcmp("__intrinsic_atomic_increment", callee) ||
2362 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2363 visit_atomic_counter_intrinsic(ir);
2364 } else {
2365 unreachable("Unsupported intrinsic.");
2366 }
2367 }
2368
2369 src_reg
2370 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2371 {
2372 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2373 inst->base_mrf = 2;
2374 inst->mlen = 1;
2375 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2376 inst->dst.writemask = WRITEMASK_XYZW;
2377
2378 inst->src[1] = sampler;
2379
2380 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2381 int param_base = inst->base_mrf;
2382 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2383 int zero_mask = 0xf & ~coord_mask;
2384
2385 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2386 coordinate));
2387
2388 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2389 src_reg(0)));
2390
2391 emit(inst);
2392 return src_reg(inst->dst);
2393 }
2394
2395 static bool
2396 is_high_sampler(struct brw_context *brw, src_reg sampler)
2397 {
2398 if (brw->gen < 8 && !brw->is_haswell)
2399 return false;
2400
2401 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2402 }
2403
2404 void
2405 vec4_visitor::visit(ir_texture *ir)
2406 {
2407 uint32_t sampler =
2408 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2409
2410 ir_rvalue *nonconst_sampler_index =
2411 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2412
2413 /* Handle non-constant sampler array indexing */
2414 src_reg sampler_reg;
2415 if (nonconst_sampler_index) {
2416 /* The highest sampler which may be used by this operation is
2417 * the last element of the array. Mark it here, because the generator
2418 * doesn't have enough information to determine the bound.
2419 */
2420 uint32_t array_size = ir->sampler->as_dereference_array()
2421 ->array->type->array_size();
2422
2423 uint32_t max_used = sampler + array_size - 1;
2424 if (ir->op == ir_tg4 && brw->gen < 8) {
2425 max_used += prog_data->base.binding_table.gather_texture_start;
2426 } else {
2427 max_used += prog_data->base.binding_table.texture_start;
2428 }
2429
2430 brw_mark_surface_used(&prog_data->base, max_used);
2431
2432 /* Emit code to evaluate the actual indexing expression */
2433 nonconst_sampler_index->accept(this);
2434 dst_reg temp(this, glsl_type::uint_type);
2435 emit(ADD(temp, this->result, src_reg(sampler)))
2436 ->force_writemask_all = true;
2437 sampler_reg = src_reg(temp);
2438 } else {
2439 /* Single sampler, or constant array index; the indexing expression
2440 * is just an immediate.
2441 */
2442 sampler_reg = src_reg(sampler);
2443 }
2444
2445 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2446 * emitting anything other than setting up the constant result.
2447 */
2448 if (ir->op == ir_tg4) {
2449 ir_constant *chan = ir->lod_info.component->as_constant();
2450 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2451 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2452 dst_reg result(this, ir->type);
2453 this->result = src_reg(result);
2454 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2455 return;
2456 }
2457 }
2458
2459 /* Should be lowered by do_lower_texture_projection */
2460 assert(!ir->projector);
2461
2462 /* Should be lowered */
2463 assert(!ir->offset || !ir->offset->type->is_array());
2464
2465 /* Generate code to compute all the subexpression trees. This has to be
2466 * done before loading any values into MRFs for the sampler message since
2467 * generating these values may involve SEND messages that need the MRFs.
2468 */
2469 src_reg coordinate;
2470 if (ir->coordinate) {
2471 ir->coordinate->accept(this);
2472 coordinate = this->result;
2473 }
2474
2475 src_reg shadow_comparitor;
2476 if (ir->shadow_comparitor) {
2477 ir->shadow_comparitor->accept(this);
2478 shadow_comparitor = this->result;
2479 }
2480
2481 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2482 src_reg offset_value;
2483 if (has_nonconstant_offset) {
2484 ir->offset->accept(this);
2485 offset_value = src_reg(this->result);
2486 }
2487
2488 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2489 src_reg lod, dPdx, dPdy, sample_index, mcs;
2490 switch (ir->op) {
2491 case ir_tex:
2492 lod = src_reg(0.0f);
2493 lod_type = glsl_type::float_type;
2494 break;
2495 case ir_txf:
2496 case ir_txl:
2497 case ir_txs:
2498 ir->lod_info.lod->accept(this);
2499 lod = this->result;
2500 lod_type = ir->lod_info.lod->type;
2501 break;
2502 case ir_query_levels:
2503 lod = src_reg(0);
2504 lod_type = glsl_type::int_type;
2505 break;
2506 case ir_txf_ms:
2507 ir->lod_info.sample_index->accept(this);
2508 sample_index = this->result;
2509 sample_index_type = ir->lod_info.sample_index->type;
2510
2511 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2512 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2513 else
2514 mcs = src_reg(0u);
2515 break;
2516 case ir_txd:
2517 ir->lod_info.grad.dPdx->accept(this);
2518 dPdx = this->result;
2519
2520 ir->lod_info.grad.dPdy->accept(this);
2521 dPdy = this->result;
2522
2523 lod_type = ir->lod_info.grad.dPdx->type;
2524 break;
2525 case ir_txb:
2526 case ir_lod:
2527 case ir_tg4:
2528 break;
2529 }
2530
2531 enum opcode opcode;
2532 switch (ir->op) {
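/* There are no implicit derivatives in the vertex shader, so plain
 * texture() (ir_tex) is implemented as TXL with the explicit LOD of 0.0
 * set up above.
 */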
2533 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2534 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2535 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2536 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2537 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2538 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2539 case ir_tg4: opcode = has_nonconstant_offset
2540 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2541 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2542 case ir_txb:
2543 unreachable("TXB is not valid for vertex shaders.");
2544 case ir_lod:
2545 unreachable("LOD is not valid for vertex shaders.");
2546 default:
2547 unreachable("Unrecognized tex op");
2548 }
2549
2550 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2551
2552 if (ir->offset != NULL && !has_nonconstant_offset) {
2553 inst->texture_offset =
2554 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2555 ir->offset->type->vector_elements);
2556 }
2557
2558 /* Stuff the channel select bits in the top of the texture offset */
2559 if (ir->op == ir_tg4)
2560 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2561
2562 /* The message header is necessary for:
2563 * - Gen4 (always)
2564 * - Texel offsets
2565 * - Gather channel selection
2566 * - Sampler indices too large to fit in a 4-bit value.
2567 */
2568 inst->header_present =
2569 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2570 is_high_sampler(brw, sampler_reg);
2571 inst->base_mrf = 2;
2572 inst->mlen = inst->header_present + 1; /* always at least one */
2573 inst->dst = dst_reg(this, ir->type);
2574 inst->dst.writemask = WRITEMASK_XYZW;
2575 inst->shadow_compare = ir->shadow_comparitor != NULL;
2576
2577 inst->src[1] = sampler_reg;
2578
2579 /* MRF for the first parameter */
2580 int param_base = inst->base_mrf + inst->header_present;
2581
2582 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2583 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2584 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2585 } else {
2586 /* Load the coordinate */
2587 /* FINISHME: gl_clamp_mask and saturate */
2588 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2589 int zero_mask = 0xf & ~coord_mask;
2590
2591 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2592 coordinate));
2593
2594 if (zero_mask != 0) {
2595 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2596 src_reg(0)));
2597 }
2598 /* Load the shadow comparitor */
2599 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2600 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2601 WRITEMASK_X),
2602 shadow_comparitor));
2603 inst->mlen++;
2604 }
2605
2606 /* Load the LOD info */
2607 if (ir->op == ir_tex || ir->op == ir_txl) {
2608 int mrf, writemask;
2609 if (brw->gen >= 5) {
2610 mrf = param_base + 1;
2611 if (ir->shadow_comparitor) {
2612 writemask = WRITEMASK_Y;
2613 /* mlen already incremented */
2614 } else {
2615 writemask = WRITEMASK_X;
2616 inst->mlen++;
2617 }
2618 } else /* brw->gen == 4 */ {
2619 mrf = param_base;
2620 writemask = WRITEMASK_W;
2621 }
2622 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2623 } else if (ir->op == ir_txf) {
2624 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2625 } else if (ir->op == ir_txf_ms) {
2626 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2627 sample_index));
2628 if (brw->gen >= 7) {
2629 /* MCS data is in the first channel of `mcs`, but we need to get it into
2630 * the .y channel of the second vec4 of params, so replicate .x across
2631 * the whole vec4 and then mask off everything except .y
2632 */
2633 mcs.swizzle = BRW_SWIZZLE_XXXX;
2634 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2635 mcs));
2636 }
2637 inst->mlen++;
2638 } else if (ir->op == ir_txd) {
2639 const glsl_type *type = lod_type;
2640
2641 if (brw->gen >= 5) {
2642 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2643 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2644 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2645 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2646 inst->mlen++;
2647
2648 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2649 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2650 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2651 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2652 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2653 inst->mlen++;
2654
2655 if (ir->shadow_comparitor) {
2656 emit(MOV(dst_reg(MRF, param_base + 2,
2657 ir->shadow_comparitor->type, WRITEMASK_Z),
2658 shadow_comparitor));
2659 }
2660 }
2661 } else /* brw->gen == 4 */ {
2662 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2663 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2664 inst->mlen += 2;
2665 }
2666 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2667 if (ir->shadow_comparitor) {
2668 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2669 shadow_comparitor));
2670 }
2671
2672 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2673 offset_value));
2674 inst->mlen++;
2675 }
2676 }
2677
2678 emit(inst);
2679
2680 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2681 * faces * layers, but the spec requires layers.
2682 */
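/* For example, a cube map array with 4 layers reports 24 in .z, and the
 * INT_QUOTIENT by 6 below recovers the 4 layers the spec requires.
 */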
2683 if (ir->op == ir_txs) {
2684 glsl_type const *type = ir->sampler->type;
2685 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2686 type->sampler_array) {
2687 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2688 writemask(inst->dst, WRITEMASK_Z),
2689 src_reg(inst->dst), src_reg(6));
2690 }
2691 }
2692
2693 if (brw->gen == 6 && ir->op == ir_tg4) {
2694 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2695 }
2696
2697 swizzle_result(ir, src_reg(inst->dst), sampler);
2698 }
2699
2700 /**
2701 * Apply workarounds for Gen6 gather with UINT/SINT
2702 */
2703 void
2704 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2705 {
2706 if (!wa)
2707 return;
2708
2709 int width = (wa & WA_8BIT) ? 8 : 16;
2710 dst_reg dst_f = dst;
2711 dst_f.type = BRW_REGISTER_TYPE_F;
2712
2713 /* Convert from UNORM to UINT */
2714 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2715 emit(MOV(dst, src_reg(dst_f)));
2716
2717 if (wa & WA_SIGN) {
2718 /* Reinterpret the UINT value as a signed INT value by
2719 * shifting the sign bit into place, then shifting back
2720 * preserving sign.
2721 */
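/* For example, with an 8-bit result: SHL by 24 moves bit 7 into the sign
 * position, and ASR by 24 sign-extends it back down.
 */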
2722 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2723 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2724 }
2725 }
2726
2727 /**
2728 * Set up the gather channel based on the swizzle, for gather4.
2729 */
2730 uint32_t
2731 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2732 {
2733 ir_constant *chan = ir->lod_info.component->as_constant();
2734 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2735 switch (swiz) {
2736 case SWIZZLE_X: return 0;
2737 case SWIZZLE_Y:
2738 /* gather4 sampler is broken for green channel on RG32F --
2739 * we must ask for blue instead.
2740 */
2741 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2742 return 2;
2743 return 1;
2744 case SWIZZLE_Z: return 2;
2745 case SWIZZLE_W: return 3;
2746 default:
2747 unreachable("Not reached"); /* zero, one swizzles handled already */
2748 }
2749 }
2750
2751 void
2752 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2753 {
2754 int s = key->tex.swizzles[sampler];
2755
2756 this->result = src_reg(this, ir->type);
2757 dst_reg swizzled_result(this->result);
2758
2759 if (ir->op == ir_query_levels) {
2760 /* # levels is in .w */
2761 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2762 emit(MOV(swizzled_result, orig_val));
2763 return;
2764 }
2765
2766 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2767 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2768 emit(MOV(swizzled_result, orig_val));
2769 return;
2770 }
2771
2772
2773 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2774 int swizzle[4] = {0};
2775
2776 for (int i = 0; i < 4; i++) {
2777 switch (GET_SWZ(s, i)) {
2778 case SWIZZLE_ZERO:
2779 zero_mask |= (1 << i);
2780 break;
2781 case SWIZZLE_ONE:
2782 one_mask |= (1 << i);
2783 break;
2784 default:
2785 copy_mask |= (1 << i);
2786 swizzle[i] = GET_SWZ(s, i);
2787 break;
2788 }
2789 }
2790
2791 if (copy_mask) {
2792 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2793 swizzled_result.writemask = copy_mask;
2794 emit(MOV(swizzled_result, orig_val));
2795 }
2796
2797 if (zero_mask) {
2798 swizzled_result.writemask = zero_mask;
2799 emit(MOV(swizzled_result, src_reg(0.0f)));
2800 }
2801
2802 if (one_mask) {
2803 swizzled_result.writemask = one_mask;
2804 emit(MOV(swizzled_result, src_reg(1.0f)));
2805 }
2806 }
2807
2808 void
2809 vec4_visitor::visit(ir_return *)
2810 {
2811 unreachable("not reached");
2812 }
2813
2814 void
2815 vec4_visitor::visit(ir_discard *)
2816 {
2817 unreachable("not reached");
2818 }
2819
2820 void
2821 vec4_visitor::visit(ir_if *ir)
2822 {
2823 /* Don't point the annotation at the if statement, because then it plus
2824 * the then and else blocks get printed.
2825 */
2826 this->base_ir = ir->condition;
2827
2828 if (brw->gen == 6) {
2829 emit_if_gen6(ir);
2830 } else {
2831 enum brw_predicate predicate;
2832 emit_bool_to_cond_code(ir->condition, &predicate);
2833 emit(IF(predicate));
2834 }
2835
2836 visit_instructions(&ir->then_instructions);
2837
2838 if (!ir->else_instructions.is_empty()) {
2839 this->base_ir = ir->condition;
2840 emit(BRW_OPCODE_ELSE);
2841
2842 visit_instructions(&ir->else_instructions);
2843 }
2844
2845 this->base_ir = ir->condition;
2846 emit(BRW_OPCODE_ENDIF);
2847 }
2848
2849 void
2850 vec4_visitor::visit(ir_emit_vertex *)
2851 {
2852 unreachable("not reached");
2853 }
2854
2855 void
2856 vec4_visitor::visit(ir_end_primitive *)
2857 {
2858 unreachable("not reached");
2859 }
2860
2861 void
2862 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2863 dst_reg dst, src_reg offset,
2864 src_reg src0, src_reg src1)
2865 {
2866 unsigned mlen = 0;
2867
2868 /* Set the atomic operation offset. */
2869 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2870 mlen++;
2871
2872 /* Set the atomic operation arguments. */
2873 if (src0.file != BAD_FILE) {
2874 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2875 mlen++;
2876 }
2877
2878 if (src1.file != BAD_FILE) {
2879 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2880 mlen++;
2881 }
2882
2883 /* Emit the instruction. Note that this maps to the normal SIMD8
2884 * untyped atomic message on Ivy Bridge, but that's OK because
2885 * unused channels will be masked out.
2886 */
2887 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2888 src_reg(atomic_op), src_reg(surf_index));
2889 inst->base_mrf = 0;
2890 inst->mlen = mlen;
2891 }
2892
2893 void
2894 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2895 src_reg offset)
2896 {
2897 /* Set the surface read offset. */
2898 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2899
2900 /* Emit the instruction. Note that this maps to the normal SIMD8
2901 * untyped surface read message, but that's OK because unused
2902 * channels will be masked out.
2903 */
2904 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2905 dst, src_reg(surf_index));
2906 inst->base_mrf = 0;
2907 inst->mlen = 1;
2908 }
2909
2910 void
2911 vec4_visitor::emit_ndc_computation()
2912 {
2913 /* Get the position */
2914 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2915
2916 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2917 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2918 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2919
2920 current_annotation = "NDC";
2921 dst_reg ndc_w = ndc;
2922 ndc_w.writemask = WRITEMASK_W;
2923 src_reg pos_w = pos;
2924 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2925 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2926
2927 dst_reg ndc_xyz = ndc;
2928 ndc_xyz.writemask = WRITEMASK_XYZ;
2929
2930 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2931 }
2932
2933 void
2934 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2935 {
2936 if (brw->gen < 6 &&
2937 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2938 key->userclip_active || brw->has_negative_rhw_bug)) {
2939 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2940 dst_reg header1_w = header1;
2941 header1_w.writemask = WRITEMASK_W;
2942
2943 emit(MOV(header1, 0u));
2944
2945 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2946 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2947
2948 current_annotation = "Point size";
2949 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2950 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2951 }
2952
2953 if (key->userclip_active) {
2954 current_annotation = "Clipping flags";
2955 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2956 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2957
2958 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2959 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2960 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2961
2962 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2963 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2964 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2965 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2966 }
2967
2968 /* i965 clipping workaround:
2969 * 1) Test for negative rhw
2970 * 2) If set,
2971 * set ndc = (0,0,0,0)
2972 * set ucp[6] = 1
2973 *
2974 * Later, clipping will detect ucp[6] and ensure the primitive is
2975 * clipped against all fixed planes.
2976 */
2977 if (brw->has_negative_rhw_bug) {
2978 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2979 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2980 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2981 vec4_instruction *inst;
2982 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2983 inst->predicate = BRW_PREDICATE_NORMAL;
2984 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2985 inst->predicate = BRW_PREDICATE_NORMAL;
2986 }
2987
2988 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2989 } else if (brw->gen < 6) {
2990 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2991 } else {
2992 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2993 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2994 dst_reg reg_w = reg;
2995 reg_w.writemask = WRITEMASK_W;
2996 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
2997 }
2998 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2999 dst_reg reg_y = reg;
3000 reg_y.writemask = WRITEMASK_Y;
3001 reg_y.type = BRW_REGISTER_TYPE_D;
3002 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3003 }
3004 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3005 dst_reg reg_z = reg;
3006 reg_z.writemask = WRITEMASK_Z;
3007 reg_z.type = BRW_REGISTER_TYPE_D;
3008 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3009 }
3010 }
3011 }
3012
3013 void
3014 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3015 {
3016 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3017 *
3018 * "If a linked set of shaders forming the vertex stage contains no
3019 * static write to gl_ClipVertex or gl_ClipDistance, but the
3020 * application has requested clipping against user clip planes through
3021 * the API, then the coordinate written to gl_Position is used for
3022 * comparison against the user clip planes."
3023 *
3024 * This function is only called if the shader didn't write to
3025 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3026 * if the user wrote to it; otherwise we use gl_Position.
3027 */
3028 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3029 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3030 clip_vertex = VARYING_SLOT_POS;
3031 }
3032
3033 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3034 ++i) {
3035 reg.writemask = 1 << i;
3036 emit(DP4(reg,
3037 src_reg(output_reg[clip_vertex]),
3038 src_reg(this->userplane[i + offset])));
3039 }
3040 }
3041
3042 void
3043 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3044 {
3045 assert(varying < VARYING_SLOT_MAX);
3046 reg.type = output_reg[varying].type;
3047 current_annotation = output_reg_annotation[varying];
3048 /* Copy the register, saturating if necessary */
3049 vec4_instruction *inst = emit(MOV(reg,
3050 src_reg(output_reg[varying])));
3051 if ((varying == VARYING_SLOT_COL0 ||
3052 varying == VARYING_SLOT_COL1 ||
3053 varying == VARYING_SLOT_BFC0 ||
3054 varying == VARYING_SLOT_BFC1) &&
3055 key->clamp_vertex_color) {
3056 inst->saturate = true;
3057 }
3058 }
3059
3060 void
3061 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3062 {
3063 reg.type = BRW_REGISTER_TYPE_F;
3064
3065 switch (varying) {
3066 case VARYING_SLOT_PSIZ:
3067 {
3068 /* PSIZ is always in slot 0, and is coupled with other flags. */
3069 current_annotation = "indices, point width, clip flags";
3070 emit_psiz_and_flags(reg);
3071 break;
3072 }
3073 case BRW_VARYING_SLOT_NDC:
3074 current_annotation = "NDC";
3075 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3076 break;
3077 case VARYING_SLOT_POS:
3078 current_annotation = "gl_Position";
3079 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3080 break;
3081 case VARYING_SLOT_EDGE:
3082 /* This is present when doing unfilled polygons. We're supposed to copy
3083 * the edge flag from the user-provided vertex array
3084 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3085 * of that attribute (starts as 1.0f). This is then used in clipping to
3086 * determine which edges should be drawn as wireframe.
3087 */
3088 current_annotation = "edge flag";
3089 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3090 glsl_type::float_type, WRITEMASK_XYZW))));
3091 break;
3092 case BRW_VARYING_SLOT_PAD:
3093 /* No need to write to this slot */
3094 break;
3095 default:
3096 emit_generic_urb_slot(reg, varying);
3097 break;
3098 }
3099 }
3100
3101 static int
3102 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3103 {
3104 if (brw->gen >= 6) {
3105 /* URB data written (does not include the message header reg) must
3106 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3107 * section 5.4.3.2.2: URB_INTERLEAVED.
3108 *
3109 * URB entries are allocated on a multiple of 1024 bits, so an
3110 * extra 128 bits written here to make the end align to 256 is
3111 * no problem.
3112 */
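/* For example, a header plus three slots gives mlen == 4 (even), which is
 * bumped to 5 so the post-header payload is an even number of registers.
 */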
3113 if ((mlen % 2) != 1)
3114 mlen++;
3115 }
3116
3117 return mlen;
3118 }
3119
3120
3121 /**
3122 * Generates the VUE payload plus the necessary URB write instructions to
3123 * output it.
3124 *
3125 * The VUE layout is documented in Volume 2a.
3126 */
3127 void
3128 vec4_visitor::emit_vertex()
3129 {
3130 /* MRF 0 is reserved for the debugger, so start with message header
3131 * in MRF 1.
3132 */
3133 int base_mrf = 1;
3134 int mrf = base_mrf;
3135 /* In the process of generating our URB write message contents, we
3136 * may need to unspill a register or load from an array. Those
3137 * reads would use MRFs 14-15.
3138 */
3139 int max_usable_mrf = 13;
3140
3141 /* The following assertion verifies that max_usable_mrf causes an
3142 * even-numbered amount of URB write data, which will meet gen6's
3143 * requirements for length alignment.
3144 */
3145 assert((max_usable_mrf - base_mrf) % 2 == 0);
3146
3147 /* First mrf is the g0-based message header containing URB handles and
3148 * such.
3149 */
3150 emit_urb_write_header(mrf++);
3151
3152 if (brw->gen < 6) {
3153 emit_ndc_computation();
3154 }
3155
3156 /* Lower legacy ff and ClipVertex clipping to clip distances */
3157 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3158 current_annotation = "user clip distances";
3159
3160 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3161 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3162
3163 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3164 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3165 }
3166
3167 /* We may need to split this up into several URB writes, so do them in a
3168 * loop.
3169 */
3170 int slot = 0;
3171 bool complete = false;
3172 do {
3173 /* URB offset is in URB row increments, and each of our MRFs is half of
3174 * one of those, since we're doing interleaved writes.
3175 */
3176 int offset = slot / 2;
3177
3178 mrf = base_mrf + 1;
3179 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3180 emit_urb_slot(dst_reg(MRF, mrf++),
3181 prog_data->vue_map.slot_to_varying[slot]);
3182
3183 /* If this was max_usable_mrf, we can't fit anything more into this
3184 * URB WRITE.
3185 */
3186 if (mrf > max_usable_mrf) {
3187 slot++;
3188 break;
3189 }
3190 }
3191
3192 complete = slot >= prog_data->vue_map.num_slots;
3193 current_annotation = "URB write";
3194 vec4_instruction *inst = emit_urb_write_opcode(complete);
3195 inst->base_mrf = base_mrf;
3196 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3197 inst->offset += offset;
3198 } while (!complete);
3199 }
3200
3201
3202 src_reg
3203 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3204 src_reg *reladdr, int reg_offset)
3205 {
3206 /* Because we store the values to scratch interleaved like our
3207 * vertex data, we need to scale the vec4 index by 2.
3208 */
3209 int message_header_scale = 2;
3210
3211 /* Pre-gen6, the message header uses byte offsets instead of vec4
3212 * (16-byte) offset units.
3213 */
3214 if (brw->gen < 6)
3215 message_header_scale *= 16;
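/* For example, a reg_offset of 3 becomes 3 * 2 = 6 in the interleaved
 * units described above, or 3 * 2 * 16 = 96 bytes pre-gen6.
 */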
3216
3217 if (reladdr) {
3218 src_reg index = src_reg(this, glsl_type::int_type);
3219
3220 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3221 src_reg(reg_offset)));
3222 emit_before(block, inst, MUL(dst_reg(index), index,
3223 src_reg(message_header_scale)));
3224
3225 return index;
3226 } else {
3227 return src_reg(reg_offset * message_header_scale);
3228 }
3229 }
3230
3231 src_reg
3232 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3233 src_reg *reladdr, int reg_offset)
3234 {
3235 if (reladdr) {
3236 src_reg index = src_reg(this, glsl_type::int_type);
3237
3238 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3239 src_reg(reg_offset)));
3240
3241 /* Pre-gen6, the message header uses byte offsets instead of vec4
3242 * (16-byte) offset units.
3243 */
3244 if (brw->gen < 6) {
3245 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3246 }
3247
3248 return index;
3249 } else if (brw->gen >= 8) {
3250 /* Store the offset in a GRF so we can send-from-GRF. */
3251 src_reg offset = src_reg(this, glsl_type::int_type);
3252 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3253 return offset;
3254 } else {
3255 int message_header_scale = brw->gen < 6 ? 16 : 1;
3256 return src_reg(reg_offset * message_header_scale);
3257 }
3258 }
3259
3260 /**
3261 * Emits an instruction before @inst to load the value named by @orig_src
3262 * from scratch space at @base_offset to @temp.
3263 *
3264 * @base_offset is measured in 32-byte units (the size of a register).
3265 */
3266 void
3267 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3268 dst_reg temp, src_reg orig_src,
3269 int base_offset)
3270 {
3271 int reg_offset = base_offset + orig_src.reg_offset;
3272 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3273 reg_offset);
3274
3275 emit_before(block, inst, SCRATCH_READ(temp, index));
3276 }
3277
3278 /**
3279 * Emits an instruction after @inst to store the value to be written
3280 * to @orig_dst to scratch space at @base_offset, from @temp.
3281 *
3282 * @base_offset is measured in 32-byte units (the size of a register).
3283 */
3284 void
3285 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3286 int base_offset)
3287 {
3288 int reg_offset = base_offset + inst->dst.reg_offset;
3289 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3290 reg_offset);
3291
3292 /* Create a temporary register to store *inst's result in.
3293 *
3294 * We have to be careful in MOVing from our temporary result register in
3295 * the scratch write. If we swizzle from channels of the temporary that
3296 * weren't initialized, it will confuse live interval analysis, which will
3297 * make spilling fail to make progress.
3298 */
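/* For example, for an instruction that only writes .y, temp is read back
 * with swizzle .yyyy so that no undefined channel is ever sourced.
 */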
3299 src_reg temp = src_reg(this, glsl_type::vec4_type);
3300 temp.type = inst->dst.type;
3301 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3302 int swizzles[4];
3303 for (int i = 0; i < 4; i++)
3304 if (inst->dst.writemask & (1 << i))
3305 swizzles[i] = i;
3306 else
3307 swizzles[i] = first_writemask_chan;
3308 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3309 swizzles[2], swizzles[3]);
3310
3311 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3312 inst->dst.writemask));
3313 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3314 write->predicate = inst->predicate;
3315 write->ir = inst->ir;
3316 write->annotation = inst->annotation;
3317 inst->insert_after(block, write);
3318
3319 inst->dst.file = temp.file;
3320 inst->dst.reg = temp.reg;
3321 inst->dst.reg_offset = temp.reg_offset;
3322 inst->dst.reladdr = NULL;
3323 }
3324
3325 /**
3326 * We can't generally support array access in GRF space, because a
3327 * single instruction's destination can only span 2 contiguous
3328 * registers. So, we send all GRF arrays that get variable index
3329 * access to scratch space.
3330 */
3331 void
3332 vec4_visitor::move_grf_array_access_to_scratch()
3333 {
3334 int scratch_loc[this->virtual_grf_count];
3335 memset(scratch_loc, -1, sizeof(scratch_loc));
3336
3337 /* First, calculate the set of virtual GRFs that need to be punted
3338 * to scratch due to having any array access on them, and where in
3339 * scratch.
3340 */
3341 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3342 if (inst->dst.file == GRF && inst->dst.reladdr &&
3343 scratch_loc[inst->dst.reg] == -1) {
3344 scratch_loc[inst->dst.reg] = c->last_scratch;
3345 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3346 }
3347
3348 for (int i = 0; i < 3; i++) {
3349 src_reg *src = &inst->src[i];
3350
3351 if (src->file == GRF && src->reladdr &&
3352 scratch_loc[src->reg] == -1) {
3353 scratch_loc[src->reg] = c->last_scratch;
3354 c->last_scratch += this->virtual_grf_sizes[src->reg];
3355 }
3356 }
3357 }
3358
3359 /* Now, for anything that will be accessed through scratch, rewrite
3360 * it to load/store. Note that this is a _safe list walk, because
3361 * we may generate a new scratch_write instruction after the one
3362 * we're processing.
3363 */
3364 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3365 /* Set up the annotation tracking for new generated instructions. */
3366 base_ir = inst->ir;
3367 current_annotation = inst->annotation;
3368
3369 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3370 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3371 }
3372
3373 for (int i = 0; i < 3; i++) {
3374 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3375 continue;
3376
3377 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3378
3379 emit_scratch_read(block, inst, temp, inst->src[i],
3380 scratch_loc[inst->src[i].reg]);
3381
3382 inst->src[i].file = temp.file;
3383 inst->src[i].reg = temp.reg;
3384 inst->src[i].reg_offset = temp.reg_offset;
3385 inst->src[i].reladdr = NULL;
3386 }
3387 }
3388 }
3389
3390 /**
3391 * Emits an instruction before @inst to load the value named by @orig_src
3392 * from the pull constant buffer (surface) at @base_offset to @temp.
3393 */
3394 void
3395 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3396 dst_reg temp, src_reg orig_src,
3397 int base_offset)
3398 {
3399 int reg_offset = base_offset + orig_src.reg_offset;
3400 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3401 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3402 reg_offset);
3403 vec4_instruction *load;
3404
3405 if (brw->gen >= 7) {
3406 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3407 grf_offset.type = offset.type;
3408 emit_before(block, inst, MOV(grf_offset, offset));
3409
3410 load = new(mem_ctx) vec4_instruction(this,
3411 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3412 temp, index, src_reg(grf_offset));
3413 } else {
3414 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3415 temp, index, offset);
3416 load->base_mrf = 14;
3417 load->mlen = 1;
3418 }
3419 emit_before(block, inst, load);
3420 }
3421
3422 /**
3423 * Implements array access of uniforms by inserting a
3424 * PULL_CONSTANT_LOAD instruction.
3425 *
3426 * Unlike temporary GRF array access (where we don't support it due to
3427 * the difficulty of doing relative addressing on instruction
3428 * destinations), we could potentially do array access of uniforms
3429 * that were loaded in GRF space as push constants. In real-world
3430 * usage we've seen, though, the arrays being used are always larger
3431 * than we could load as push constants, so just always move all
3432 * uniform array access out to a pull constant buffer.
3433 */
3434 void
3435 vec4_visitor::move_uniform_array_access_to_pull_constants()
3436 {
3437 int pull_constant_loc[this->uniforms];
3438 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3439
3440 /* Walk through and find array access of uniforms. Put a copy of that
3441 * uniform in the pull constant buffer.
3442 *
3443 * Note that we don't move constant-indexed accesses to arrays. No
3444 * testing has been done of the performance impact of this choice.
3445 */
3446 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3447 for (int i = 0; i < 3; i++) {
3448 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3449 continue;
3450
3451 int uniform = inst->src[i].reg;
3452
3453 /* If this array isn't already present in the pull constant buffer,
3454 * add it.
3455 */
3456 if (pull_constant_loc[uniform] == -1) {
3457 const gl_constant_value **values =
3458 &stage_prog_data->param[uniform * 4];
3459
3460 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3461
3462 assert(uniform < uniform_array_size);
3463 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3464 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3465 = values[j];
3466 }
3467 }
3468
3469 /* Set up the annotation tracking for new generated instructions. */
3470 base_ir = inst->ir;
3471 current_annotation = inst->annotation;
3472
3473 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3474
3475 emit_pull_constant_load(block, inst, temp, inst->src[i],
3476 pull_constant_loc[uniform]);
3477
3478 inst->src[i].file = temp.file;
3479 inst->src[i].reg = temp.reg;
3480 inst->src[i].reg_offset = temp.reg_offset;
3481 inst->src[i].reladdr = NULL;
3482 }
3483 }
3484
3485 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3486 * no need to track them as larger-than-vec4 objects. This will be
3487 * relied on in cutting out unused uniform vectors from push
3488 * constants.
3489 */
3490 split_uniform_registers();
3491 }
3492
3493 void
3494 vec4_visitor::resolve_ud_negate(src_reg *reg)
3495 {
3496 if (reg->type != BRW_REGISTER_TYPE_UD ||
3497 !reg->negate)
3498 return;
3499
3500 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3501 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3502 *reg = temp;
3503 }
3504
3505 vec4_visitor::vec4_visitor(struct brw_context *brw,
3506 struct brw_vec4_compile *c,
3507 struct gl_program *prog,
3508 const struct brw_vec4_prog_key *key,
3509 struct brw_vec4_prog_data *prog_data,
3510 struct gl_shader_program *shader_prog,
3511 gl_shader_stage stage,
3512 void *mem_ctx,
3513 bool debug_flag,
3514 bool no_spills,
3515 shader_time_shader_type st_base,
3516 shader_time_shader_type st_written,
3517 shader_time_shader_type st_reset)
3518 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3519 c(c),
3520 key(key),
3521 prog_data(prog_data),
3522 sanity_param_count(0),
3523 fail_msg(NULL),
3524 first_non_payload_grf(0),
3525 need_all_constants_in_pull_buffer(false),
3526 debug_flag(debug_flag),
3527 no_spills(no_spills),
3528 st_base(st_base),
3529 st_written(st_written),
3530 st_reset(st_reset)
3531 {
3532 this->mem_ctx = mem_ctx;
3533 this->failed = false;
3534
3535 this->base_ir = NULL;
3536 this->current_annotation = NULL;
3537 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3538
3539 this->variable_ht = hash_table_ctor(0,
3540 hash_table_pointer_hash,
3541 hash_table_pointer_compare);
3542
3543 this->virtual_grf_start = NULL;
3544 this->virtual_grf_end = NULL;
3545 this->virtual_grf_sizes = NULL;
3546 this->virtual_grf_count = 0;
3547 this->virtual_grf_reg_map = NULL;
3548 this->virtual_grf_reg_count = 0;
3549 this->virtual_grf_array_size = 0;
3550 this->live_intervals_valid = false;
3551
3552 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3553
3554 this->uniforms = 0;
3555
3556 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3557 * at least one. See setup_uniforms() in brw_vec4.cpp.
3558 */
3559 this->uniform_array_size = 1;
3560 if (prog_data) {
3561 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3562 }
3563
3564 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3565 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3566 }
3567
3568 vec4_visitor::~vec4_visitor()
3569 {
3570 hash_table_dtor(this->variable_ht);
3571 }
3572
3573
3574 void
3575 vec4_visitor::fail(const char *format, ...)
3576 {
3577 va_list va;
3578 char *msg;
3579
3580 if (failed)
3581 return;
3582
3583 failed = true;
3584
3585 va_start(va, format);
3586 msg = ralloc_vasprintf(mem_ctx, format, va);
3587 va_end(va);
3588 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3589
3590 this->fail_msg = msg;
3591
3592 if (debug_flag) {
3593 fprintf(stderr, "%s", msg);
3594 }
3595 }
3596
3597 } /* namespace brw */