i965/vec4: Combine all the math emitters.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 extern "C" {
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(vec4_visitor *v,
34 enum opcode opcode, const dst_reg &dst,
35 const src_reg &src0, const src_reg &src1,
36 const src_reg &src2)
37 {
38 this->opcode = opcode;
39 this->dst = dst;
40 this->src[0] = src0;
41 this->src[1] = src1;
42 this->src[2] = src2;
43 this->saturate = false;
44 this->force_writemask_all = false;
45 this->no_dd_clear = false;
46 this->no_dd_check = false;
47 this->writes_accumulator = false;
48 this->conditional_mod = BRW_CONDITIONAL_NONE;
49 this->texture_offset = 0;
50 this->target = 0;
51 this->shadow_compare = false;
52 this->ir = v->base_ir;
53 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
54 this->header_present = false;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = v->current_annotation;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 this->instructions.push_tail(inst);
65
66 return inst;
67 }
68
69 vec4_instruction *
70 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
71 vec4_instruction *new_inst)
72 {
73 new_inst->ir = inst->ir;
74 new_inst->annotation = inst->annotation;
75
76 inst->insert_before(block, new_inst);
77
78 return inst;
79 }
80
81 vec4_instruction *
82 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
83 const src_reg &src1, const src_reg &src2)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
86 src0, src1, src2));
87 }
88
89
90 vec4_instruction *
91 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
92 const src_reg &src1)
93 {
94 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
95 }
96
97 vec4_instruction *
98 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
99 {
100 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
101 }
102
103 vec4_instruction *
104 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
105 {
106 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
107 }
108
109 vec4_instruction *
110 vec4_visitor::emit(enum opcode opcode)
111 {
112 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
113 }
114
115 #define ALU1(op) \
116 vec4_instruction * \
117 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
118 { \
119 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
120 src0); \
121 }
122
123 #define ALU2(op) \
124 vec4_instruction * \
125 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
126 const src_reg &src1) \
127 { \
128 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
129 src0, src1); \
130 }
131
132 #define ALU2_ACC(op) \
133 vec4_instruction * \
134 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
135 const src_reg &src1) \
136 { \
137 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
138 BRW_OPCODE_##op, dst, src0, src1); \
139 inst->writes_accumulator = true; \
140 return inst; \
141 }
142
143 #define ALU3(op) \
144 vec4_instruction * \
145 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
146 const src_reg &src1, const src_reg &src2) \
147 { \
148 assert(brw->gen >= 6); \
149 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
150 src0, src1, src2); \
151 }
152
153 ALU1(NOT)
154 ALU1(MOV)
155 ALU1(FRC)
156 ALU1(RNDD)
157 ALU1(RNDE)
158 ALU1(RNDZ)
159 ALU1(F32TO16)
160 ALU1(F16TO32)
161 ALU2(ADD)
162 ALU2(MUL)
163 ALU2_ACC(MACH)
164 ALU2(AND)
165 ALU2(OR)
166 ALU2(XOR)
167 ALU2(DP3)
168 ALU2(DP4)
169 ALU2(DPH)
170 ALU2(SHL)
171 ALU2(SHR)
172 ALU2(ASR)
173 ALU3(LRP)
174 ALU1(BFREV)
175 ALU3(BFE)
176 ALU2(BFI1)
177 ALU3(BFI2)
178 ALU1(FBH)
179 ALU1(FBL)
180 ALU1(CBIT)
181 ALU3(MAD)
182 ALU2_ACC(ADDC)
183 ALU2_ACC(SUBB)
184 ALU2(MAC)
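
/* These ALU helpers only construct a vec4_instruction; callers hand the
 * result to emit(), as seen throughout this file:
 *
 *    emit(ADD(result_dst, op[0], op[1]));
 *    emit(MOV(dst, src_reg(math->dst)));
 */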
185
186 /** Gen4 predicated IF. */
187 vec4_instruction *
188 vec4_visitor::IF(enum brw_predicate predicate)
189 {
190 vec4_instruction *inst;
191
192 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
193 inst->predicate = predicate;
194
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 vec4_instruction *
200 vec4_visitor::IF(src_reg src0, src_reg src1,
201 enum brw_conditional_mod condition)
202 {
203 assert(brw->gen == 6);
204
205 vec4_instruction *inst;
206
207 resolve_ud_negate(&src0);
208 resolve_ud_negate(&src1);
209
210 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
211 src0, src1);
212 inst->conditional_mod = condition;
213
214 return inst;
215 }
216
217 /**
218 * CMP: Sets the low bit of the destination channels with the result
219 * of the comparison, while the upper bits are undefined, and updates
220 * the flag register with the packed 16 bits of the result.
221 */
222 vec4_instruction *
223 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
224 enum brw_conditional_mod condition)
225 {
226 vec4_instruction *inst;
227
228 /* original gen4 does type conversion to the destination type
229     * before comparison, producing garbage results for floating
230 * point comparisons.
231 */
232 if (brw->gen == 4) {
233 dst.type = src0.type;
234 if (dst.file == HW_REG)
235 dst.fixed_hw_reg.type = dst.type;
236 }
237
238 resolve_ud_negate(&src0);
239 resolve_ud_negate(&src1);
240
241 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
242 inst->conditional_mod = condition;
243
244 return inst;
245 }
246
247 vec4_instruction *
248 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
249 {
250 vec4_instruction *inst;
251
252 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
253 dst, index);
254 inst->base_mrf = 14;
255 inst->mlen = 2;
256
257 return inst;
258 }
259
260 vec4_instruction *
261 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
262 const src_reg &index)
263 {
264 vec4_instruction *inst;
265
266 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
267 dst, src, index);
268 inst->base_mrf = 13;
269 inst->mlen = 3;
270
271 return inst;
272 }
273
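/**
 * Emit the dot product matching the operand width: an elements value of
 * 2, 3 or 4 selects DP2, DP3 or DP4 respectively.
 */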
274 void
275 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
276 {
277 static enum opcode dot_opcodes[] = {
278 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
279 };
280
281 emit(dot_opcodes[elements - 2], dst, src0, src1);
282 }
283
284 src_reg
285 vec4_visitor::fix_3src_operand(src_reg src)
286 {
287 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
288 * able to use vertical stride of zero to replicate the vec4 uniform, like
289 *
290 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
291 *
292 * But you can't, since vertical stride is always four in three-source
293 * instructions. Instead, insert a MOV instruction to do the replication so
294 * that the three-source instruction can consume it.
295 */
296
297 /* The MOV is only needed if the source is a uniform or immediate. */
298 if (src.file != UNIFORM && src.file != IMM)
299 return src;
300
301 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
302 return src;
303
304 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
305 expanded.type = src.type;
306 emit(MOV(expanded, src));
307 return src_reg(expanded);
308 }
309
310 src_reg
311 vec4_visitor::fix_math_operand(src_reg src)
312 {
313 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
314 return src;
315
316 /* The gen6 math instruction ignores the source modifiers --
317 * swizzle, abs, negate, and at least some parts of the register
318 * region description.
319 *
320 * Rather than trying to enumerate all these cases, *always* expand the
321 * operand to a temp GRF for gen6.
322 *
323 * For gen7, keep the operand as-is, except if immediate, which gen7 still
324 * can't use.
325 */
326
327 if (brw->gen == 7 && src.file != IMM)
328 return src;
329
330 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
331 expanded.type = src.type;
332 emit(MOV(expanded, src));
333 return src_reg(expanded);
334 }
335
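/**
 * Common entry point for the extended-math opcodes (RCP, RSQ, SQRT, EXP2,
 * LOG2, SIN, COS, POW, INT_QUOTIENT, INT_REMAINDER).  src1 is presumably
 * defaulted to an empty src_reg in brw_vec4.h, so unary callers pass only
 * three arguments, e.g.:
 *
 *    emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
 *    emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
 *
 * Operands are legalized with fix_math_operand().  On gen6, where MATH must
 * be align1, a writemasked destination is handled by computing into a
 * temporary and MOVing the result; pre-gen6 message-style math gets its
 * base_mrf/mlen set up here.
 */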
336 void
337 vec4_visitor::emit_math(enum opcode opcode,
338 const dst_reg &dst,
339 const src_reg &src0, const src_reg &src1)
340 {
341 vec4_instruction *math =
342 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
343
344 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
345 /* MATH on Gen6 must be align1, so we can't do writemasks. */
346 math->dst = dst_reg(this, glsl_type::vec4_type);
347 math->dst.type = dst.type;
348 emit(MOV(dst, src_reg(math->dst)));
349 } else if (brw->gen < 6) {
350 math->base_mrf = 1;
351 math->mlen = src1.file == BAD_FILE ? 1 : 2;
352 }
353 }
354
355 void
356 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
357 {
358 if (brw->gen < 7) {
359 unreachable("ir_unop_pack_half_2x16 should be lowered");
360 }
361
362 assert(dst.type == BRW_REGISTER_TYPE_UD);
363 assert(src0.type == BRW_REGISTER_TYPE_F);
364
365 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
366 *
367 * Because this instruction does not have a 16-bit floating-point type,
368 * the destination data type must be Word (W).
369 *
370 * The destination must be DWord-aligned and specify a horizontal stride
371 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
372 * each destination channel and the upper word is not modified.
373 *
374 * The above restriction implies that the f32to16 instruction must use
375 * align1 mode, because only in align1 mode is it possible to specify
376 * horizontal stride. We choose here to defy the hardware docs and emit
377 * align16 instructions.
378 *
379 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
380 * instructions. I was partially successful in that the code passed all
381 * tests. However, the code was dubiously correct and fragile, and the
382 * tests were not harsh enough to probe that frailty. Not trusting the
383 * code, I chose instead to remain in align16 mode in defiance of the hw
384 * docs).
385 *
386 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
387 * simulator, emitting a f32to16 in align16 mode with UD as destination
388 * data type is safe. The behavior differs from that specified in the PRM
389 * in that the upper word of each destination channel is cleared to 0.
390 */
391
392 dst_reg tmp_dst(this, glsl_type::uvec2_type);
393 src_reg tmp_src(tmp_dst);
394
395 #if 0
396 /* Verify the undocumented behavior on which the following instructions
397 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
398 * then the result of the bit-or instruction below will be incorrect.
399 *
400 * You should inspect the disasm output in order to verify that the MOV is
401 * not optimized away.
402 */
403 emit(MOV(tmp_dst, src_reg(0x12345678u)));
404 #endif
405
406 /* Give tmp the form below, where "." means untouched.
407 *
408 * w z y x w z y x
409 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
410 *
411 * That the upper word of each write-channel be 0 is required for the
412 * following bit-shift and bit-or instructions to work. Note that this
413 * relies on the undocumented hardware behavior mentioned above.
414 */
415 tmp_dst.writemask = WRITEMASK_XY;
416 emit(F32TO16(tmp_dst, src0));
417
418 /* Give the write-channels of dst the form:
419 * 0xhhhh0000
420 */
421 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
422 emit(SHL(dst, tmp_src, src_reg(16u)));
423
424 /* Finally, give the write-channels of dst the form of packHalf2x16's
425 * output:
426 * 0xhhhhllll
427 */
428 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
429 emit(OR(dst, src_reg(dst), tmp_src));
430 }
431
432 void
433 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
434 {
435 if (brw->gen < 7) {
436 unreachable("ir_unop_unpack_half_2x16 should be lowered");
437 }
438
439 assert(dst.type == BRW_REGISTER_TYPE_F);
440 assert(src0.type == BRW_REGISTER_TYPE_UD);
441
442 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
443 *
444 * Because this instruction does not have a 16-bit floating-point type,
445 * the source data type must be Word (W). The destination type must be
446 * F (Float).
447 *
448 * To use W as the source data type, we must adjust horizontal strides,
449 * which is only possible in align1 mode. All my [chadv] attempts at
450 * emitting align1 instructions for unpackHalf2x16 failed to pass the
451 * Piglit tests, so I gave up.
452 *
453 * I've verified that, on gen7 hardware and the simulator, it is safe to
454 * emit f16to32 in align16 mode with UD as source data type.
455 */
456
457 dst_reg tmp_dst(this, glsl_type::uvec2_type);
458 src_reg tmp_src(tmp_dst);
459
460 tmp_dst.writemask = WRITEMASK_X;
461 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
462
463 tmp_dst.writemask = WRITEMASK_Y;
464 emit(SHR(tmp_dst, src0, src_reg(16u)));
465
466 dst.writemask = WRITEMASK_XY;
467 emit(F16TO32(dst, tmp_src));
468 }
469
470 void
471 vec4_visitor::visit_instructions(const exec_list *list)
472 {
473 foreach_in_list(ir_instruction, ir, list) {
474 base_ir = ir;
475 ir->accept(this);
476 }
477 }
478
479
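/**
 * Measure a type's storage in units of vec4 registers.  Derived from the
 * cases below: any scalar or vector takes a full vec4 slot, a matrix takes
 * one slot per column, arrays and structs sum their elements/members, and
 * samplers and atomic counters take no space at all (e.g. float -> 1,
 * mat4 -> 4, float[8] -> 8).
 */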
480 static int
481 type_size(const struct glsl_type *type)
482 {
483 unsigned int i;
484 int size;
485
486 switch (type->base_type) {
487 case GLSL_TYPE_UINT:
488 case GLSL_TYPE_INT:
489 case GLSL_TYPE_FLOAT:
490 case GLSL_TYPE_BOOL:
491 if (type->is_matrix()) {
492 return type->matrix_columns;
493 } else {
494 /* Regardless of size of vector, it gets a vec4. This is bad
495 * packing for things like floats, but otherwise arrays become a
496 * mess. Hopefully a later pass over the code can pack scalars
497 * down if appropriate.
498 */
499 return 1;
500 }
501 case GLSL_TYPE_ARRAY:
502 assert(type->length > 0);
503 return type_size(type->fields.array) * type->length;
504 case GLSL_TYPE_STRUCT:
505 size = 0;
506 for (i = 0; i < type->length; i++) {
507 size += type_size(type->fields.structure[i].type);
508 }
509 return size;
510 case GLSL_TYPE_SAMPLER:
511 /* Samplers take up no register space, since they're baked in at
512 * link time.
513 */
514 return 0;
515 case GLSL_TYPE_ATOMIC_UINT:
516 return 0;
517 case GLSL_TYPE_IMAGE:
518 case GLSL_TYPE_VOID:
519 case GLSL_TYPE_ERROR:
520 case GLSL_TYPE_INTERFACE:
521 unreachable("not reached");
522 }
523
524 return 0;
525 }
526
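/**
 * Reserve a new virtual GRF of `size' vec4 registers.  The bookkeeping
 * arrays grow by doubling (starting at 16 entries); virtual_grf_reg_map
 * records each virtual GRF's offset into the flat register space and
 * virtual_grf_sizes its length.  Returns the new virtual GRF index.
 */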
527 int
528 vec4_visitor::virtual_grf_alloc(int size)
529 {
530 if (virtual_grf_array_size <= virtual_grf_count) {
531 if (virtual_grf_array_size == 0)
532 virtual_grf_array_size = 16;
533 else
534 virtual_grf_array_size *= 2;
535 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
536 virtual_grf_array_size);
537 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
538 virtual_grf_array_size);
539 }
540 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
541 virtual_grf_reg_count += size;
542 virtual_grf_sizes[virtual_grf_count] = size;
543 return virtual_grf_count++;
544 }
545
546 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
547 {
548 init();
549
550 this->file = GRF;
551 this->reg = v->virtual_grf_alloc(type_size(type));
552
553 if (type->is_array() || type->is_record()) {
554 this->swizzle = BRW_SWIZZLE_NOOP;
555 } else {
556 this->swizzle = swizzle_for_size(type->vector_elements);
557 }
558
559 this->type = brw_type_for_base_type(type);
560 }
561
562 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
563 {
564 assert(size > 0);
565
566 init();
567
568 this->file = GRF;
569 this->reg = v->virtual_grf_alloc(type_size(type) * size);
570
571 this->swizzle = BRW_SWIZZLE_NOOP;
572
573 this->type = brw_type_for_base_type(type);
574 }
575
576 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
577 {
578 init();
579
580 this->file = GRF;
581 this->reg = v->virtual_grf_alloc(type_size(type));
582
583 if (type->is_array() || type->is_record()) {
584 this->writemask = WRITEMASK_XYZW;
585 } else {
586 this->writemask = (1 << type->vector_elements) - 1;
587 }
588
589 this->type = brw_type_for_base_type(type);
590 }
591
592 /* Our support for uniforms is piggy-backed on the struct
593 * gl_fragment_program, because that's where the values actually
594 * get stored, rather than in some global gl_shader_program uniform
595 * store.
596 */
597 void
598 vec4_visitor::setup_uniform_values(ir_variable *ir)
599 {
600 int namelen = strlen(ir->name);
601
602 /* The data for our (non-builtin) uniforms is stored in a series of
603 * gl_uniform_driver_storage structs for each subcomponent that
604 * glGetUniformLocation() could name. We know it's been set up in the same
605 * order we'd walk the type, so walk the list of storage and find anything
606 * with our name, or the prefix of a component that starts with our name.
607 */
608 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
609 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
610
611 if (strncmp(ir->name, storage->name, namelen) != 0 ||
612 (storage->name[namelen] != 0 &&
613 storage->name[namelen] != '.' &&
614 storage->name[namelen] != '[')) {
615 continue;
616 }
617
618 gl_constant_value *components = storage->storage;
619 unsigned vector_count = (MAX2(storage->array_elements, 1) *
620 storage->type->matrix_columns);
621
622 for (unsigned s = 0; s < vector_count; s++) {
623 assert(uniforms < uniform_array_size);
624 uniform_vector_size[uniforms] = storage->type->vector_elements;
625
626 int i;
627 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
628 stage_prog_data->param[uniforms * 4 + i] = components;
629 components++;
630 }
631 for (; i < 4; i++) {
632 static gl_constant_value zero = { 0.0 };
633 stage_prog_data->param[uniforms * 4 + i] = &zero;
634 }
635
636 uniforms++;
637 }
638 }
639 }
640
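/**
 * Upload the user clip plane equations from the GL context as vec4
 * uniforms; userplane[i] records which uniform slot each plane occupies so
 * later code can reference it.
 */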
641 void
642 vec4_visitor::setup_uniform_clipplane_values()
643 {
644 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
645
646 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
647 assert(this->uniforms < uniform_array_size);
648 this->uniform_vector_size[this->uniforms] = 4;
649 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
650 this->userplane[i].type = BRW_REGISTER_TYPE_F;
651 for (int j = 0; j < 4; ++j) {
652 stage_prog_data->param[this->uniforms * 4 + j] =
653 (gl_constant_value *) &clip_planes[i][j];
654 }
655 ++this->uniforms;
656 }
657 }
658
659 /* Our support for builtin uniforms is even scarier than non-builtin.
660 * It sits on top of the PROG_STATE_VAR parameters that are
661 * automatically updated from GL context state.
662 */
663 void
664 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
665 {
666 const ir_state_slot *const slots = ir->get_state_slots();
667 assert(slots != NULL);
668
669 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
670 /* This state reference has already been setup by ir_to_mesa,
671 * but we'll get the same index back here. We can reference
672 * ParameterValues directly, since unlike brw_fs.cpp, we never
673 * add new state references during compile.
674 */
675 int index = _mesa_add_state_reference(this->prog->Parameters,
676 (gl_state_index *)slots[i].tokens);
677 gl_constant_value *values =
678 &this->prog->Parameters->ParameterValues[index][0];
679
680 assert(this->uniforms < uniform_array_size);
681 this->uniform_vector_size[this->uniforms] = 0;
682 /* Add each of the unique swizzled channels of the element.
683 * This will end up matching the size of the glsl_type of this field.
684 */
685 int last_swiz = -1;
686 for (unsigned int j = 0; j < 4; j++) {
687 int swiz = GET_SWZ(slots[i].swizzle, j);
688 last_swiz = swiz;
689
690 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
691 assert(this->uniforms < uniform_array_size);
692 if (swiz <= last_swiz)
693 this->uniform_vector_size[this->uniforms]++;
694 }
695 this->uniforms++;
696 }
697 }
698
699 dst_reg *
700 vec4_visitor::variable_storage(ir_variable *var)
701 {
702 return (dst_reg *)hash_table_find(this->variable_ht, var);
703 }
704
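/**
 * Emit whatever sets the flag register for a boolean rvalue used as a
 * condition.  Recognized logic and comparison expressions are folded
 * straight into a conditional mod; anything else is evaluated and then
 * AND'd with 1 (gen6+) or MOV'd (gen4/5) just to set the flag.  For the
 * vector comparisons, *predicate is set to ALL4H/ANY4H so the caller can
 * predicate on all or any channels matching.
 */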
705 void
706 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
707 enum brw_predicate *predicate)
708 {
709 ir_expression *expr = ir->as_expression();
710
711 *predicate = BRW_PREDICATE_NORMAL;
712
713 if (expr && expr->operation != ir_binop_ubo_load) {
714 src_reg op[3];
715 vec4_instruction *inst;
716
717 assert(expr->get_num_operands() <= 3);
718 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
719 expr->operands[i]->accept(this);
720 op[i] = this->result;
721
722 resolve_ud_negate(&op[i]);
723 }
724
725 switch (expr->operation) {
726 case ir_unop_logic_not:
727 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
728 inst->conditional_mod = BRW_CONDITIONAL_Z;
729 break;
730
731 case ir_binop_logic_xor:
732 inst = emit(XOR(dst_null_d(), op[0], op[1]));
733 inst->conditional_mod = BRW_CONDITIONAL_NZ;
734 break;
735
736 case ir_binop_logic_or:
737 inst = emit(OR(dst_null_d(), op[0], op[1]));
738 inst->conditional_mod = BRW_CONDITIONAL_NZ;
739 break;
740
741 case ir_binop_logic_and:
742 inst = emit(AND(dst_null_d(), op[0], op[1]));
743 inst->conditional_mod = BRW_CONDITIONAL_NZ;
744 break;
745
746 case ir_unop_f2b:
747 if (brw->gen >= 6) {
748 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
749 } else {
750 inst = emit(MOV(dst_null_f(), op[0]));
751 inst->conditional_mod = BRW_CONDITIONAL_NZ;
752 }
753 break;
754
755 case ir_unop_i2b:
756 if (brw->gen >= 6) {
757 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
758 } else {
759 inst = emit(MOV(dst_null_d(), op[0]));
760 inst->conditional_mod = BRW_CONDITIONAL_NZ;
761 }
762 break;
763
764 case ir_binop_all_equal:
765 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
766 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
767 break;
768
769 case ir_binop_any_nequal:
770 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
771 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
772 break;
773
774 case ir_unop_any:
775 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
776 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
777 break;
778
779 case ir_binop_greater:
780 case ir_binop_gequal:
781 case ir_binop_less:
782 case ir_binop_lequal:
783 case ir_binop_equal:
784 case ir_binop_nequal:
785 emit(CMP(dst_null_d(), op[0], op[1],
786 brw_conditional_for_comparison(expr->operation)));
787 break;
788
789 case ir_triop_csel: {
790 /* Expand the boolean condition into the flag register. */
791 inst = emit(MOV(dst_null_d(), op[0]));
792 inst->conditional_mod = BRW_CONDITIONAL_NZ;
793
794 /* Select which boolean to return. */
795 dst_reg temp(this, expr->operands[1]->type);
796 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
797 inst->predicate = BRW_PREDICATE_NORMAL;
798
799 /* Expand the result to a condition code. */
800 inst = emit(MOV(dst_null_d(), src_reg(temp)));
801 inst->conditional_mod = BRW_CONDITIONAL_NZ;
802 break;
803 }
804
805 default:
806 unreachable("not reached");
807 }
808 return;
809 }
810
811 ir->accept(this);
812
813 resolve_ud_negate(&this->result);
814
815 if (brw->gen >= 6) {
816 vec4_instruction *inst = emit(AND(dst_null_d(),
817 this->result, src_reg(1)));
818 inst->conditional_mod = BRW_CONDITIONAL_NZ;
819 } else {
820 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
821 inst->conditional_mod = BRW_CONDITIONAL_NZ;
822 }
823 }
824
825 /**
826 * Emit a gen6 IF statement with the comparison folded into the IF
827 * instruction.
828 */
829 void
830 vec4_visitor::emit_if_gen6(ir_if *ir)
831 {
832 ir_expression *expr = ir->condition->as_expression();
833
834 if (expr && expr->operation != ir_binop_ubo_load) {
835 src_reg op[3];
836 dst_reg temp;
837
838 assert(expr->get_num_operands() <= 3);
839 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
840 expr->operands[i]->accept(this);
841 op[i] = this->result;
842 }
843
844 switch (expr->operation) {
845 case ir_unop_logic_not:
846 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
847 return;
848
849 case ir_binop_logic_xor:
850 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
851 return;
852
853 case ir_binop_logic_or:
854 temp = dst_reg(this, glsl_type::bool_type);
855 emit(OR(temp, op[0], op[1]));
856 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
857 return;
858
859 case ir_binop_logic_and:
860 temp = dst_reg(this, glsl_type::bool_type);
861 emit(AND(temp, op[0], op[1]));
862 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
863 return;
864
865 case ir_unop_f2b:
866 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
867 return;
868
869 case ir_unop_i2b:
870 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
871 return;
872
873 case ir_binop_greater:
874 case ir_binop_gequal:
875 case ir_binop_less:
876 case ir_binop_lequal:
877 case ir_binop_equal:
878 case ir_binop_nequal:
879 emit(IF(op[0], op[1],
880 brw_conditional_for_comparison(expr->operation)));
881 return;
882
883 case ir_binop_all_equal:
884 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
885 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
886 return;
887
888 case ir_binop_any_nequal:
889 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
890 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
891 return;
892
893 case ir_unop_any:
894 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
895 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
896 return;
897
898 case ir_triop_csel: {
899 /* Expand the boolean condition into the flag register. */
900 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
901 inst->conditional_mod = BRW_CONDITIONAL_NZ;
902
903 /* Select which boolean to return. */
904 dst_reg temp(this, expr->operands[1]->type);
905 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
906 inst->predicate = BRW_PREDICATE_NORMAL;
907
908 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
909 return;
910 }
911
912 default:
913 unreachable("not reached");
914 }
915 return;
916 }
917
918 ir->condition->accept(this);
919
920 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
921 }
922
923 void
924 vec4_visitor::visit(ir_variable *ir)
925 {
926 dst_reg *reg = NULL;
927
928 if (variable_storage(ir))
929 return;
930
931 switch (ir->data.mode) {
932 case ir_var_shader_in:
933 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
934 break;
935
936 case ir_var_shader_out:
937 reg = new(mem_ctx) dst_reg(this, ir->type);
938
939 for (int i = 0; i < type_size(ir->type); i++) {
940 output_reg[ir->data.location + i] = *reg;
941 output_reg[ir->data.location + i].reg_offset = i;
942 output_reg[ir->data.location + i].type =
943 brw_type_for_base_type(ir->type->get_scalar_type());
944 output_reg_annotation[ir->data.location + i] = ir->name;
945 }
946 break;
947
948 case ir_var_auto:
949 case ir_var_temporary:
950 reg = new(mem_ctx) dst_reg(this, ir->type);
951 break;
952
953 case ir_var_uniform:
954 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
955
956 /* Thanks to the lower_ubo_reference pass, we will see only
957 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
958 * variables, so no need for them to be in variable_ht.
959 *
960 * Some uniforms, such as samplers and atomic counters, have no actual
961 * storage, so we should ignore them.
962 */
963 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
964 return;
965
966 /* Track how big the whole uniform variable is, in case we need to put a
967 * copy of its data into pull constants for array access.
968 */
969 assert(this->uniforms < uniform_array_size);
970 this->uniform_size[this->uniforms] = type_size(ir->type);
971
972 if (!strncmp(ir->name, "gl_", 3)) {
973 setup_builtin_uniform_values(ir);
974 } else {
975 setup_uniform_values(ir);
976 }
977 break;
978
979 case ir_var_system_value:
980 reg = make_reg_for_system_value(ir);
981 break;
982
983 default:
984 unreachable("not reached");
985 }
986
987 reg->type = brw_type_for_base_type(ir->type);
988 hash_table_insert(this->variable_ht, reg, ir);
989 }
990
991 void
992 vec4_visitor::visit(ir_loop *ir)
993 {
994 /* We don't want debugging output to print the whole body of the
995 * loop as the annotation.
996 */
997 this->base_ir = NULL;
998
999 emit(BRW_OPCODE_DO);
1000
1001 visit_instructions(&ir->body_instructions);
1002
1003 emit(BRW_OPCODE_WHILE);
1004 }
1005
1006 void
1007 vec4_visitor::visit(ir_loop_jump *ir)
1008 {
1009 switch (ir->mode) {
1010 case ir_loop_jump::jump_break:
1011 emit(BRW_OPCODE_BREAK);
1012 break;
1013 case ir_loop_jump::jump_continue:
1014 emit(BRW_OPCODE_CONTINUE);
1015 break;
1016 }
1017 }
1018
1019
1020 void
1021 vec4_visitor::visit(ir_function_signature *)
1022 {
1023 unreachable("not reached");
1024 }
1025
1026 void
1027 vec4_visitor::visit(ir_function *ir)
1028 {
1029 /* Ignore function bodies other than main() -- we shouldn't see calls to
1030 * them since they should all be inlined.
1031 */
1032 if (strcmp(ir->name, "main") == 0) {
1033 const ir_function_signature *sig;
1034 exec_list empty;
1035
1036 sig = ir->matching_signature(NULL, &empty, false);
1037
1038 assert(sig);
1039
1040 visit_instructions(&sig->body);
1041 }
1042 }
1043
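/**
 * Try to fold an add that has a multiply as one operand into a single MAD.
 * The MAD instruction takes the addend in src0 (see the ir_triop_fma case
 * below), so the non-multiply operand goes first.  Returns false when the
 * pattern does not match or MAD is unusable (pre-gen6, or non-float data).
 */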
1044 bool
1045 vec4_visitor::try_emit_mad(ir_expression *ir)
1046 {
1047 /* 3-src instructions were introduced in gen6. */
1048 if (brw->gen < 6)
1049 return false;
1050
1051 /* MAD can only handle floating-point data. */
1052 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1053 return false;
1054
1055 ir_rvalue *nonmul = ir->operands[1];
1056 ir_expression *mul = ir->operands[0]->as_expression();
1057
1058 if (!mul || mul->operation != ir_binop_mul) {
1059 nonmul = ir->operands[0];
1060 mul = ir->operands[1]->as_expression();
1061
1062 if (!mul || mul->operation != ir_binop_mul)
1063 return false;
1064 }
1065
1066 nonmul->accept(this);
1067 src_reg src0 = fix_3src_operand(this->result);
1068
1069 mul->operands[0]->accept(this);
1070 src_reg src1 = fix_3src_operand(this->result);
1071
1072 mul->operands[1]->accept(this);
1073 src_reg src2 = fix_3src_operand(this->result);
1074
1075 this->result = src_reg(this, ir->type);
1076 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1077
1078 return true;
1079 }
1080
1081 bool
1082 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1083 {
1084 /* This optimization relies on CMP setting the destination to 0 when
1085 * false. Early hardware only sets the least significant bit, and
1086 * leaves the other bits undefined. So we can't use it.
1087 */
1088 if (brw->gen < 6)
1089 return false;
1090
1091 ir_expression *const cmp = ir->operands[0]->as_expression();
1092
1093 if (cmp == NULL)
1094 return false;
1095
1096 switch (cmp->operation) {
1097 case ir_binop_less:
1098 case ir_binop_greater:
1099 case ir_binop_lequal:
1100 case ir_binop_gequal:
1101 case ir_binop_equal:
1102 case ir_binop_nequal:
1103 break;
1104
1105 default:
1106 return false;
1107 }
1108
1109 cmp->operands[0]->accept(this);
1110 const src_reg cmp_src0 = this->result;
1111
1112 cmp->operands[1]->accept(this);
1113 const src_reg cmp_src1 = this->result;
1114
1115 this->result = src_reg(this, ir->type);
1116
1117 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1118 brw_conditional_for_comparison(cmp->operation)));
1119
1120 /* If the comparison is false, this->result will just happen to be zero.
1121 */
1122 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1123 this->result, src_reg(1.0f));
1124 inst->predicate = BRW_PREDICATE_NORMAL;
1125 inst->predicate_inverse = true;
1126
1127 return true;
1128 }
1129
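/**
 * Emit a min or max as a SEL.  On gen6+ SEL takes the conditional mod
 * directly (BRW_CONDITIONAL_L for min, _G for max); older hardware needs
 * an explicit CMP to set the flag, followed by a predicated SEL.
 */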
1130 void
1131 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1132 src_reg src0, src_reg src1)
1133 {
1134 vec4_instruction *inst;
1135
1136 if (brw->gen >= 6) {
1137 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1138 inst->conditional_mod = conditionalmod;
1139 } else {
1140 emit(CMP(dst, src0, src1, conditionalmod));
1141
1142 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1143 inst->predicate = BRW_PREDICATE_NORMAL;
1144 }
1145 }
1146
1147 void
1148 vec4_visitor::emit_lrp(const dst_reg &dst,
1149 const src_reg &x, const src_reg &y, const src_reg &a)
1150 {
1151 if (brw->gen >= 6) {
1152 /* Note that the instruction's argument order is reversed from GLSL
1153 * and the IR.
1154 */
1155 emit(LRP(dst,
1156 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1157 } else {
1158 /* Earlier generations don't support three source operations, so we
1159 * need to emit x*(1-a) + y*a.
1160 */
1161 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1162 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1163 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1164 y_times_a.writemask = dst.writemask;
1165 one_minus_a.writemask = dst.writemask;
1166 x_times_one_minus_a.writemask = dst.writemask;
1167
1168 emit(MUL(y_times_a, y, a));
1169 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1170 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1171 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1172 }
1173 }
1174
1175 void
1176 vec4_visitor::visit(ir_expression *ir)
1177 {
1178 unsigned int operand;
1179 src_reg op[Elements(ir->operands)];
1180 vec4_instruction *inst;
1181
1182 if (ir->operation == ir_binop_add) {
1183 if (try_emit_mad(ir))
1184 return;
1185 }
1186
1187 if (ir->operation == ir_unop_b2f) {
1188 if (try_emit_b2f_of_compare(ir))
1189 return;
1190 }
1191
1192 /* Storage for our result. Ideally for an assignment we'd be using
1193 * the actual storage for the result here, instead.
1194 */
1195 dst_reg result_dst(this, ir->type);
1196 src_reg result_src(result_dst);
1197
1198 if (ir->operation == ir_triop_csel) {
1199 ir->operands[1]->accept(this);
1200 op[1] = this->result;
1201 ir->operands[2]->accept(this);
1202 op[2] = this->result;
1203
1204 enum brw_predicate predicate;
1205 emit_bool_to_cond_code(ir->operands[0], &predicate);
1206 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1207 inst->predicate = predicate;
1208 this->result = result_src;
1209 return;
1210 }
1211
1212 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1213 this->result.file = BAD_FILE;
1214 ir->operands[operand]->accept(this);
1215 if (this->result.file == BAD_FILE) {
1216 fprintf(stderr, "Failed to get tree for expression operand:\n");
1217 ir->operands[operand]->fprint(stderr);
1218 exit(1);
1219 }
1220 op[operand] = this->result;
1221
1222 /* Matrix expression operands should have been broken down to vector
1223 * operations already.
1224 */
1225 assert(!ir->operands[operand]->type->is_matrix());
1226 }
1227
1228 /* If nothing special happens, this is the result. */
1229 this->result = result_src;
1230
1231 switch (ir->operation) {
1232 case ir_unop_logic_not:
1233 if (ctx->Const.UniformBooleanTrue != 1) {
1234 emit(NOT(result_dst, op[0]));
1235 } else {
1236 emit(XOR(result_dst, op[0], src_reg(1u)));
1237 }
1238 break;
1239 case ir_unop_neg:
1240 op[0].negate = !op[0].negate;
1241 emit(MOV(result_dst, op[0]));
1242 break;
1243 case ir_unop_abs:
1244 op[0].abs = true;
1245 op[0].negate = false;
1246 emit(MOV(result_dst, op[0]));
1247 break;
1248
1249 case ir_unop_sign:
1250 if (ir->type->is_float()) {
1251 /* AND(val, 0x80000000) gives the sign bit.
1252 *
1253 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1254 * zero.
1255 */
1256 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1257
1258 op[0].type = BRW_REGISTER_TYPE_UD;
1259 result_dst.type = BRW_REGISTER_TYPE_UD;
1260 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1261
1262 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1263 inst->predicate = BRW_PREDICATE_NORMAL;
1264
1265 this->result.type = BRW_REGISTER_TYPE_F;
1266 } else {
1267 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1268 * -> non-negative val generates 0x00000000.
1269 * Predicated OR sets 1 if val is positive.
1270 */
1271 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1272
1273 emit(ASR(result_dst, op[0], src_reg(31)));
1274
1275 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1276 inst->predicate = BRW_PREDICATE_NORMAL;
1277 }
1278 break;
1279
1280 case ir_unop_rcp:
1281 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1282 break;
1283
1284 case ir_unop_exp2:
1285 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1286 break;
1287 case ir_unop_log2:
1288 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1289 break;
1290 case ir_unop_exp:
1291 case ir_unop_log:
1292 unreachable("not reached: should be handled by ir_explog_to_explog2");
1293 case ir_unop_sin:
1294 case ir_unop_sin_reduced:
1295 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1296 break;
1297 case ir_unop_cos:
1298 case ir_unop_cos_reduced:
1299 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1300 break;
1301
1302 case ir_unop_dFdx:
1303 case ir_unop_dFdx_coarse:
1304 case ir_unop_dFdx_fine:
1305 case ir_unop_dFdy:
1306 case ir_unop_dFdy_coarse:
1307 case ir_unop_dFdy_fine:
1308 unreachable("derivatives not valid in vertex shader");
1309
1310 case ir_unop_bitfield_reverse:
1311 emit(BFREV(result_dst, op[0]));
1312 break;
1313 case ir_unop_bit_count:
1314 emit(CBIT(result_dst, op[0]));
1315 break;
1316 case ir_unop_find_msb: {
1317 src_reg temp = src_reg(this, glsl_type::uint_type);
1318
1319 inst = emit(FBH(dst_reg(temp), op[0]));
1320 inst->dst.writemask = WRITEMASK_XYZW;
1321
1322 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1323 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1324 * subtract the result from 31 to convert the MSB count into an LSB count.
1325 */
1326
1327 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1328 temp.swizzle = BRW_SWIZZLE_NOOP;
1329 emit(MOV(result_dst, temp));
1330
1331 src_reg src_tmp = src_reg(result_dst);
1332 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1333
1334 src_tmp.negate = true;
1335 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1336 inst->predicate = BRW_PREDICATE_NORMAL;
1337 break;
1338 }
1339 case ir_unop_find_lsb:
1340 emit(FBL(result_dst, op[0]));
1341 break;
1342 case ir_unop_saturate:
1343 inst = emit(MOV(result_dst, op[0]));
1344 inst->saturate = true;
1345 break;
1346
1347 case ir_unop_noise:
1348 unreachable("not reached: should be handled by lower_noise");
1349
1350 case ir_binop_add:
1351 emit(ADD(result_dst, op[0], op[1]));
1352 break;
1353 case ir_binop_sub:
1354 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1355
1356 case ir_binop_mul:
1357 if (brw->gen < 8 && ir->type->is_integer()) {
1358 /* For integer multiplication, the MUL uses the low 16 bits of one of
1359 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1360          * accumulates the contribution of the upper 16 bits of that
1361 * operand. If we can determine that one of the args is in the low
1362 * 16 bits, though, we can just emit a single MUL.
1363 */
1364 if (ir->operands[0]->is_uint16_constant()) {
1365 if (brw->gen < 7)
1366 emit(MUL(result_dst, op[0], op[1]));
1367 else
1368 emit(MUL(result_dst, op[1], op[0]));
1369 } else if (ir->operands[1]->is_uint16_constant()) {
1370 if (brw->gen < 7)
1371 emit(MUL(result_dst, op[1], op[0]));
1372 else
1373 emit(MUL(result_dst, op[0], op[1]));
1374 } else {
1375 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1376
1377 emit(MUL(acc, op[0], op[1]));
1378 emit(MACH(dst_null_d(), op[0], op[1]));
1379 emit(MOV(result_dst, src_reg(acc)));
1380 }
1381 } else {
1382 emit(MUL(result_dst, op[0], op[1]));
1383 }
1384 break;
1385 case ir_binop_imul_high: {
1386 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1387
1388 emit(MUL(acc, op[0], op[1]));
1389 emit(MACH(result_dst, op[0], op[1]));
1390 break;
1391 }
1392 case ir_binop_div:
1393 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1394 assert(ir->type->is_integer());
1395 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1396 break;
1397 case ir_binop_carry: {
1398 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1399
1400 emit(ADDC(dst_null_ud(), op[0], op[1]));
1401 emit(MOV(result_dst, src_reg(acc)));
1402 break;
1403 }
1404 case ir_binop_borrow: {
1405 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1406
1407 emit(SUBB(dst_null_ud(), op[0], op[1]));
1408 emit(MOV(result_dst, src_reg(acc)));
1409 break;
1410 }
1411 case ir_binop_mod:
1412 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1413 assert(ir->type->is_integer());
1414 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1415 break;
1416
1417 case ir_binop_less:
1418 case ir_binop_greater:
1419 case ir_binop_lequal:
1420 case ir_binop_gequal:
1421 case ir_binop_equal:
1422 case ir_binop_nequal: {
1423 emit(CMP(result_dst, op[0], op[1],
1424 brw_conditional_for_comparison(ir->operation)));
1425 if (ctx->Const.UniformBooleanTrue == 1) {
1426 emit(AND(result_dst, result_src, src_reg(1u)));
1427 }
1428 break;
1429 }
1430
1431 case ir_binop_all_equal:
1432 /* "==" operator producing a scalar boolean. */
1433 if (ir->operands[0]->type->is_vector() ||
1434 ir->operands[1]->type->is_vector()) {
1435 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1436 emit(MOV(result_dst, src_reg(0)));
1437 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1438 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1439 } else {
1440 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1441 if (ctx->Const.UniformBooleanTrue == 1) {
1442 emit(AND(result_dst, result_src, src_reg(1u)));
1443 }
1444 }
1445 break;
1446 case ir_binop_any_nequal:
1447 /* "!=" operator producing a scalar boolean. */
1448 if (ir->operands[0]->type->is_vector() ||
1449 ir->operands[1]->type->is_vector()) {
1450 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1451
1452 emit(MOV(result_dst, src_reg(0)));
1453 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1454 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1455 } else {
1456 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1457 if (ctx->Const.UniformBooleanTrue == 1) {
1458 emit(AND(result_dst, result_src, src_reg(1u)));
1459 }
1460 }
1461 break;
1462
1463 case ir_unop_any:
1464 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1465 emit(MOV(result_dst, src_reg(0)));
1466
1467 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1468 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1469 break;
1470
1471 case ir_binop_logic_xor:
1472 emit(XOR(result_dst, op[0], op[1]));
1473 break;
1474
1475 case ir_binop_logic_or:
1476 emit(OR(result_dst, op[0], op[1]));
1477 break;
1478
1479 case ir_binop_logic_and:
1480 emit(AND(result_dst, op[0], op[1]));
1481 break;
1482
1483 case ir_binop_dot:
1484 assert(ir->operands[0]->type->is_vector());
1485 assert(ir->operands[0]->type == ir->operands[1]->type);
1486 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1487 break;
1488
1489 case ir_unop_sqrt:
1490 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1491 break;
1492 case ir_unop_rsq:
1493 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1494 break;
1495
1496 case ir_unop_bitcast_i2f:
1497 case ir_unop_bitcast_u2f:
1498 this->result = op[0];
1499 this->result.type = BRW_REGISTER_TYPE_F;
1500 break;
1501
1502 case ir_unop_bitcast_f2i:
1503 this->result = op[0];
1504 this->result.type = BRW_REGISTER_TYPE_D;
1505 break;
1506
1507 case ir_unop_bitcast_f2u:
1508 this->result = op[0];
1509 this->result.type = BRW_REGISTER_TYPE_UD;
1510 break;
1511
1512 case ir_unop_i2f:
1513 case ir_unop_i2u:
1514 case ir_unop_u2i:
1515 case ir_unop_u2f:
1516 case ir_unop_f2i:
1517 case ir_unop_f2u:
1518 emit(MOV(result_dst, op[0]));
1519 break;
1520 case ir_unop_b2i:
1521 if (ctx->Const.UniformBooleanTrue != 1) {
1522 emit(AND(result_dst, op[0], src_reg(1u)));
1523 } else {
1524 emit(MOV(result_dst, op[0]));
1525 }
1526 break;
1527 case ir_unop_b2f:
1528 if (ctx->Const.UniformBooleanTrue != 1) {
1529 op[0].type = BRW_REGISTER_TYPE_UD;
1530 result_dst.type = BRW_REGISTER_TYPE_UD;
1531 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1532 result_dst.type = BRW_REGISTER_TYPE_F;
1533 } else {
1534 emit(MOV(result_dst, op[0]));
1535 }
1536 break;
1537 case ir_unop_f2b:
1538 case ir_unop_i2b:
1539 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1540 if (ctx->Const.UniformBooleanTrue == 1) {
1541 emit(AND(result_dst, result_src, src_reg(1u)));
1542 }
1543 break;
1544
1545 case ir_unop_trunc:
1546 emit(RNDZ(result_dst, op[0]));
1547 break;
1548 case ir_unop_ceil:
1549 op[0].negate = !op[0].negate;
1550 inst = emit(RNDD(result_dst, op[0]));
1551 this->result.negate = true;
1552 break;
1553 case ir_unop_floor:
1554 inst = emit(RNDD(result_dst, op[0]));
1555 break;
1556 case ir_unop_fract:
1557 inst = emit(FRC(result_dst, op[0]));
1558 break;
1559 case ir_unop_round_even:
1560 emit(RNDE(result_dst, op[0]));
1561 break;
1562
1563 case ir_binop_min:
1564 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1565 break;
1566 case ir_binop_max:
1567 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1568 break;
1569
1570 case ir_binop_pow:
1571 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1572 break;
1573
1574 case ir_unop_bit_not:
1575 inst = emit(NOT(result_dst, op[0]));
1576 break;
1577 case ir_binop_bit_and:
1578 inst = emit(AND(result_dst, op[0], op[1]));
1579 break;
1580 case ir_binop_bit_xor:
1581 inst = emit(XOR(result_dst, op[0], op[1]));
1582 break;
1583 case ir_binop_bit_or:
1584 inst = emit(OR(result_dst, op[0], op[1]));
1585 break;
1586
1587 case ir_binop_lshift:
1588 inst = emit(SHL(result_dst, op[0], op[1]));
1589 break;
1590
1591 case ir_binop_rshift:
1592 if (ir->type->base_type == GLSL_TYPE_INT)
1593 inst = emit(ASR(result_dst, op[0], op[1]));
1594 else
1595 inst = emit(SHR(result_dst, op[0], op[1]));
1596 break;
1597
1598 case ir_binop_bfm:
1599 emit(BFI1(result_dst, op[0], op[1]));
1600 break;
1601
1602 case ir_binop_ubo_load: {
1603 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1604 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1605 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1606 src_reg offset;
1607
1608 /* Now, load the vector from that offset. */
1609 assert(ir->type->is_vector() || ir->type->is_scalar());
1610
1611 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1612 packed_consts.type = result.type;
1613 src_reg surf_index;
1614
1615 if (const_uniform_block) {
1616 /* The block index is a constant, so just emit the binding table entry
1617 * as an immediate.
1618 */
1619 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1620 const_uniform_block->value.u[0]);
1621 } else {
1622 /* The block index is not a constant. Evaluate the index expression
1623 * per-channel and add the base UBO index; the generator will select
1624 * a value from any live channel.
1625 */
1626 surf_index = src_reg(this, glsl_type::uint_type);
1627 emit(ADD(dst_reg(surf_index), op[0],
1628 src_reg(prog_data->base.binding_table.ubo_start)));
1629
1630 /* Assume this may touch any UBO. It would be nice to provide
1631 * a tighter bound, but the array information is already lowered away.
1632 */
1633 brw_mark_surface_used(&prog_data->base,
1634 prog_data->base.binding_table.ubo_start +
1635 shader_prog->NumUniformBlocks - 1);
1636 }
1637
1638 if (const_offset_ir) {
1639 if (brw->gen >= 8) {
1640 /* Store the offset in a GRF so we can send-from-GRF. */
1641 offset = src_reg(this, glsl_type::int_type);
1642 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1643 } else {
1644 /* Immediates are fine on older generations since they'll be moved
1645 * to a (potentially fake) MRF at the generator level.
1646 */
1647 offset = src_reg(const_offset / 16);
1648 }
1649 } else {
1650 offset = src_reg(this, glsl_type::uint_type);
1651 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1652 }
1653
1654 if (brw->gen >= 7) {
1655 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1656 grf_offset.type = offset.type;
1657
1658 emit(MOV(grf_offset, offset));
1659
1660 emit(new(mem_ctx) vec4_instruction(this,
1661 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1662 dst_reg(packed_consts),
1663 surf_index,
1664 src_reg(grf_offset)));
1665 } else {
1666 vec4_instruction *pull =
1667 emit(new(mem_ctx) vec4_instruction(this,
1668 VS_OPCODE_PULL_CONSTANT_LOAD,
1669 dst_reg(packed_consts),
1670 surf_index,
1671 offset));
1672 pull->base_mrf = 14;
1673 pull->mlen = 1;
1674 }
1675
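      /* packed_consts holds a whole 16-byte pull-constant slot.  Narrow the
       * swizzle to the value's component count, then offset every channel by
       * the value's starting component within that slot, so reads that do
       * not begin at a 16-byte boundary pick up the right components.
       */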
1676 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1677 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1678 const_offset % 16 / 4,
1679 const_offset % 16 / 4,
1680 const_offset % 16 / 4);
1681
1682 /* UBO bools are any nonzero int. We need to convert them to use the
1683 * value of true stored in ctx->Const.UniformBooleanTrue.
1684 */
1685 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1686 emit(CMP(result_dst, packed_consts, src_reg(0u),
1687 BRW_CONDITIONAL_NZ));
1688 if (ctx->Const.UniformBooleanTrue == 1) {
1689 emit(AND(result_dst, result, src_reg(1u)));
1690 }
1691 } else {
1692 emit(MOV(result_dst, packed_consts));
1693 }
1694 break;
1695 }
1696
1697 case ir_binop_vector_extract:
1698 unreachable("should have been lowered by vec_index_to_cond_assign");
1699
1700 case ir_triop_fma:
1701 op[0] = fix_3src_operand(op[0]);
1702 op[1] = fix_3src_operand(op[1]);
1703 op[2] = fix_3src_operand(op[2]);
1704 /* Note that the instruction's argument order is reversed from GLSL
1705 * and the IR.
1706 */
1707 emit(MAD(result_dst, op[2], op[1], op[0]));
1708 break;
1709
1710 case ir_triop_lrp:
1711 emit_lrp(result_dst, op[0], op[1], op[2]);
1712 break;
1713
1714 case ir_triop_csel:
1715 unreachable("already handled above");
1716 break;
1717
1718 case ir_triop_bfi:
1719 op[0] = fix_3src_operand(op[0]);
1720 op[1] = fix_3src_operand(op[1]);
1721 op[2] = fix_3src_operand(op[2]);
1722 emit(BFI2(result_dst, op[0], op[1], op[2]));
1723 break;
1724
1725 case ir_triop_bitfield_extract:
1726 op[0] = fix_3src_operand(op[0]);
1727 op[1] = fix_3src_operand(op[1]);
1728 op[2] = fix_3src_operand(op[2]);
1729 /* Note that the instruction's argument order is reversed from GLSL
1730 * and the IR.
1731 */
1732 emit(BFE(result_dst, op[2], op[1], op[0]));
1733 break;
1734
1735 case ir_triop_vector_insert:
1736 unreachable("should have been lowered by lower_vector_insert");
1737
1738 case ir_quadop_bitfield_insert:
1739 unreachable("not reached: should be handled by "
1740 "bitfield_insert_to_bfm_bfi\n");
1741
1742 case ir_quadop_vector:
1743 unreachable("not reached: should be handled by lower_quadop_vector");
1744
1745 case ir_unop_pack_half_2x16:
1746 emit_pack_half_2x16(result_dst, op[0]);
1747 break;
1748 case ir_unop_unpack_half_2x16:
1749 emit_unpack_half_2x16(result_dst, op[0]);
1750 break;
1751 case ir_unop_pack_snorm_2x16:
1752 case ir_unop_pack_snorm_4x8:
1753 case ir_unop_pack_unorm_2x16:
1754 case ir_unop_pack_unorm_4x8:
1755 case ir_unop_unpack_snorm_2x16:
1756 case ir_unop_unpack_snorm_4x8:
1757 case ir_unop_unpack_unorm_2x16:
1758 case ir_unop_unpack_unorm_4x8:
1759 unreachable("not reached: should be handled by lower_packing_builtins");
1760 case ir_unop_unpack_half_2x16_split_x:
1761 case ir_unop_unpack_half_2x16_split_y:
1762 case ir_binop_pack_half_2x16_split:
1763 case ir_unop_interpolate_at_centroid:
1764 case ir_binop_interpolate_at_sample:
1765 case ir_binop_interpolate_at_offset:
1766 unreachable("not reached: should not occur in vertex shader");
1767 case ir_binop_ldexp:
1768 unreachable("not reached: should be handled by ldexp_to_arith()");
1769 }
1770 }
1771
1772
1773 void
1774 vec4_visitor::visit(ir_swizzle *ir)
1775 {
1776 src_reg src;
1777 int i = 0;
1778 int swizzle[4];
1779
1780 /* Note that this is only swizzles in expressions, not those on the left
1781 * hand side of an assignment, which do write masking. See ir_assignment
1782 * for that.
1783 */
1784
1785 ir->val->accept(this);
1786 src = this->result;
1787 assert(src.file != BAD_FILE);
1788
1789 for (i = 0; i < ir->type->vector_elements; i++) {
1790 switch (i) {
1791 case 0:
1792 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1793 break;
1794 case 1:
1795 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1796 break;
1797 case 2:
1798 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1799 break;
1800 case 3:
1801 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1802 break;
1803 }
1804 }
1805 for (; i < 4; i++) {
1806 /* Replicate the last channel out. */
1807 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1808 }
1809
1810 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1811
1812 this->result = src;
1813 }
1814
1815 void
1816 vec4_visitor::visit(ir_dereference_variable *ir)
1817 {
1818 const struct glsl_type *type = ir->type;
1819 dst_reg *reg = variable_storage(ir->var);
1820
1821 if (!reg) {
1822 fail("Failed to find variable storage for %s\n", ir->var->name);
1823 this->result = src_reg(brw_null_reg());
1824 return;
1825 }
1826
1827 this->result = src_reg(*reg);
1828
1829 /* System values get their swizzle from the dst_reg writemask */
1830 if (ir->var->data.mode == ir_var_system_value)
1831 return;
1832
1833 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1834 this->result.swizzle = swizzle_for_size(type->vector_elements);
1835 }
1836
1837
1838 int
1839 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1840 {
1841 /* Under normal circumstances array elements are stored consecutively, so
1842 * the stride is equal to the size of the array element.
1843 */
1844 return type_size(ir->type);
1845 }
1846
1847
1848 void
1849 vec4_visitor::visit(ir_dereference_array *ir)
1850 {
1851 ir_constant *constant_index;
1852 src_reg src;
1853 int array_stride = compute_array_stride(ir);
1854
1855 constant_index = ir->array_index->constant_expression_value();
1856
1857 ir->array->accept(this);
1858 src = this->result;
1859
1860 if (constant_index) {
1861 src.reg_offset += constant_index->value.i[0] * array_stride;
1862 } else {
1863 /* Variable index array dereference. It eats the "vec4" of the
1864 * base of the array and an index that offsets the Mesa register
1865 * index.
1866 */
1867 ir->array_index->accept(this);
1868
1869 src_reg index_reg;
1870
1871 if (array_stride == 1) {
1872 index_reg = this->result;
1873 } else {
1874 index_reg = src_reg(this, glsl_type::int_type);
1875
1876 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1877 }
1878
1879 if (src.reladdr) {
1880 src_reg temp = src_reg(this, glsl_type::int_type);
1881
1882 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1883
1884 index_reg = temp;
1885 }
1886
1887 src.reladdr = ralloc(mem_ctx, src_reg);
1888 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1889 }
1890
1891 /* If the type is smaller than a vec4, replicate the last channel out. */
1892 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1893 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1894 else
1895 src.swizzle = BRW_SWIZZLE_NOOP;
1896 src.type = brw_type_for_base_type(ir->type);
1897
1898 this->result = src;
1899 }
1900
1901 void
1902 vec4_visitor::visit(ir_dereference_record *ir)
1903 {
1904 unsigned int i;
1905 const glsl_type *struct_type = ir->record->type;
1906 int offset = 0;
1907
1908 ir->record->accept(this);
1909
1910 for (i = 0; i < struct_type->length; i++) {
1911 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1912 break;
1913 offset += type_size(struct_type->fields.structure[i].type);
1914 }
1915
1916 /* If the type is smaller than a vec4, replicate the last channel out. */
1917 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1918 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1919 else
1920 this->result.swizzle = BRW_SWIZZLE_NOOP;
1921 this->result.type = brw_type_for_base_type(ir->type);
1922
1923 this->result.reg_offset += offset;
1924 }
1925
1926 /**
1927 * We want to be careful in assignment setup to hit the actual storage
1928 * instead of potentially using a temporary like we might with the
1929 * ir_dereference handler.
1930 */
1931 static dst_reg
1932 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1933 {
1934 /* The LHS must be a dereference. If the LHS is a variable indexed array
1935 * access of a vector, it must be separated into a series of conditional moves
1936 * before reaching this point (see ir_vec_index_to_cond_assign).
1937 */
1938 assert(ir->as_dereference());
1939 ir_dereference_array *deref_array = ir->as_dereference_array();
1940 if (deref_array) {
1941 assert(!deref_array->array->type->is_vector());
1942 }
1943
1944 /* Use the rvalue deref handler for the most part. We'll ignore
1945 * swizzles in it and write swizzles using writemask, though.
1946 */
1947 ir->accept(v);
1948 return dst_reg(v->result);
1949 }
1950
1951 void
1952 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1953 const struct glsl_type *type,
1954 enum brw_predicate predicate)
1955 {
1956 if (type->base_type == GLSL_TYPE_STRUCT) {
1957 for (unsigned int i = 0; i < type->length; i++) {
1958 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1959 }
1960 return;
1961 }
1962
1963 if (type->is_array()) {
1964 for (unsigned int i = 0; i < type->length; i++) {
1965 emit_block_move(dst, src, type->fields.array, predicate);
1966 }
1967 return;
1968 }
1969
1970 if (type->is_matrix()) {
1971 const struct glsl_type *vec_type;
1972
1973 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1974 type->vector_elements, 1);
1975
1976 for (int i = 0; i < type->matrix_columns; i++) {
1977 emit_block_move(dst, src, vec_type, predicate);
1978 }
1979 return;
1980 }
1981
1982 assert(type->is_scalar() || type->is_vector());
1983
1984 dst->type = brw_type_for_base_type(type);
1985 src->type = dst->type;
1986
1987 dst->writemask = (1 << type->vector_elements) - 1;
1988
1989 src->swizzle = swizzle_for_size(type->vector_elements);
1990
1991 vec4_instruction *inst = emit(MOV(*dst, *src));
1992 inst->predicate = predicate;
1993
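/* Advance to the next vec4; the struct, array and matrix cases above
 * recurse into this base case once per element, stepping through storage
 * one register at a time.
 */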
1994 dst->reg_offset++;
1995 src->reg_offset++;
1996 }
1997
1998
1999 /* If the RHS processing resulted in an instruction generating a
2000 * temporary value, and it would be easy to rewrite the instruction to
2001 * generate its result right into the LHS instead, do so. This ends
2002 * up reliably removing instructions where it can be tricky to do so
2003 * later without real UD chain information.
2004 */
2005 bool
2006 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2007 dst_reg dst,
2008 src_reg src,
2009 vec4_instruction *pre_rhs_inst,
2010 vec4_instruction *last_rhs_inst)
2011 {
2012 /* This could be supported, but it would take more smarts. */
2013 if (ir->condition)
2014 return false;
2015
2016 if (pre_rhs_inst == last_rhs_inst)
2017 return false; /* No instructions generated to work with. */
2018
2019 /* Make sure the last instruction generated our source reg. */
2020 if (src.file != GRF ||
2021 src.file != last_rhs_inst->dst.file ||
2022 src.reg != last_rhs_inst->dst.reg ||
2023 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2024 src.reladdr ||
2025 src.abs ||
2026 src.negate ||
2027 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2028 return false;
2029
2030 /* Check that the last instruction fully initialized the channels
2031 * we want to use, in the order we want to use them. We could
2032 * potentially reswizzle the operands of many instructions so that
2033 * we could handle out of order channels, but don't yet.
2034 */
2035
2036 for (unsigned i = 0; i < 4; i++) {
2037 if (dst.writemask & (1 << i)) {
2038 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2039 return false;
2040
2041 if (BRW_GET_SWZ(src.swizzle, i) != i)
2042 return false;
2043 }
2044 }
2045
2046 /* Success! Rewrite the instruction. */
2047 last_rhs_inst->dst.file = dst.file;
2048 last_rhs_inst->dst.reg = dst.reg;
2049 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2050 last_rhs_inst->dst.reladdr = dst.reladdr;
2051 last_rhs_inst->dst.writemask &= dst.writemask;
2052
2053 return true;
2054 }
2055
2056 void
2057 vec4_visitor::visit(ir_assignment *ir)
2058 {
2059 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2060 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2061
2062 if (!ir->lhs->type->is_scalar() &&
2063 !ir->lhs->type->is_vector()) {
2064 ir->rhs->accept(this);
2065 src_reg src = this->result;
2066
2067 if (ir->condition) {
2068 emit_bool_to_cond_code(ir->condition, &predicate);
2069 }
2070
2071 /* emit_block_move doesn't account for swizzles in the source register.
2072 * This should be ok, since the source register is a structure or an
2073 * array, and those can't be swizzled. But double-check to be sure.
2074 */
2075 assert(src.swizzle ==
2076 (ir->rhs->type->is_matrix()
2077 ? swizzle_for_size(ir->rhs->type->vector_elements)
2078 : BRW_SWIZZLE_NOOP));
2079
2080 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2081 return;
2082 }
2083
2084 /* Now we're down to just a scalar/vector with writemasks. */
2085 int i;
2086
2087 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2088 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2089
2090 ir->rhs->accept(this);
2091
2092 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2093
2094 src_reg src = this->result;
2095
2096 int swizzles[4];
2097 int first_enabled_chan = 0;
2098 int src_chan = 0;
2099
2100 assert(ir->lhs->type->is_vector() ||
2101 ir->lhs->type->is_scalar());
2102 dst.writemask = ir->write_mask;
2103
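/* Find the source channel of the first enabled write channel; it is used
 * below to fill the swizzle slots of channels that are not written.
 */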
2104 for (int i = 0; i < 4; i++) {
2105 if (dst.writemask & (1 << i)) {
2106 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2107 break;
2108 }
2109 }
2110
2111 /* Swizzle a small RHS vector into the channels being written.
2112 *
2113 * GLSL IR treats write_mask as dictating how many channels are
2114 * present on the RHS, while in our instructions we need to make
2115 * those channels appear in the slots of the vec4 they're written to.
2116 */
2117 for (int i = 0; i < 4; i++) {
2118 if (dst.writemask & (1 << i))
2119 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2120 else
2121 swizzles[i] = first_enabled_chan;
2122 }
2123 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2124 swizzles[2], swizzles[3]);
2125
2126 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2127 return;
2128 }
2129
2130 if (ir->condition) {
2131 emit_bool_to_cond_code(ir->condition, &predicate);
2132 }
2133
2134 for (i = 0; i < type_size(ir->lhs->type); i++) {
2135 vec4_instruction *inst = emit(MOV(dst, src));
2136 inst->predicate = predicate;
2137
2138 dst.reg_offset++;
2139 src.reg_offset++;
2140 }
2141 }
2142
2143 void
2144 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2145 {
2146 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2147 foreach_in_list(ir_constant, field_value, &ir->components) {
2148 emit_constant_values(dst, field_value);
2149 }
2150 return;
2151 }
2152
2153 if (ir->type->is_array()) {
2154 for (unsigned int i = 0; i < ir->type->length; i++) {
2155 emit_constant_values(dst, ir->array_elements[i]);
2156 }
2157 return;
2158 }
2159
2160 if (ir->type->is_matrix()) {
2161 for (int i = 0; i < ir->type->matrix_columns; i++) {
2162 float *vec = &ir->value.f[i * ir->type->vector_elements];
2163
2164 for (int j = 0; j < ir->type->vector_elements; j++) {
2165 dst->writemask = 1 << j;
2166 dst->type = BRW_REGISTER_TYPE_F;
2167
2168 emit(MOV(*dst, src_reg(vec[j])));
2169 }
2170 dst->reg_offset++;
2171 }
2172 return;
2173 }
2174
2175 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2176
2177 for (int i = 0; i < ir->type->vector_elements; i++) {
2178 if (!(remaining_writemask & (1 << i)))
2179 continue;
2180
2181 dst->writemask = 1 << i;
2182 dst->type = brw_type_for_base_type(ir->type);
2183
2184 /* Find other components that match the one we're about to
2185 * write. Emits fewer instructions for things like vec4(0.5,
2186 * 1.5, 1.5, 1.5).
2187 */
2188 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2189 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2190 if (ir->value.b[i] == ir->value.b[j])
2191 dst->writemask |= (1 << j);
2192 } else {
2193 /* u, i, and f storage all line up, so no need for a
2194 * switch case for comparing each type.
2195 */
2196 if (ir->value.u[i] == ir->value.u[j])
2197 dst->writemask |= (1 << j);
2198 }
2199 }
2200
2201 switch (ir->type->base_type) {
2202 case GLSL_TYPE_FLOAT:
2203 emit(MOV(*dst, src_reg(ir->value.f[i])));
2204 break;
2205 case GLSL_TYPE_INT:
2206 emit(MOV(*dst, src_reg(ir->value.i[i])));
2207 break;
2208 case GLSL_TYPE_UINT:
2209 emit(MOV(*dst, src_reg(ir->value.u[i])));
2210 break;
2211 case GLSL_TYPE_BOOL:
2212 emit(MOV(*dst,
2213 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2214 : 0u)));
2215 break;
2216 default:
2217 unreachable("Non-float/uint/int/bool constant");
2218 }
2219
2220 remaining_writemask &= ~dst->writemask;
2221 }
2222 dst->reg_offset++;
2223 }
2224
2225 void
2226 vec4_visitor::visit(ir_constant *ir)
2227 {
2228 dst_reg dst = dst_reg(this, ir->type);
2229 this->result = src_reg(dst);
2230
2231 emit_constant_values(&dst, ir);
2232 }
2233
2234 void
2235 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2236 {
2237 ir_dereference *deref = static_cast<ir_dereference *>(
2238 ir->actual_parameters.get_head());
2239 ir_variable *location = deref->variable_referenced();
2240 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2241 location->data.binding);
2242
2243 /* Calculate the surface offset */
2244 src_reg offset(this, glsl_type::uint_type);
2245 ir_dereference_array *deref_array = deref->as_dereference_array();
2246 if (deref_array) {
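/* For an array of counters the offset is the base atomic offset plus
 * index * ATOMIC_COUNTER_SIZE.
 */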
2247 deref_array->array_index->accept(this);
2248
2249 src_reg tmp(this, glsl_type::uint_type);
2250 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2251 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2252 } else {
2253 offset = location->data.atomic.offset;
2254 }
2255
2256 /* Emit the appropriate machine instruction */
2257 const char *callee = ir->callee->function_name();
2258 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2259
2260 if (!strcmp("__intrinsic_atomic_read", callee)) {
2261 emit_untyped_surface_read(surf_index, dst, offset);
2262
2263 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2264 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2265 src_reg(), src_reg());
2266
2267 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2268 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2269 src_reg(), src_reg());
2270 }
2271 }
2272
2273 void
2274 vec4_visitor::visit(ir_call *ir)
2275 {
2276 const char *callee = ir->callee->function_name();
2277
2278 if (!strcmp("__intrinsic_atomic_read", callee) ||
2279 !strcmp("__intrinsic_atomic_increment", callee) ||
2280 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2281 visit_atomic_counter_intrinsic(ir);
2282 } else {
2283 unreachable("Unsupported intrinsic.");
2284 }
2285 }
2286
2287 src_reg
2288 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2289 {
2290 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2291 inst->base_mrf = 2;
2292 inst->mlen = 1;
2293 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2294 inst->dst.writemask = WRITEMASK_XYZW;
2295
2296 inst->src[1] = sampler;
2297
2298 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2299 int param_base = inst->base_mrf;
2300 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2301 int zero_mask = 0xf & ~coord_mask;
2302
2303 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2304 coordinate));
2305
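/* Zero the unused coordinate channels so the whole payload register is
 * defined.
 */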
2306 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2307 src_reg(0)));
2308
2309 emit(inst);
2310 return src_reg(inst->dst);
2311 }
2312
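/* Sampler indices that can't be encoded in the 4-bit sampler field of the
 * message descriptor (>= 16, or not known at compile time) have to go
 * through the message header instead; only Haswell and later support this.
 */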
2313 static bool
2314 is_high_sampler(struct brw_context *brw, src_reg sampler)
2315 {
2316 if (brw->gen < 8 && !brw->is_haswell)
2317 return false;
2318
2319 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2320 }
2321
2322 void
2323 vec4_visitor::visit(ir_texture *ir)
2324 {
2325 uint32_t sampler =
2326 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2327
2328 ir_rvalue *nonconst_sampler_index =
2329 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2330
2331 /* Handle non-constant sampler array indexing */
2332 src_reg sampler_reg;
2333 if (nonconst_sampler_index) {
2334 /* The highest sampler which may be used by this operation is
2335 * the last element of the array. Mark it here, because the generator
2336 * doesn't have enough information to determine the bound.
2337 */
2338 uint32_t array_size = ir->sampler->as_dereference_array()
2339 ->array->type->array_size();
2340
2341 uint32_t max_used = sampler + array_size - 1;
2342 if (ir->op == ir_tg4 && brw->gen < 8) {
2343 max_used += prog_data->base.binding_table.gather_texture_start;
2344 } else {
2345 max_used += prog_data->base.binding_table.texture_start;
2346 }
2347
2348 brw_mark_surface_used(&prog_data->base, max_used);
2349
2350 /* Emit code to evaluate the actual indexing expression */
2351 nonconst_sampler_index->accept(this);
2352 dst_reg temp(this, glsl_type::uint_type);
2353 emit(ADD(temp, this->result, src_reg(sampler)))
2354 ->force_writemask_all = true;
2355 sampler_reg = src_reg(temp);
2356 } else {
2357 /* Single sampler, or constant array index; the indexing expression
2358 * is just an immediate.
2359 */
2360 sampler_reg = src_reg(sampler);
2361 }
2362
2363 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2364 * emitting anything other than setting up the constant result.
2365 */
2366 if (ir->op == ir_tg4) {
2367 ir_constant *chan = ir->lod_info.component->as_constant();
2368 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2369 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2370 dst_reg result(this, ir->type);
2371 this->result = src_reg(result);
2372 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2373 return;
2374 }
2375 }
2376
2377 /* Should be lowered by do_lower_texture_projection */
2378 assert(!ir->projector);
2379
2380 /* Should be lowered */
2381 assert(!ir->offset || !ir->offset->type->is_array());
2382
2383 /* Generate code to compute all the subexpression trees. This has to be
2384 * done before loading any values into MRFs for the sampler message since
2385 * generating these values may involve SEND messages that need the MRFs.
2386 */
2387 src_reg coordinate;
2388 if (ir->coordinate) {
2389 ir->coordinate->accept(this);
2390 coordinate = this->result;
2391 }
2392
2393 src_reg shadow_comparitor;
2394 if (ir->shadow_comparitor) {
2395 ir->shadow_comparitor->accept(this);
2396 shadow_comparitor = this->result;
2397 }
2398
2399 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2400 src_reg offset_value;
2401 if (has_nonconstant_offset) {
2402 ir->offset->accept(this);
2403 offset_value = src_reg(this->result);
2404 }
2405
2406 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2407 src_reg lod, dPdx, dPdy, sample_index, mcs;
2408 switch (ir->op) {
2409 case ir_tex:
2410 lod = src_reg(0.0f);
2411 lod_type = glsl_type::float_type;
2412 break;
2413 case ir_txf:
2414 case ir_txl:
2415 case ir_txs:
2416 ir->lod_info.lod->accept(this);
2417 lod = this->result;
2418 lod_type = ir->lod_info.lod->type;
2419 break;
2420 case ir_query_levels:
2421 lod = src_reg(0);
2422 lod_type = glsl_type::int_type;
2423 break;
2424 case ir_txf_ms:
2425 ir->lod_info.sample_index->accept(this);
2426 sample_index = this->result;
2427 sample_index_type = ir->lod_info.sample_index->type;
2428
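/* On Gen7+, surfaces using the compressed multisample layout also need
 * their MCS data, fetched with a separate TXF_MCS message.
 */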
2429 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2430 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2431 else
2432 mcs = src_reg(0u);
2433 break;
2434 case ir_txd:
2435 ir->lod_info.grad.dPdx->accept(this);
2436 dPdx = this->result;
2437
2438 ir->lod_info.grad.dPdy->accept(this);
2439 dPdy = this->result;
2440
2441 lod_type = ir->lod_info.grad.dPdx->type;
2442 break;
2443 case ir_txb:
2444 case ir_lod:
2445 case ir_tg4:
2446 break;
2447 }
2448
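/* Map the GLSL IR texture opcode onto a sampler message. The VS has no
 * implicit derivatives, so plain ir_tex is sent as TXL with the zero LOD
 * set up above.
 */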
2449 enum opcode opcode;
2450 switch (ir->op) {
2451 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2452 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2453 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2454 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2455 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2456 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2457 case ir_tg4: opcode = has_nonconstant_offset
2458 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2459 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2460 case ir_txb:
2461 unreachable("TXB is not valid for vertex shaders.");
2462 case ir_lod:
2463 unreachable("LOD is not valid for vertex shaders.");
2464 default:
2465 unreachable("Unrecognized tex op");
2466 }
2467
2468 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2469
2470 if (ir->offset != NULL && !has_nonconstant_offset) {
2471 inst->texture_offset =
2472 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2473 ir->offset->type->vector_elements);
2474 }
2475
2476 /* Stuff the channel select bits in the top of the texture offset */
2477 if (ir->op == ir_tg4)
2478 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2479
2480 /* The message header is necessary for:
2481 * - Gen4 (always)
2482 * - Texel offsets
2483 * - Gather channel selection
2484 * - Sampler indices too large to fit in a 4-bit value.
2485 */
2486 inst->header_present =
2487 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2488 is_high_sampler(brw, sampler_reg);
2489 inst->base_mrf = 2;
2490 inst->mlen = inst->header_present + 1; /* always at least one */
2491 inst->dst = dst_reg(this, ir->type);
2492 inst->dst.writemask = WRITEMASK_XYZW;
2493 inst->shadow_compare = ir->shadow_comparitor != NULL;
2494
2495 inst->src[1] = sampler_reg;
2496
2497 /* MRF for the first parameter */
2498 int param_base = inst->base_mrf + inst->header_present;
2499
2500 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2501 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2502 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2503 } else {
2504 /* Load the coordinate */
2505 /* FINISHME: gl_clamp_mask and saturate */
2506 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2507 int zero_mask = 0xf & ~coord_mask;
2508
2509 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2510 coordinate));
2511
2512 if (zero_mask != 0) {
2513 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2514 src_reg(0)));
2515 }
2516 /* Load the shadow comparitor */
2517 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2518 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2519 WRITEMASK_X),
2520 shadow_comparitor));
2521 inst->mlen++;
2522 }
2523
2524 /* Load the LOD info */
2525 if (ir->op == ir_tex || ir->op == ir_txl) {
2526 int mrf, writemask;
2527 if (brw->gen >= 5) {
2528 mrf = param_base + 1;
2529 if (ir->shadow_comparitor) {
2530 writemask = WRITEMASK_Y;
2531 /* mlen already incremented */
2532 } else {
2533 writemask = WRITEMASK_X;
2534 inst->mlen++;
2535 }
2536 } else /* brw->gen == 4 */ {
2537 mrf = param_base;
2538 writemask = WRITEMASK_W;
2539 }
2540 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2541 } else if (ir->op == ir_txf) {
2542 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2543 } else if (ir->op == ir_txf_ms) {
2544 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2545 sample_index));
2546 if (brw->gen >= 7) {
2547 /* MCS data is in the first channel of `mcs`, but we need to get it into
2548 * the .y channel of the second vec4 of params, so replicate .x across
2549 * the whole vec4 and then mask off everything except .y
2550 */
2551 mcs.swizzle = BRW_SWIZZLE_XXXX;
2552 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2553 mcs));
2554 }
2555 inst->mlen++;
2556 } else if (ir->op == ir_txd) {
2557 const glsl_type *type = lod_type;
2558
2559 if (brw->gen >= 5) {
2560 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2561 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2562 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2563 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2564 inst->mlen++;
2565
2566 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2567 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2568 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2569 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2570 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2571 inst->mlen++;
2572
2573 if (ir->shadow_comparitor) {
2574 emit(MOV(dst_reg(MRF, param_base + 2,
2575 ir->shadow_comparitor->type, WRITEMASK_Z),
2576 shadow_comparitor));
2577 }
2578 }
2579 } else /* brw->gen == 4 */ {
2580 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2581 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2582 inst->mlen += 2;
2583 }
2584 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2585 if (ir->shadow_comparitor) {
2586 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2587 shadow_comparitor));
2588 }
2589
2590 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2591 offset_value));
2592 inst->mlen++;
2593 }
2594 }
2595
2596 emit(inst);
2597
2598 /* Fix up the number of layers (z) for cube arrays: the hardware returns
2599 * faces * layers, but the spec requires just layers.
2600 */
2601 if (ir->op == ir_txs) {
2602 glsl_type const *type = ir->sampler->type;
2603 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2604 type->sampler_array) {
2605 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2606 writemask(inst->dst, WRITEMASK_Z),
2607 src_reg(inst->dst), src_reg(6));
2608 }
2609 }
2610
2611 if (brw->gen == 6 && ir->op == ir_tg4) {
2612 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2613 }
2614
2615 swizzle_result(ir, src_reg(inst->dst), sampler);
2616 }
2617
2618 /**
2619 * Apply workarounds for Gen6 gather with UINT/SINT
2620 */
2621 void
2622 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2623 {
2624 if (!wa)
2625 return;
2626
2627 int width = (wa & WA_8BIT) ? 8 : 16;
2628 dst_reg dst_f = dst;
2629 dst_f.type = BRW_REGISTER_TYPE_F;
2630
2631 /* Convert from UNORM to UINT */
2632 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2633 emit(MOV(dst, src_reg(dst_f)));
2634
2635 if (wa & WA_SIGN) {
2636 /* Reinterpret the UINT value as a signed INT value by
2637 * shifting the sign bit into place, then shifting back
2638 * preserving sign.
2639 */
2640 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2641 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2642 }
2643 }
2644
2645 /**
2646 * Set up the gather channel based on the swizzle, for gather4.
2647 */
2648 uint32_t
2649 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2650 {
2651 ir_constant *chan = ir->lod_info.component->as_constant();
2652 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2653 switch (swiz) {
2654 case SWIZZLE_X: return 0;
2655 case SWIZZLE_Y:
2656 /* gather4 sampler is broken for green channel on RG32F --
2657 * we must ask for blue instead.
2658 */
2659 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2660 return 2;
2661 return 1;
2662 case SWIZZLE_Z: return 2;
2663 case SWIZZLE_W: return 3;
2664 default:
2665 unreachable("Not reached"); /* zero, one swizzles handled already */
2666 }
2667 }
2668
2669 void
2670 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2671 {
2672 int s = key->tex.swizzles[sampler];
2673
2674 this->result = src_reg(this, ir->type);
2675 dst_reg swizzled_result(this->result);
2676
2677 if (ir->op == ir_query_levels) {
2678 /* # levels is in .w */
2679 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2680 emit(MOV(swizzled_result, orig_val));
2681 return;
2682 }
2683
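/* txs results, float results, identity swizzles, and gather (which applied
 * its channel select above) need no further swizzling.
 */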
2684 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2685 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2686 emit(MOV(swizzled_result, orig_val));
2687 return;
2688 }
2689
2690
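/* Split the texture swizzle into channels copied from the result and
 * channels forced to constant 0.0 or 1.0, and emit one MOV per group.
 */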
2691 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2692 int swizzle[4] = {0};
2693
2694 for (int i = 0; i < 4; i++) {
2695 switch (GET_SWZ(s, i)) {
2696 case SWIZZLE_ZERO:
2697 zero_mask |= (1 << i);
2698 break;
2699 case SWIZZLE_ONE:
2700 one_mask |= (1 << i);
2701 break;
2702 default:
2703 copy_mask |= (1 << i);
2704 swizzle[i] = GET_SWZ(s, i);
2705 break;
2706 }
2707 }
2708
2709 if (copy_mask) {
2710 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2711 swizzled_result.writemask = copy_mask;
2712 emit(MOV(swizzled_result, orig_val));
2713 }
2714
2715 if (zero_mask) {
2716 swizzled_result.writemask = zero_mask;
2717 emit(MOV(swizzled_result, src_reg(0.0f)));
2718 }
2719
2720 if (one_mask) {
2721 swizzled_result.writemask = one_mask;
2722 emit(MOV(swizzled_result, src_reg(1.0f)));
2723 }
2724 }
2725
2726 void
2727 vec4_visitor::visit(ir_return *)
2728 {
2729 unreachable("not reached");
2730 }
2731
2732 void
2733 vec4_visitor::visit(ir_discard *)
2734 {
2735 unreachable("not reached");
2736 }
2737
2738 void
2739 vec4_visitor::visit(ir_if *ir)
2740 {
2741 /* Don't point the annotation at the if statement, because then it plus
2742 * the then and else blocks get printed.
2743 */
2744 this->base_ir = ir->condition;
2745
2746 if (brw->gen == 6) {
2747 emit_if_gen6(ir);
2748 } else {
2749 enum brw_predicate predicate;
2750 emit_bool_to_cond_code(ir->condition, &predicate);
2751 emit(IF(predicate));
2752 }
2753
2754 visit_instructions(&ir->then_instructions);
2755
2756 if (!ir->else_instructions.is_empty()) {
2757 this->base_ir = ir->condition;
2758 emit(BRW_OPCODE_ELSE);
2759
2760 visit_instructions(&ir->else_instructions);
2761 }
2762
2763 this->base_ir = ir->condition;
2764 emit(BRW_OPCODE_ENDIF);
2765 }
2766
2767 void
2768 vec4_visitor::visit(ir_emit_vertex *)
2769 {
2770 unreachable("not reached");
2771 }
2772
2773 void
2774 vec4_visitor::visit(ir_end_primitive *)
2775 {
2776 unreachable("not reached");
2777 }
2778
2779 void
2780 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2781 dst_reg dst, src_reg offset,
2782 src_reg src0, src_reg src1)
2783 {
2784 unsigned mlen = 0;
2785
2786 /* Set the atomic operation offset. */
2787 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2788 mlen++;
2789
2790 /* Set the atomic operation arguments. */
2791 if (src0.file != BAD_FILE) {
2792 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2793 mlen++;
2794 }
2795
2796 if (src1.file != BAD_FILE) {
2797 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2798 mlen++;
2799 }
2800
2801 /* Emit the instruction. Note that this maps to the normal SIMD8
2802 * untyped atomic message on Ivy Bridge, but that's OK because
2803 * unused channels will be masked out.
2804 */
2805 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2806 src_reg(atomic_op), src_reg(surf_index));
2807 inst->base_mrf = 0;
2808 inst->mlen = mlen;
2809 }
2810
2811 void
2812 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2813 src_reg offset)
2814 {
2815 /* Set the surface read offset. */
2816 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2817
2818 /* Emit the instruction. Note that this maps to the normal SIMD8
2819 * untyped surface read message, but that's OK because unused
2820 * channels will be masked out.
2821 */
2822 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2823 dst, src_reg(surf_index));
2824 inst->base_mrf = 0;
2825 inst->mlen = 1;
2826 }
2827
2828 void
2829 vec4_visitor::emit_ndc_computation()
2830 {
2831 /* Get the position */
2832 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2833
2834 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2835 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2836 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2837
2838 current_annotation = "NDC";
2839 dst_reg ndc_w = ndc;
2840 ndc_w.writemask = WRITEMASK_W;
2841 src_reg pos_w = pos;
2842 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2843 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2844
2845 dst_reg ndc_xyz = ndc;
2846 ndc_xyz.writemask = WRITEMASK_XYZ;
2847
2848 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2849 }
2850
2851 void
2852 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2853 {
2854 if (brw->gen < 6 &&
2855 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2856 key->userclip_active || brw->has_negative_rhw_bug)) {
2857 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2858 dst_reg header1_w = header1;
2859 header1_w.writemask = WRITEMASK_W;
2860
2861 emit(MOV(header1, 0u));
2862
2863 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2864 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2865
2866 current_annotation = "Point size";
2867 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2868 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2869 }
2870
2871 if (key->userclip_active) {
2872 current_annotation = "Clipping flags";
2873 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2874 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2875
2876 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2877 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2878 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2879
2880 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2881 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2882 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2883 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2884 }
2885
2886 /* i965 clipping workaround:
2887 * 1) Test for -ve rhw
2888 * 2) If set,
2889 * set ndc = (0,0,0,0)
2890 * set ucp[6] = 1
2891 *
2892 * Later, clipping will detect ucp[6] and ensure the primitive is
2893 * clipped against all fixed planes.
2894 */
2895 if (brw->has_negative_rhw_bug) {
2896 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2897 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2898 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2899 vec4_instruction *inst;
2900 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2901 inst->predicate = BRW_PREDICATE_NORMAL;
2902 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2903 inst->predicate = BRW_PREDICATE_NORMAL;
2904 }
2905
2906 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2907 } else if (brw->gen < 6) {
2908 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2909 } else {
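/* Gen6+ VUE header: clear the whole slot, then write point size in .w,
 * layer in .y and viewport index in .z when the shader provides them.
 */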
2910 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2911 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2912 dst_reg reg_w = reg;
2913 reg_w.writemask = WRITEMASK_W;
2914 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
2915 }
2916 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2917 dst_reg reg_y = reg;
2918 reg_y.writemask = WRITEMASK_Y;
2919 reg_y.type = BRW_REGISTER_TYPE_D;
2920 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
2921 }
2922 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2923 dst_reg reg_z = reg;
2924 reg_z.writemask = WRITEMASK_Z;
2925 reg_z.type = BRW_REGISTER_TYPE_D;
2926 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2927 }
2928 }
2929 }
2930
2931 void
2932 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2933 {
2934 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2935 *
2936 * "If a linked set of shaders forming the vertex stage contains no
2937 * static write to gl_ClipVertex or gl_ClipDistance, but the
2938 * application has requested clipping against user clip planes through
2939 * the API, then the coordinate written to gl_Position is used for
2940 * comparison against the user clip planes."
2941 *
2942 * This function is only called if the shader didn't write to
2943 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2944 * if the user wrote to it; otherwise we use gl_Position.
2945 */
2946 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2947 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2948 clip_vertex = VARYING_SLOT_POS;
2949 }
2950
2951 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2952 ++i) {
2953 reg.writemask = 1 << i;
2954 emit(DP4(reg,
2955 src_reg(output_reg[clip_vertex]),
2956 src_reg(this->userplane[i + offset])));
2957 }
2958 }
2959
2960 void
2961 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2962 {
2963 assert (varying < VARYING_SLOT_MAX);
2964 reg.type = output_reg[varying].type;
2965 current_annotation = output_reg_annotation[varying];
2966 /* Copy the register, saturating if necessary */
2967 vec4_instruction *inst = emit(MOV(reg,
2968 src_reg(output_reg[varying])));
2969 if ((varying == VARYING_SLOT_COL0 ||
2970 varying == VARYING_SLOT_COL1 ||
2971 varying == VARYING_SLOT_BFC0 ||
2972 varying == VARYING_SLOT_BFC1) &&
2973 key->clamp_vertex_color) {
2974 inst->saturate = true;
2975 }
2976 }
2977
2978 void
2979 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
2980 {
2981 reg.type = BRW_REGISTER_TYPE_F;
2982
2983 switch (varying) {
2984 case VARYING_SLOT_PSIZ:
2985 {
2986 /* PSIZ is always in slot 0, and is coupled with other flags. */
2987 current_annotation = "indices, point width, clip flags";
2988 emit_psiz_and_flags(reg);
2989 break;
2990 }
2991 case BRW_VARYING_SLOT_NDC:
2992 current_annotation = "NDC";
2993 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2994 break;
2995 case VARYING_SLOT_POS:
2996 current_annotation = "gl_Position";
2997 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2998 break;
2999 case VARYING_SLOT_EDGE:
3000 /* This is present when doing unfilled polygons. We're supposed to copy
3001 * the edge flag from the user-provided vertex array
3002 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3003 * of that attribute (starts as 1.0f). This is then used in clipping to
3004 * determine which edges should be drawn as wireframe.
3005 */
3006 current_annotation = "edge flag";
3007 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3008 glsl_type::float_type, WRITEMASK_XYZW))));
3009 break;
3010 case BRW_VARYING_SLOT_PAD:
3011 /* No need to write to this slot */
3012 break;
3013 default:
3014 emit_generic_urb_slot(reg, varying);
3015 break;
3016 }
3017 }
3018
3019 static int
3020 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3021 {
3022 if (brw->gen >= 6) {
3023 /* URB data written (does not include the message header reg) must
3024 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3025 * section 5.4.3.2.2: URB_INTERLEAVED.
3026 *
3027 * URB entries are allocated on a multiple of 1024 bits, so an
3028 * extra 128 bits written here to make the end align to 256 is
3029 * no problem.
3030 */
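/* mlen includes the message header register, so the total length must be
 * odd for the data portion to be even.
 */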
3031 if ((mlen % 2) != 1)
3032 mlen++;
3033 }
3034
3035 return mlen;
3036 }
3037
3038
3039 /**
3040 * Generates the VUE payload plus the necessary URB write instructions to
3041 * output it.
3042 *
3043 * The VUE layout is documented in Volume 2a.
3044 */
3045 void
3046 vec4_visitor::emit_vertex()
3047 {
3048 /* MRF 0 is reserved for the debugger, so start with message header
3049 * in MRF 1.
3050 */
3051 int base_mrf = 1;
3052 int mrf = base_mrf;
3053 /* In the process of generating our URB write message contents, we
3054 * may need to unspill a register or load from an array. Those
3055 * reads would use MRFs 14-15.
3056 */
3057 int max_usable_mrf = 13;
3058
3059 /* The following assertion verifies that max_usable_mrf causes an
3060 * even-numbered amount of URB write data, which will meet gen6's
3061 * requirements for length alignment.
3062 */
3063 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3064
3065 /* First mrf is the g0-based message header containing URB handles and
3066 * such.
3067 */
3068 emit_urb_write_header(mrf++);
3069
3070 if (brw->gen < 6) {
3071 emit_ndc_computation();
3072 }
3073
3074 /* Lower legacy ff and ClipVertex clipping to clip distances */
3075 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3076 current_annotation = "user clip distances";
3077
3078 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3079 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3080
3081 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3082 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3083 }
3084
3085 /* We may need to split this up into several URB writes, so do them in a
3086 * loop.
3087 */
3088 int slot = 0;
3089 bool complete = false;
3090 do {
3091 /* URB offset is in URB row increments, and each of our MRFs is half of
3092 * one of those, since we're doing interleaved writes.
3093 */
3094 int offset = slot / 2;
3095
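/* Data for this write starts right after the message header at base_mrf,
 * which was filled in once above and is reused by every URB write in the
 * loop.
 */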
3096 mrf = base_mrf + 1;
3097 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3098 emit_urb_slot(dst_reg(MRF, mrf++),
3099 prog_data->vue_map.slot_to_varying[slot]);
3100
3101 /* If this was max_usable_mrf, we can't fit anything more into this
3102 * URB WRITE.
3103 */
3104 if (mrf > max_usable_mrf) {
3105 slot++;
3106 break;
3107 }
3108 }
3109
3110 complete = slot >= prog_data->vue_map.num_slots;
3111 current_annotation = "URB write";
3112 vec4_instruction *inst = emit_urb_write_opcode(complete);
3113 inst->base_mrf = base_mrf;
3114 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3115 inst->offset += offset;
3116 } while(!complete);
3117 }
3118
3119
3120 src_reg
3121 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3122 src_reg *reladdr, int reg_offset)
3123 {
3124 /* Because we store the values to scratch interleaved like our
3125 * vertex data, we need to scale the vec4 index by 2.
3126 */
3127 int message_header_scale = 2;
3128
3129 /* Pre-gen6, the message header uses byte offsets instead of vec4
3130 * (16-byte) offset units.
3131 */
3132 if (brw->gen < 6)
3133 message_header_scale *= 16;
3134
3135 if (reladdr) {
3136 src_reg index = src_reg(this, glsl_type::int_type);
3137
3138 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3139 src_reg(reg_offset)));
3140 emit_before(block, inst, MUL(dst_reg(index), index,
3141 src_reg(message_header_scale)));
3142
3143 return index;
3144 } else {
3145 return src_reg(reg_offset * message_header_scale);
3146 }
3147 }
3148
3149 src_reg
3150 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3151 src_reg *reladdr, int reg_offset)
3152 {
3153 if (reladdr) {
3154 src_reg index = src_reg(this, glsl_type::int_type);
3155
3156 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3157 src_reg(reg_offset)));
3158
3159 /* Pre-gen6, the message header uses byte offsets instead of vec4
3160 * (16-byte) offset units.
3161 */
3162 if (brw->gen < 6) {
3163 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3164 }
3165
3166 return index;
3167 } else if (brw->gen >= 8) {
3168 /* Store the offset in a GRF so we can send-from-GRF. */
3169 src_reg offset = src_reg(this, glsl_type::int_type);
3170 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3171 return offset;
3172 } else {
3173 int message_header_scale = brw->gen < 6 ? 16 : 1;
3174 return src_reg(reg_offset * message_header_scale);
3175 }
3176 }
3177
3178 /**
3179 * Emits an instruction before @inst to load the value named by @orig_src
3180 * from scratch space at @base_offset to @temp.
3181 *
3182 * @base_offset is measured in 32-byte units (the size of a register).
3183 */
3184 void
3185 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3186 dst_reg temp, src_reg orig_src,
3187 int base_offset)
3188 {
3189 int reg_offset = base_offset + orig_src.reg_offset;
3190 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3191 reg_offset);
3192
3193 emit_before(block, inst, SCRATCH_READ(temp, index));
3194 }
3195
3196 /**
3197 * Emits an instruction after @inst to store the value to be written
3198 * to @orig_dst to scratch space at @base_offset, from @temp.
3199 *
3200 * @base_offset is measured in 32-byte units (the size of a register).
3201 */
3202 void
3203 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3204 int base_offset)
3205 {
3206 int reg_offset = base_offset + inst->dst.reg_offset;
3207 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3208 reg_offset);
3209
3210 /* Create a temporary register to store *inst's result in.
3211 *
3212 * We have to be careful in MOVing from our temporary result register in
3213 * the scratch write. If we swizzle from channels of the temporary that
3214 * weren't initialized, it will confuse live interval analysis, which will
3215 * make spilling fail to make progress.
3216 */
3217 src_reg temp = src_reg(this, glsl_type::vec4_type);
3218 temp.type = inst->dst.type;
3219 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3220 int swizzles[4];
3221 for (int i = 0; i < 4; i++)
3222 if (inst->dst.writemask & (1 << i))
3223 swizzles[i] = i;
3224 else
3225 swizzles[i] = first_writemask_chan;
3226 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3227 swizzles[2], swizzles[3]);
3228
3229 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3230 inst->dst.writemask));
3231 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3232 write->predicate = inst->predicate;
3233 write->ir = inst->ir;
3234 write->annotation = inst->annotation;
3235 inst->insert_after(block, write);
3236
3237 inst->dst.file = temp.file;
3238 inst->dst.reg = temp.reg;
3239 inst->dst.reg_offset = temp.reg_offset;
3240 inst->dst.reladdr = NULL;
3241 }
3242
3243 /**
3244 * We can't generally support array access in GRF space, because a
3245 * single instruction's destination can only span 2 contiguous
3246 * registers. So, we send all GRF arrays that get variable index
3247 * access to scratch space.
3248 */
3249 void
3250 vec4_visitor::move_grf_array_access_to_scratch()
3251 {
3252 int scratch_loc[this->virtual_grf_count];
3253 memset(scratch_loc, -1, sizeof(scratch_loc));
3254
3255 /* First, calculate the set of virtual GRFs that need to be punted
3256 * to scratch due to having any array access on them, and where in
3257 * scratch.
3258 */
3259 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3260 if (inst->dst.file == GRF && inst->dst.reladdr &&
3261 scratch_loc[inst->dst.reg] == -1) {
3262 scratch_loc[inst->dst.reg] = c->last_scratch;
3263 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3264 }
3265
3266 for (int i = 0 ; i < 3; i++) {
3267 src_reg *src = &inst->src[i];
3268
3269 if (src->file == GRF && src->reladdr &&
3270 scratch_loc[src->reg] == -1) {
3271 scratch_loc[src->reg] = c->last_scratch;
3272 c->last_scratch += this->virtual_grf_sizes[src->reg];
3273 }
3274 }
3275 }
3276
3277 /* Now, for anything that will be accessed through scratch, rewrite
3278 * it to load/store. Note that this is a _safe list walk, because
3279 * we may generate a new scratch_write instruction after the one
3280 * we're processing.
3281 */
3282 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3283 /* Set up the annotation tracking for new generated instructions. */
3284 base_ir = inst->ir;
3285 current_annotation = inst->annotation;
3286
3287 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3288 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3289 }
3290
3291 for (int i = 0 ; i < 3; i++) {
3292 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3293 continue;
3294
3295 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3296
3297 emit_scratch_read(block, inst, temp, inst->src[i],
3298 scratch_loc[inst->src[i].reg]);
3299
3300 inst->src[i].file = temp.file;
3301 inst->src[i].reg = temp.reg;
3302 inst->src[i].reg_offset = temp.reg_offset;
3303 inst->src[i].reladdr = NULL;
3304 }
3305 }
3306 }
3307
3308 /**
3309 * Emits an instruction before @inst to load the value named by @orig_src
3310 * from the pull constant buffer (surface) at @base_offset to @temp.
3311 */
3312 void
3313 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3314 dst_reg temp, src_reg orig_src,
3315 int base_offset)
3316 {
3317 int reg_offset = base_offset + orig_src.reg_offset;
3318 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3319 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3320 reg_offset);
3321 vec4_instruction *load;
3322
3323 if (brw->gen >= 7) {
3324 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3325 grf_offset.type = offset.type;
3326 emit_before(block, inst, MOV(grf_offset, offset));
3327
3328 load = new(mem_ctx) vec4_instruction(this,
3329 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3330 temp, index, src_reg(grf_offset));
3331 } else {
3332 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3333 temp, index, offset);
3334 load->base_mrf = 14;
3335 load->mlen = 1;
3336 }
3337 emit_before(block, inst, load);
3338 }
3339
3340 /**
3341 * Implements array access of uniforms by inserting a
3342 * PULL_CONSTANT_LOAD instruction.
3343 *
3344 * Unlike temporary GRF array access (where we don't support it due to
3345 * the difficulty of doing relative addressing on instruction
3346 * destinations), we could potentially do array access of uniforms
3347 * that were loaded in GRF space as push constants. In real-world
3348 * usage we've seen, though, the arrays being used are always larger
3349 * than we could load as push constants, so just always move all
3350 * uniform array access out to a pull constant buffer.
3351 */
3352 void
3353 vec4_visitor::move_uniform_array_access_to_pull_constants()
3354 {
3355 int pull_constant_loc[this->uniforms];
3356 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3357
3358 /* Walk through and find array access of uniforms. Put a copy of that
3359 * uniform in the pull constant buffer.
3360 *
3361 * Note that we don't move constant-indexed accesses to arrays. No
3362 * testing has been done of the performance impact of this choice.
3363 */
3364 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3365 for (int i = 0 ; i < 3; i++) {
3366 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3367 continue;
3368
3369 int uniform = inst->src[i].reg;
3370
3371 /* If this array isn't already present in the pull constant buffer,
3372 * add it.
3373 */
3374 if (pull_constant_loc[uniform] == -1) {
3375 const gl_constant_value **values =
3376 &stage_prog_data->param[uniform * 4];
3377
3378 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3379
3380 assert(uniform < uniform_array_size);
3381 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3382 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3383 = values[j];
3384 }
3385 }
3386
3387 /* Set up the annotation tracking for new generated instructions. */
3388 base_ir = inst->ir;
3389 current_annotation = inst->annotation;
3390
3391 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3392
3393 emit_pull_constant_load(block, inst, temp, inst->src[i],
3394 pull_constant_loc[uniform]);
3395
3396 inst->src[i].file = temp.file;
3397 inst->src[i].reg = temp.reg;
3398 inst->src[i].reg_offset = temp.reg_offset;
3399 inst->src[i].reladdr = NULL;
3400 }
3401 }
3402
3403 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3404 * no need to track them as larger-than-vec4 objects. This will be
3405 * relied on in cutting out unused uniform vectors from push
3406 * constants.
3407 */
3408 split_uniform_registers();
3409 }
3410
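/* Resolve a negation source modifier on a UD operand by materializing the
 * negated value into a temporary with a MOV and using that instead.
 */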
3411 void
3412 vec4_visitor::resolve_ud_negate(src_reg *reg)
3413 {
3414 if (reg->type != BRW_REGISTER_TYPE_UD ||
3415 !reg->negate)
3416 return;
3417
3418 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3419 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3420 *reg = temp;
3421 }
3422
3423 vec4_visitor::vec4_visitor(struct brw_context *brw,
3424 struct brw_vec4_compile *c,
3425 struct gl_program *prog,
3426 const struct brw_vec4_prog_key *key,
3427 struct brw_vec4_prog_data *prog_data,
3428 struct gl_shader_program *shader_prog,
3429 gl_shader_stage stage,
3430 void *mem_ctx,
3431 bool debug_flag,
3432 bool no_spills,
3433 shader_time_shader_type st_base,
3434 shader_time_shader_type st_written,
3435 shader_time_shader_type st_reset)
3436 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3437 c(c),
3438 key(key),
3439 prog_data(prog_data),
3440 sanity_param_count(0),
3441 fail_msg(NULL),
3442 first_non_payload_grf(0),
3443 need_all_constants_in_pull_buffer(false),
3444 debug_flag(debug_flag),
3445 no_spills(no_spills),
3446 st_base(st_base),
3447 st_written(st_written),
3448 st_reset(st_reset)
3449 {
3450 this->mem_ctx = mem_ctx;
3451 this->failed = false;
3452
3453 this->base_ir = NULL;
3454 this->current_annotation = NULL;
3455 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3456
3457 this->variable_ht = hash_table_ctor(0,
3458 hash_table_pointer_hash,
3459 hash_table_pointer_compare);
3460
3461 this->virtual_grf_start = NULL;
3462 this->virtual_grf_end = NULL;
3463 this->virtual_grf_sizes = NULL;
3464 this->virtual_grf_count = 0;
3465 this->virtual_grf_reg_map = NULL;
3466 this->virtual_grf_reg_count = 0;
3467 this->virtual_grf_array_size = 0;
3468 this->live_intervals_valid = false;
3469
3470 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3471
3472 this->uniforms = 0;
3473
3474 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3475 * at least one. See setup_uniforms() in brw_vec4.cpp.
3476 */
3477 this->uniform_array_size = 1;
3478 if (prog_data) {
3479 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3480 }
3481
3482 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3483 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3484 }
3485
3486 vec4_visitor::~vec4_visitor()
3487 {
3488 hash_table_dtor(this->variable_ht);
3489 }
3490
3491
3492 void
3493 vec4_visitor::fail(const char *format, ...)
3494 {
3495 va_list va;
3496 char *msg;
3497
3498 if (failed)
3499 return;
3500
3501 failed = true;
3502
3503 va_start(va, format);
3504 msg = ralloc_vasprintf(mem_ctx, format, va);
3505 va_end(va);
3506 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3507
3508 this->fail_msg = msg;
3509
3510 if (debug_flag) {
3511 fprintf(stderr, "%s", msg);
3512 }
3513 }
3514
3515 } /* namespace brw */