i965: Use unreachable() instead of unconditional assert().
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->sampler = 0;
49 this->texture_offset = 0;
50 this->target = 0;
51 this->shadow_compare = false;
52 this->ir = v->base_ir;
53 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
54 this->header_present = false;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = v->current_annotation;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 this->instructions.push_tail(inst);
65
66 return inst;
67 }
68
69 vec4_instruction *
70 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
71 {
72 new_inst->ir = inst->ir;
73 new_inst->annotation = inst->annotation;
74
75 inst->insert_before(new_inst);
76
77 return inst;
78 }
79
80 vec4_instruction *
81 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
82 src_reg src0, src_reg src1, src_reg src2)
83 {
84 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
85 src0, src1, src2));
86 }
87
88
89 vec4_instruction *
90 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
91 {
92 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
93 }
94
95 vec4_instruction *
96 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
97 {
98 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
99 }
100
101 vec4_instruction *
102 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
103 {
104 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode)
109 {
110 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
111 }
112
113 #define ALU1(op) \
114 vec4_instruction * \
115 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
116 { \
117 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
118 src0); \
119 }
120
121 #define ALU2(op) \
122 vec4_instruction * \
123 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
124 const src_reg &src1) \
125 { \
126 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
127 src0, src1); \
128 }
129
130 #define ALU2_ACC(op) \
131 vec4_instruction * \
132 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
133 const src_reg &src1) \
134 { \
135 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
136 BRW_OPCODE_##op, dst, src0, src1); \
137 inst->writes_accumulator = true; \
138 return inst; \
139 }
140
141 #define ALU3(op) \
142 vec4_instruction * \
143 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
144 const src_reg &src1, const src_reg &src2) \
145 { \
146 assert(brw->gen >= 6); \
147 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
148 src0, src1, src2); \
149 }
150
151 ALU1(NOT)
152 ALU1(MOV)
153 ALU1(FRC)
154 ALU1(RNDD)
155 ALU1(RNDE)
156 ALU1(RNDZ)
157 ALU1(F32TO16)
158 ALU1(F16TO32)
159 ALU2(ADD)
160 ALU2(MUL)
161 ALU2_ACC(MACH)
162 ALU2(AND)
163 ALU2(OR)
164 ALU2(XOR)
165 ALU2(DP3)
166 ALU2(DP4)
167 ALU2(DPH)
168 ALU2(SHL)
169 ALU2(SHR)
170 ALU2(ASR)
171 ALU3(LRP)
172 ALU1(BFREV)
173 ALU3(BFE)
174 ALU2(BFI1)
175 ALU3(BFI2)
176 ALU1(FBH)
177 ALU1(FBL)
178 ALU1(CBIT)
179 ALU3(MAD)
180 ALU2_ACC(ADDC)
181 ALU2_ACC(SUBB)
182 ALU2(MAC)
183
184 /** Gen4 predicated IF. */
185 vec4_instruction *
186 vec4_visitor::IF(uint32_t predicate)
187 {
188 vec4_instruction *inst;
189
190 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
191 inst->predicate = predicate;
192
193 return inst;
194 }
195
196 /** Gen6 IF with embedded comparison. */
197 vec4_instruction *
198 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
221 {
222 vec4_instruction *inst;
223
224 /* original gen4 does type conversion to the destination type
225 * before comparison, producing garbage results for floating
226 * point comparisons.
227 */
228 if (brw->gen == 4) {
229 dst.type = src0.type;
230 if (dst.file == HW_REG)
231 dst.fixed_hw_reg.type = dst.type;
232 }
233
234 resolve_ud_negate(&src0);
235 resolve_ud_negate(&src1);
236
237 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
238 inst->conditional_mod = condition;
239
240 return inst;
241 }
242
243 vec4_instruction *
244 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
245 {
246 vec4_instruction *inst;
247
248 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
249 dst, index);
250 inst->base_mrf = 14;
251 inst->mlen = 2;
252
253 return inst;
254 }
255
256 vec4_instruction *
257 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
258 const src_reg &index)
259 {
260 vec4_instruction *inst;
261
262 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
263 dst, src, index);
264 inst->base_mrf = 13;
265 inst->mlen = 3;
266
267 return inst;
268 }
269
270 void
271 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
272 {
273 static enum opcode dot_opcodes[] = {
274 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
275 };
276
277 emit(dot_opcodes[elements - 2], dst, src0, src1);
278 }
279
280 src_reg
281 vec4_visitor::fix_3src_operand(src_reg src)
282 {
283 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
284 * able to use vertical stride of zero to replicate the vec4 uniform, like
285 *
286 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
287 *
288 * But you can't, since vertical stride is always four in three-source
289 * instructions. Instead, insert a MOV instruction to do the replication so
290 * that the three-source instruction can consume it.
291 */
292
293 /* The MOV is only needed if the source is a uniform or immediate. */
294 if (src.file != UNIFORM && src.file != IMM)
295 return src;
296
297 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
298 return src;
299
300 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
301 expanded.type = src.type;
302 emit(MOV(expanded, src));
303 return src_reg(expanded);
304 }
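
/* Illustrative sketch (illustration only; register names are made up): a MAD
 * whose first source is a fully swizzled vec4 uniform, e.g.
 *
 *    mad dst, u0.xyzw, g1, g2
 *
 * cannot consume the uniform directly, so fix_3src_operand() first emits
 *
 *    mov tmpN, u0.xyzw
 *    mad dst, tmpN, g1, g2
 *
 * where tmpN is a fresh GRF temporary. A uniform with a single-value swizzle
 * (e.g. u0.xxxx) is returned unchanged by the early-out above.
 */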
305
306 src_reg
307 vec4_visitor::fix_math_operand(src_reg src)
308 {
309 /* The gen6 math instruction ignores the source modifiers --
310 * swizzle, abs, negate, and at least some parts of the register
311 * region description.
312 *
313 * Rather than trying to enumerate all these cases, *always* expand the
314 * operand to a temp GRF for gen6.
315 *
316 * For gen7, keep the operand as-is, except if immediate, which gen7 still
317 * can't use.
318 */
319
320 if (brw->gen == 7 && src.file != IMM)
321 return src;
322
323 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
324 expanded.type = src.type;
325 emit(MOV(expanded, src));
326 return src_reg(expanded);
327 }
328
329 void
330 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
331 {
332 src = fix_math_operand(src);
333
334 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
335 /* The gen6 math instruction must be align1, so we can't do
336 * writemasks.
337 */
338 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
339
340 emit(opcode, temp_dst, src);
341
342 emit(MOV(dst, src_reg(temp_dst)));
343 } else {
344 emit(opcode, dst, src);
345 }
346 }
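
/* For instance (illustration only): on gen6, "math rcp dst.x, src" cannot be
 * emitted as-is because the math instruction must run in align1 mode, where
 * the vec4 writemask cannot be expressed, so the code above computes into a
 * full vec4 temporary and then applies the writemask with "mov dst.x, tmp".
 */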
347
348 void
349 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
350 {
351 vec4_instruction *inst = emit(opcode, dst, src);
352 inst->base_mrf = 1;
353 inst->mlen = 1;
354 }
355
356 void
357 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
358 {
359 switch (opcode) {
360 case SHADER_OPCODE_RCP:
361 case SHADER_OPCODE_RSQ:
362 case SHADER_OPCODE_SQRT:
363 case SHADER_OPCODE_EXP2:
364 case SHADER_OPCODE_LOG2:
365 case SHADER_OPCODE_SIN:
366 case SHADER_OPCODE_COS:
367 break;
368 default:
369 unreachable("not reached: bad math opcode");
370 }
371
372 if (brw->gen >= 8) {
373 emit(opcode, dst, src);
374 } else if (brw->gen >= 6) {
375 emit_math1_gen6(opcode, dst, src);
376 } else {
377 emit_math1_gen4(opcode, dst, src);
378 }
379 }
380
381 void
382 vec4_visitor::emit_math2_gen6(enum opcode opcode,
383 dst_reg dst, src_reg src0, src_reg src1)
384 {
385 src0 = fix_math_operand(src0);
386 src1 = fix_math_operand(src1);
387
388 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
389 /* The gen6 math instruction must be align1, so we can't do
390 * writemasks.
391 */
392 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
393 temp_dst.type = dst.type;
394
395 emit(opcode, temp_dst, src0, src1);
396
397 emit(MOV(dst, src_reg(temp_dst)));
398 } else {
399 emit(opcode, dst, src0, src1);
400 }
401 }
402
403 void
404 vec4_visitor::emit_math2_gen4(enum opcode opcode,
405 dst_reg dst, src_reg src0, src_reg src1)
406 {
407 vec4_instruction *inst = emit(opcode, dst, src0, src1);
408 inst->base_mrf = 1;
409 inst->mlen = 2;
410 }
411
412 void
413 vec4_visitor::emit_math(enum opcode opcode,
414 dst_reg dst, src_reg src0, src_reg src1)
415 {
416 switch (opcode) {
417 case SHADER_OPCODE_POW:
418 case SHADER_OPCODE_INT_QUOTIENT:
419 case SHADER_OPCODE_INT_REMAINDER:
420 break;
421 default:
422 unreachable("not reached: unsupported binary math opcode");
423 }
424
425 if (brw->gen >= 8) {
426 emit(opcode, dst, src0, src1);
427 } else if (brw->gen >= 6) {
428 emit_math2_gen6(opcode, dst, src0, src1);
429 } else {
430 emit_math2_gen4(opcode, dst, src0, src1);
431 }
432 }
433
434 void
435 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
436 {
437 if (brw->gen < 7) {
438 unreachable("ir_unop_pack_half_2x16 should be lowered");
439 }
440
441 assert(dst.type == BRW_REGISTER_TYPE_UD);
442 assert(src0.type == BRW_REGISTER_TYPE_F);
443
444 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
445 *
446 * Because this instruction does not have a 16-bit floating-point type,
447 * the destination data type must be Word (W).
448 *
449 * The destination must be DWord-aligned and specify a horizontal stride
450 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
451 * each destination channel and the upper word is not modified.
452 *
453 * The above restriction implies that the f32to16 instruction must use
454 * align1 mode, because only in align1 mode is it possible to specify
455 * horizontal stride. We choose here to defy the hardware docs and emit
456 * align16 instructions.
457 *
458 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
459 * instructions. I was partially successful in that the code passed all
460 * tests. However, the code was dubiously correct and fragile, and the
461 * tests were not harsh enough to probe that frailty. Not trusting the
462 * code, I chose instead to remain in align16 mode in defiance of the hw
463 * docs).
464 *
465 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
466 * simulator, emitting a f32to16 in align16 mode with UD as destination
467 * data type is safe. The behavior differs from that specified in the PRM
468 * in that the upper word of each destination channel is cleared to 0.
469 */
470
471 dst_reg tmp_dst(this, glsl_type::uvec2_type);
472 src_reg tmp_src(tmp_dst);
473
474 #if 0
475 /* Verify the undocumented behavior on which the following instructions
476 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
477 * then the result of the bit-or instruction below will be incorrect.
478 *
479 * You should inspect the disasm output in order to verify that the MOV is
480 * not optimized away.
481 */
482 emit(MOV(tmp_dst, src_reg(0x12345678u)));
483 #endif
484
485 /* Give tmp the form below, where "." means untouched.
486 *
487 * w z y x w z y x
488 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
489 *
490 * That the upper word of each write-channel be 0 is required for the
491 * following bit-shift and bit-or instructions to work. Note that this
492 * relies on the undocumented hardware behavior mentioned above.
493 */
494 tmp_dst.writemask = WRITEMASK_XY;
495 emit(F32TO16(tmp_dst, src0));
496
497 /* Give the write-channels of dst the form:
498 * 0xhhhh0000
499 */
500 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
501 emit(SHL(dst, tmp_src, src_reg(16u)));
502
503 /* Finally, give the write-channels of dst the form of packHalf2x16's
504 * output:
505 * 0xhhhhllll
506 */
507 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
508 emit(OR(dst, src_reg(dst), tmp_src));
509 }
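
/* Worked example (illustration only): packing vec2(1.0, 1.0), where
 * half(1.0) == 0x3c00, the sequence above produces
 *
 *    f32to16  tmp.xy, src          tmp.x = tmp.y = 0x00003c00
 *    shl      dst, tmp.yyyy, 16    dst channels  = 0x3c000000
 *    or       dst, dst, tmp.xxxx   dst channels  = 0x3c003c00
 *
 * matching packHalf2x16(vec2(1.0, 1.0)) == 0x3c003c00u.
 */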
510
511 void
512 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
513 {
514 if (brw->gen < 7) {
515 unreachable("ir_unop_unpack_half_2x16 should be lowered");
516 }
517
518 assert(dst.type == BRW_REGISTER_TYPE_F);
519 assert(src0.type == BRW_REGISTER_TYPE_UD);
520
521 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
522 *
523 * Because this instruction does not have a 16-bit floating-point type,
524 * the source data type must be Word (W). The destination type must be
525 * F (Float).
526 *
527 * To use W as the source data type, we must adjust horizontal strides,
528 * which is only possible in align1 mode. All my [chadv] attempts at
529 * emitting align1 instructions for unpackHalf2x16 failed to pass the
530 * Piglit tests, so I gave up.
531 *
532 * I've verified that, on gen7 hardware and the simulator, it is safe to
533 * emit f16to32 in align16 mode with UD as source data type.
534 */
535
536 dst_reg tmp_dst(this, glsl_type::uvec2_type);
537 src_reg tmp_src(tmp_dst);
538
539 tmp_dst.writemask = WRITEMASK_X;
540 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
541
542 tmp_dst.writemask = WRITEMASK_Y;
543 emit(SHR(tmp_dst, src0, src_reg(16u)));
544
545 dst.writemask = WRITEMASK_XY;
546 emit(F16TO32(dst, tmp_src));
547 }
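
/* Worked example (illustration only): unpacking 0x40003c00, i.e. half(1.0)
 * in the low word and half(2.0) in the high word:
 *
 *    and      tmp.x, src, 0xffff   tmp.x  = 0x00003c00
 *    shr      tmp.y, src, 16       tmp.y  = 0x00004000
 *    f16to32  dst.xy, tmp          dst.xy = (1.0, 2.0)
 *
 * matching unpackHalf2x16(0x40003c00u) == vec2(1.0, 2.0).
 */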
548
549 void
550 vec4_visitor::visit_instructions(const exec_list *list)
551 {
552 foreach_in_list(ir_instruction, ir, list) {
553 base_ir = ir;
554 ir->accept(this);
555 }
556 }
557
558
559 static int
560 type_size(const struct glsl_type *type)
561 {
562 unsigned int i;
563 int size;
564
565 switch (type->base_type) {
566 case GLSL_TYPE_UINT:
567 case GLSL_TYPE_INT:
568 case GLSL_TYPE_FLOAT:
569 case GLSL_TYPE_BOOL:
570 if (type->is_matrix()) {
571 return type->matrix_columns;
572 } else {
573 /* Regardless of size of vector, it gets a vec4. This is bad
574 * packing for things like floats, but otherwise arrays become a
575 * mess. Hopefully a later pass over the code can pack scalars
576 * down if appropriate.
577 */
578 return 1;
579 }
580 case GLSL_TYPE_ARRAY:
581 assert(type->length > 0);
582 return type_size(type->fields.array) * type->length;
583 case GLSL_TYPE_STRUCT:
584 size = 0;
585 for (i = 0; i < type->length; i++) {
586 size += type_size(type->fields.structure[i].type);
587 }
588 return size;
589 case GLSL_TYPE_SAMPLER:
590 /* Samplers take up one slot in UNIFORMS[], but they're baked in
591 * at link time.
592 */
593 return 1;
594 case GLSL_TYPE_ATOMIC_UINT:
595 return 0;
596 case GLSL_TYPE_IMAGE:
597 case GLSL_TYPE_VOID:
598 case GLSL_TYPE_ERROR:
599 case GLSL_TYPE_INTERFACE:
600 unreachable("not reached");
601 }
602
603 return 0;
604 }
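
/* Example of the slot counting above (illustration only; the struct is made
 * up):
 *
 *    struct S { vec3 v; float f[2]; mat3 m; };
 *
 * takes 1 (vec3 padded to a vec4) + 2 (one vec4 per array element)
 * + 3 (one vec4 per matrix column) = 6 vec4 uniform slots.
 */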
605
606 int
607 vec4_visitor::virtual_grf_alloc(int size)
608 {
609 if (virtual_grf_array_size <= virtual_grf_count) {
610 if (virtual_grf_array_size == 0)
611 virtual_grf_array_size = 16;
612 else
613 virtual_grf_array_size *= 2;
614 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
615 virtual_grf_array_size);
616 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
617 virtual_grf_array_size);
618 }
619 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
620 virtual_grf_reg_count += size;
621 virtual_grf_sizes[virtual_grf_count] = size;
622 return virtual_grf_count++;
623 }
624
625 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
626 {
627 init();
628
629 this->file = GRF;
630 this->reg = v->virtual_grf_alloc(type_size(type));
631
632 if (type->is_array() || type->is_record()) {
633 this->swizzle = BRW_SWIZZLE_NOOP;
634 } else {
635 this->swizzle = swizzle_for_size(type->vector_elements);
636 }
637
638 this->type = brw_type_for_base_type(type);
639 }
640
641 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
642 {
643 init();
644
645 this->file = GRF;
646 this->reg = v->virtual_grf_alloc(type_size(type));
647
648 if (type->is_array() || type->is_record()) {
649 this->writemask = WRITEMASK_XYZW;
650 } else {
651 this->writemask = (1 << type->vector_elements) - 1;
652 }
653
654 this->type = brw_type_for_base_type(type);
655 }
656
657 /* Our support for uniforms is piggy-backed on the struct
658 * gl_fragment_program, because that's where the values actually
659 * get stored, rather than in some global gl_shader_program uniform
660 * store.
661 */
662 void
663 vec4_visitor::setup_uniform_values(ir_variable *ir)
664 {
665 int namelen = strlen(ir->name);
666
667 /* The data for our (non-builtin) uniforms is stored in a series of
668 * gl_uniform_driver_storage structs for each subcomponent that
669 * glGetUniformLocation() could name. We know it's been set up in the same
670 * order we'd walk the type, so walk the list of storage and find anything
671 * with our name, or the prefix of a component that starts with our name.
672 */
673 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
674 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
675
676 if (strncmp(ir->name, storage->name, namelen) != 0 ||
677 (storage->name[namelen] != 0 &&
678 storage->name[namelen] != '.' &&
679 storage->name[namelen] != '[')) {
680 continue;
681 }
682
683 gl_constant_value *components = storage->storage;
684 unsigned vector_count = (MAX2(storage->array_elements, 1) *
685 storage->type->matrix_columns);
686
687 for (unsigned s = 0; s < vector_count; s++) {
688 assert(uniforms < uniform_array_size);
689 uniform_vector_size[uniforms] = storage->type->vector_elements;
690
691 int i;
692 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
693 stage_prog_data->param[uniforms * 4 + i] = &components->f;
694 components++;
695 }
696 for (; i < 4; i++) {
697 static float zero = 0;
698 stage_prog_data->param[uniforms * 4 + i] = &zero;
699 }
700
701 uniforms++;
702 }
703 }
704 }
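
/* Example (illustration only): a user uniform declared "uniform mat2 foo[3];"
 * appears as one gl_uniform_storage entry with array_elements = 3 and
 * matrix_columns = 2, so the loop above registers 3 * 2 = 6 uniform vectors,
 * each with uniform_vector_size = 2 and the unused third and fourth params
 * pointing at the shared zero constant.
 */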
705
706 void
707 vec4_visitor::setup_uniform_clipplane_values()
708 {
709 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
710
711 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
712 assert(this->uniforms < uniform_array_size);
713 this->uniform_vector_size[this->uniforms] = 4;
714 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
715 this->userplane[i].type = BRW_REGISTER_TYPE_F;
716 for (int j = 0; j < 4; ++j) {
717 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
718 }
719 ++this->uniforms;
720 }
721 }
722
723 /* Our support for builtin uniforms is even scarier than non-builtin.
724 * It sits on top of the PROG_STATE_VAR parameters that are
725 * automatically updated from GL context state.
726 */
727 void
728 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
729 {
730 const ir_state_slot *const slots = ir->state_slots;
731 assert(ir->state_slots != NULL);
732
733 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
734 /* This state reference has already been setup by ir_to_mesa,
735 * but we'll get the same index back here. We can reference
736 * ParameterValues directly, since unlike brw_fs.cpp, we never
737 * add new state references during compile.
738 */
739 int index = _mesa_add_state_reference(this->prog->Parameters,
740 (gl_state_index *)slots[i].tokens);
741 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
742
743 assert(this->uniforms < uniform_array_size);
744 this->uniform_vector_size[this->uniforms] = 0;
745 /* Add each of the unique swizzled channels of the element.
746 * This will end up matching the size of the glsl_type of this field.
747 */
748 int last_swiz = -1;
749 for (unsigned int j = 0; j < 4; j++) {
750 int swiz = GET_SWZ(slots[i].swizzle, j);
751 last_swiz = swiz;
752
753 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
754 assert(this->uniforms < uniform_array_size);
755 if (swiz <= last_swiz)
756 this->uniform_vector_size[this->uniforms]++;
757 }
758 this->uniforms++;
759 }
760 }
761
762 dst_reg *
763 vec4_visitor::variable_storage(ir_variable *var)
764 {
765 return (dst_reg *)hash_table_find(this->variable_ht, var);
766 }
767
768 void
769 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
770 {
771 ir_expression *expr = ir->as_expression();
772
773 *predicate = BRW_PREDICATE_NORMAL;
774
775 if (expr) {
776 src_reg op[2];
777 vec4_instruction *inst;
778
779 assert(expr->get_num_operands() <= 2);
780 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
781 expr->operands[i]->accept(this);
782 op[i] = this->result;
783
784 resolve_ud_negate(&op[i]);
785 }
786
787 switch (expr->operation) {
788 case ir_unop_logic_not:
789 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
790 inst->conditional_mod = BRW_CONDITIONAL_Z;
791 break;
792
793 case ir_binop_logic_xor:
794 inst = emit(XOR(dst_null_d(), op[0], op[1]));
795 inst->conditional_mod = BRW_CONDITIONAL_NZ;
796 break;
797
798 case ir_binop_logic_or:
799 inst = emit(OR(dst_null_d(), op[0], op[1]));
800 inst->conditional_mod = BRW_CONDITIONAL_NZ;
801 break;
802
803 case ir_binop_logic_and:
804 inst = emit(AND(dst_null_d(), op[0], op[1]));
805 inst->conditional_mod = BRW_CONDITIONAL_NZ;
806 break;
807
808 case ir_unop_f2b:
809 if (brw->gen >= 6) {
810 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
811 } else {
812 inst = emit(MOV(dst_null_f(), op[0]));
813 inst->conditional_mod = BRW_CONDITIONAL_NZ;
814 }
815 break;
816
817 case ir_unop_i2b:
818 if (brw->gen >= 6) {
819 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
820 } else {
821 inst = emit(MOV(dst_null_d(), op[0]));
822 inst->conditional_mod = BRW_CONDITIONAL_NZ;
823 }
824 break;
825
826 case ir_binop_all_equal:
827 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
828 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
829 break;
830
831 case ir_binop_any_nequal:
832 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
833 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
834 break;
835
836 case ir_unop_any:
837 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
838 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
839 break;
840
841 case ir_binop_greater:
842 case ir_binop_gequal:
843 case ir_binop_less:
844 case ir_binop_lequal:
845 case ir_binop_equal:
846 case ir_binop_nequal:
847 emit(CMP(dst_null_d(), op[0], op[1],
848 brw_conditional_for_comparison(expr->operation)));
849 break;
850
851 default:
852 unreachable("not reached");
853 }
854 return;
855 }
856
857 ir->accept(this);
858
859 resolve_ud_negate(&this->result);
860
861 if (brw->gen >= 6) {
862 vec4_instruction *inst = emit(AND(dst_null_d(),
863 this->result, src_reg(1)));
864 inst->conditional_mod = BRW_CONDITIONAL_NZ;
865 } else {
866 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
867 inst->conditional_mod = BRW_CONDITIONAL_NZ;
868 }
869 }
870
871 /**
872 * Emit a gen6 IF statement with the comparison folded into the IF
873 * instruction.
874 */
875 void
876 vec4_visitor::emit_if_gen6(ir_if *ir)
877 {
878 ir_expression *expr = ir->condition->as_expression();
879
880 if (expr) {
881 src_reg op[2];
882 dst_reg temp;
883
884 assert(expr->get_num_operands() <= 2);
885 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
886 expr->operands[i]->accept(this);
887 op[i] = this->result;
888 }
889
890 switch (expr->operation) {
891 case ir_unop_logic_not:
892 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
893 return;
894
895 case ir_binop_logic_xor:
896 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
897 return;
898
899 case ir_binop_logic_or:
900 temp = dst_reg(this, glsl_type::bool_type);
901 emit(OR(temp, op[0], op[1]));
902 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
903 return;
904
905 case ir_binop_logic_and:
906 temp = dst_reg(this, glsl_type::bool_type);
907 emit(AND(temp, op[0], op[1]));
908 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
909 return;
910
911 case ir_unop_f2b:
912 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
913 return;
914
915 case ir_unop_i2b:
916 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
917 return;
918
919 case ir_binop_greater:
920 case ir_binop_gequal:
921 case ir_binop_less:
922 case ir_binop_lequal:
923 case ir_binop_equal:
924 case ir_binop_nequal:
925 emit(IF(op[0], op[1],
926 brw_conditional_for_comparison(expr->operation)));
927 return;
928
929 case ir_binop_all_equal:
930 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
931 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
932 return;
933
934 case ir_binop_any_nequal:
935 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
936 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
937 return;
938
939 case ir_unop_any:
940 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
941 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
942 return;
943
944 default:
945 unreachable("not reached");
946 }
947 return;
948 }
949
950 ir->condition->accept(this);
951
952 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
953 }
954
955 void
956 vec4_visitor::visit(ir_variable *ir)
957 {
958 dst_reg *reg = NULL;
959
960 if (variable_storage(ir))
961 return;
962
963 switch (ir->data.mode) {
964 case ir_var_shader_in:
965 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
966 break;
967
968 case ir_var_shader_out:
969 reg = new(mem_ctx) dst_reg(this, ir->type);
970
971 for (int i = 0; i < type_size(ir->type); i++) {
972 output_reg[ir->data.location + i] = *reg;
973 output_reg[ir->data.location + i].reg_offset = i;
974 output_reg[ir->data.location + i].type =
975 brw_type_for_base_type(ir->type->get_scalar_type());
976 output_reg_annotation[ir->data.location + i] = ir->name;
977 }
978 break;
979
980 case ir_var_auto:
981 case ir_var_temporary:
982 reg = new(mem_ctx) dst_reg(this, ir->type);
983 break;
984
985 case ir_var_uniform:
986 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
987
988 /* Thanks to the lower_ubo_reference pass, we will see only
989 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
990 * variables, so no need for them to be in variable_ht.
991 *
992 * Atomic counters take no uniform storage, no need to do
993 * anything here.
994 */
995 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
996 return;
997
998 /* Track how big the whole uniform variable is, in case we need to put a
999 * copy of its data into pull constants for array access.
1000 */
1001 assert(this->uniforms < uniform_array_size);
1002 this->uniform_size[this->uniforms] = type_size(ir->type);
1003
1004 if (!strncmp(ir->name, "gl_", 3)) {
1005 setup_builtin_uniform_values(ir);
1006 } else {
1007 setup_uniform_values(ir);
1008 }
1009 break;
1010
1011 case ir_var_system_value:
1012 reg = make_reg_for_system_value(ir);
1013 break;
1014
1015 default:
1016 unreachable("not reached");
1017 }
1018
1019 reg->type = brw_type_for_base_type(ir->type);
1020 hash_table_insert(this->variable_ht, reg, ir);
1021 }
1022
1023 void
1024 vec4_visitor::visit(ir_loop *ir)
1025 {
1026 /* We don't want debugging output to print the whole body of the
1027 * loop as the annotation.
1028 */
1029 this->base_ir = NULL;
1030
1031 emit(BRW_OPCODE_DO);
1032
1033 visit_instructions(&ir->body_instructions);
1034
1035 emit(BRW_OPCODE_WHILE);
1036 }
1037
1038 void
1039 vec4_visitor::visit(ir_loop_jump *ir)
1040 {
1041 switch (ir->mode) {
1042 case ir_loop_jump::jump_break:
1043 emit(BRW_OPCODE_BREAK);
1044 break;
1045 case ir_loop_jump::jump_continue:
1046 emit(BRW_OPCODE_CONTINUE);
1047 break;
1048 }
1049 }
1050
1051
1052 void
1053 vec4_visitor::visit(ir_function_signature *)
1054 {
1055 unreachable("not reached");
1056 }
1057
1058 void
1059 vec4_visitor::visit(ir_function *ir)
1060 {
1061 /* Ignore function bodies other than main() -- we shouldn't see calls to
1062 * them since they should all be inlined.
1063 */
1064 if (strcmp(ir->name, "main") == 0) {
1065 const ir_function_signature *sig;
1066 exec_list empty;
1067
1068 sig = ir->matching_signature(NULL, &empty);
1069
1070 assert(sig);
1071
1072 visit_instructions(&sig->body);
1073 }
1074 }
1075
1076 bool
1077 vec4_visitor::try_emit_sat(ir_expression *ir)
1078 {
1079 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1080 if (!sat_src)
1081 return false;
1082
1083 sat_src->accept(this);
1084 src_reg src = this->result;
1085
1086 this->result = src_reg(this, ir->type);
1087 vec4_instruction *inst;
1088 inst = emit(MOV(dst_reg(this->result), src));
1089 inst->saturate = true;
1090
1091 return true;
1092 }
1093
1094 bool
1095 vec4_visitor::try_emit_mad(ir_expression *ir)
1096 {
1097 /* 3-src instructions were introduced in gen6. */
1098 if (brw->gen < 6)
1099 return false;
1100
1101 /* MAD can only handle floating-point data. */
1102 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1103 return false;
1104
1105 ir_rvalue *nonmul = ir->operands[1];
1106 ir_expression *mul = ir->operands[0]->as_expression();
1107
1108 if (!mul || mul->operation != ir_binop_mul) {
1109 nonmul = ir->operands[0];
1110 mul = ir->operands[1]->as_expression();
1111
1112 if (!mul || mul->operation != ir_binop_mul)
1113 return false;
1114 }
1115
1116 nonmul->accept(this);
1117 src_reg src0 = fix_3src_operand(this->result);
1118
1119 mul->operands[0]->accept(this);
1120 src_reg src1 = fix_3src_operand(this->result);
1121
1122 mul->operands[1]->accept(this);
1123 src_reg src2 = fix_3src_operand(this->result);
1124
1125 this->result = src_reg(this, ir->type);
1126 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1127
1128 return true;
1129 }
1130
1131 bool
1132 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1133 {
1134 ir_expression *const cmp = ir->operands[0]->as_expression();
1135
1136 if (cmp == NULL)
1137 return false;
1138
1139 switch (cmp->operation) {
1140 case ir_binop_less:
1141 case ir_binop_greater:
1142 case ir_binop_lequal:
1143 case ir_binop_gequal:
1144 case ir_binop_equal:
1145 case ir_binop_nequal:
1146 break;
1147
1148 default:
1149 return false;
1150 }
1151
1152 cmp->operands[0]->accept(this);
1153 const src_reg cmp_src0 = this->result;
1154
1155 cmp->operands[1]->accept(this);
1156 const src_reg cmp_src1 = this->result;
1157
1158 this->result = src_reg(this, ir->type);
1159
1160 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1161 brw_conditional_for_comparison(cmp->operation)));
1162
1163 /* If the comparison is false, this->result will just happen to be zero.
1164 */
1165 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1166 this->result, src_reg(1.0f));
1167 inst->predicate = BRW_PREDICATE_NORMAL;
1168 inst->predicate_inverse = true;
1169
1170 return true;
1171 }
1172
1173 void
1174 vec4_visitor::emit_bool_comparison(unsigned int op,
1175 dst_reg dst, src_reg src0, src_reg src1)
1176 {
1177 /* original gen4 does destination conversion before comparison. */
1178 if (brw->gen < 5)
1179 dst.type = src0.type;
1180
1181 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1182
1183 dst.type = BRW_REGISTER_TYPE_D;
1184 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1185 }
1186
1187 void
1188 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1189 src_reg src0, src_reg src1)
1190 {
1191 vec4_instruction *inst;
1192
1193 if (brw->gen >= 6) {
1194 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1195 inst->conditional_mod = conditionalmod;
1196 } else {
1197 emit(CMP(dst, src0, src1, conditionalmod));
1198
1199 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1200 inst->predicate = BRW_PREDICATE_NORMAL;
1201 }
1202 }
1203
1204 void
1205 vec4_visitor::emit_lrp(const dst_reg &dst,
1206 const src_reg &x, const src_reg &y, const src_reg &a)
1207 {
1208 if (brw->gen >= 6) {
1209 /* Note that the instruction's argument order is reversed from GLSL
1210 * and the IR.
1211 */
1212 emit(LRP(dst,
1213 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1214 } else {
1215 /* Earlier generations don't support three source operations, so we
1216 * need to emit x*(1-a) + y*a.
1217 */
1218 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1219 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1220 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1221 y_times_a.writemask = dst.writemask;
1222 one_minus_a.writemask = dst.writemask;
1223 x_times_one_minus_a.writemask = dst.writemask;
1224
1225 emit(MUL(y_times_a, y, a));
1226 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1227 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1228 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1229 }
1230 }
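
/* Sanity check of the pre-gen6 expansion above (illustration only): for
 * x = 2.0, y = 10.0, a = 0.25 the emitted sequence computes
 *
 *    y_times_a           = 10.0 * 0.25  = 2.5
 *    one_minus_a         = -0.25 + 1.0  = 0.75
 *    x_times_one_minus_a = 2.0 * 0.75   = 1.5
 *    dst                 = 1.5 + 2.5    = 4.0
 *
 * which equals mix(2.0, 10.0, 0.25), the value the gen6+ LRP instruction
 * computes directly.
 */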
1231
1232 void
1233 vec4_visitor::visit(ir_expression *ir)
1234 {
1235 unsigned int operand;
1236 src_reg op[Elements(ir->operands)];
1237 src_reg result_src;
1238 dst_reg result_dst;
1239 vec4_instruction *inst;
1240
1241 if (try_emit_sat(ir))
1242 return;
1243
1244 if (ir->operation == ir_binop_add) {
1245 if (try_emit_mad(ir))
1246 return;
1247 }
1248
1249 if (ir->operation == ir_unop_b2f) {
1250 if (try_emit_b2f_of_compare(ir))
1251 return;
1252 }
1253
1254 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1255 this->result.file = BAD_FILE;
1256 ir->operands[operand]->accept(this);
1257 if (this->result.file == BAD_FILE) {
1258 fprintf(stderr, "Failed to get tree for expression operand:\n");
1259 ir->operands[operand]->fprint(stderr);
1260 exit(1);
1261 }
1262 op[operand] = this->result;
1263
1264 /* Matrix expression operands should have been broken down to vector
1265 * operations already.
1266 */
1267 assert(!ir->operands[operand]->type->is_matrix());
1268 }
1269
1270 int vector_elements = ir->operands[0]->type->vector_elements;
1271 if (ir->operands[1]) {
1272 vector_elements = MAX2(vector_elements,
1273 ir->operands[1]->type->vector_elements);
1274 }
1275
1276 this->result.file = BAD_FILE;
1277
1278 /* Storage for our result. Ideally for an assignment we'd be using
1279 * the actual storage for the result here, instead.
1280 */
1281 result_src = src_reg(this, ir->type);
1282 /* convenience for the emit functions below. */
1283 result_dst = dst_reg(result_src);
1284 /* If nothing special happens, this is the result. */
1285 this->result = result_src;
1286 /* Limit writes to the channels that will be used by result_src later.
1287 * This does limit this temp's use as a temporary for multi-instruction
1288 * sequences.
1289 */
1290 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1291
1292 switch (ir->operation) {
1293 case ir_unop_logic_not:
1294 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1295 * the one's complement of the whole register, not just bit 0.
1296 */
1297 emit(XOR(result_dst, op[0], src_reg(1)));
1298 break;
1299 case ir_unop_neg:
1300 op[0].negate = !op[0].negate;
1301 emit(MOV(result_dst, op[0]));
1302 break;
1303 case ir_unop_abs:
1304 op[0].abs = true;
1305 op[0].negate = false;
1306 emit(MOV(result_dst, op[0]));
1307 break;
1308
1309 case ir_unop_sign:
1310 if (ir->type->is_float()) {
1311 /* AND(val, 0x80000000) gives the sign bit.
1312 *
1313 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1314 * zero.
1315 */
1316 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1317
1318 op[0].type = BRW_REGISTER_TYPE_UD;
1319 result_dst.type = BRW_REGISTER_TYPE_UD;
1320 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1321
1322 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1323 inst->predicate = BRW_PREDICATE_NORMAL;
1324
1325 this->result.type = BRW_REGISTER_TYPE_F;
1326 } else {
1327 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1328 * -> non-negative val generates 0x00000000.
1329 * Predicated OR sets 1 if val is positive.
1330 */
1331 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1332
1333 emit(ASR(result_dst, op[0], src_reg(31)));
1334
1335 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1336 inst->predicate = BRW_PREDICATE_NORMAL;
1337 }
1338 break;
1339
1340 case ir_unop_rcp:
1341 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1342 break;
1343
1344 case ir_unop_exp2:
1345 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1346 break;
1347 case ir_unop_log2:
1348 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1349 break;
1350 case ir_unop_exp:
1351 case ir_unop_log:
1352 unreachable("not reached: should be handled by ir_explog_to_explog2");
1353 case ir_unop_sin:
1354 case ir_unop_sin_reduced:
1355 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1356 break;
1357 case ir_unop_cos:
1358 case ir_unop_cos_reduced:
1359 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1360 break;
1361
1362 case ir_unop_dFdx:
1363 case ir_unop_dFdy:
1364 unreachable("derivatives not valid in vertex shader");
1365
1366 case ir_unop_bitfield_reverse:
1367 emit(BFREV(result_dst, op[0]));
1368 break;
1369 case ir_unop_bit_count:
1370 emit(CBIT(result_dst, op[0]));
1371 break;
1372 case ir_unop_find_msb: {
1373 src_reg temp = src_reg(this, glsl_type::uint_type);
1374
1375 inst = emit(FBH(dst_reg(temp), op[0]));
1376 inst->dst.writemask = WRITEMASK_XYZW;
1377
1378 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1379 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1380 * subtract the result from 31 to convert the MSB count into an LSB count.
1381 */
1382
1383 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1384 temp.swizzle = BRW_SWIZZLE_NOOP;
1385 emit(MOV(result_dst, temp));
1386
1387 src_reg src_tmp = src_reg(result_dst);
1388 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1389
1390 src_tmp.negate = true;
1391 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1392 inst->predicate = BRW_PREDICATE_NORMAL;
1393 break;
1394 }
1395 case ir_unop_find_lsb:
1396 emit(FBL(result_dst, op[0]));
1397 break;
1398
1399 case ir_unop_noise:
1400 unreachable("not reached: should be handled by lower_noise");
1401
1402 case ir_binop_add:
1403 emit(ADD(result_dst, op[0], op[1]));
1404 break;
1405 case ir_binop_sub:
1406 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1407
1408 case ir_binop_mul:
1409 if (brw->gen < 8 && ir->type->is_integer()) {
1410 /* For integer multiplication, the MUL uses the low 16 bits of one of
1411 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1412 * accumulates in the contribution of the upper 16 bits of that
1413 * operand. If we can determine that one of the args is in the low
1414 * 16 bits, though, we can just emit a single MUL.
1415 */
1416 if (ir->operands[0]->is_uint16_constant()) {
1417 if (brw->gen < 7)
1418 emit(MUL(result_dst, op[0], op[1]));
1419 else
1420 emit(MUL(result_dst, op[1], op[0]));
1421 } else if (ir->operands[1]->is_uint16_constant()) {
1422 if (brw->gen < 7)
1423 emit(MUL(result_dst, op[1], op[0]));
1424 else
1425 emit(MUL(result_dst, op[0], op[1]));
1426 } else {
1427 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1428
1429 emit(MUL(acc, op[0], op[1]));
1430 emit(MACH(dst_null_d(), op[0], op[1]));
1431 emit(MOV(result_dst, src_reg(acc)));
1432 }
1433 } else {
1434 emit(MUL(result_dst, op[0], op[1]));
1435 }
1436 break;
1437 case ir_binop_imul_high: {
1438 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1439
1440 emit(MUL(acc, op[0], op[1]));
1441 emit(MACH(result_dst, op[0], op[1]));
1442 break;
1443 }
1444 case ir_binop_div:
1445 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1446 assert(ir->type->is_integer());
1447 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1448 break;
1449 case ir_binop_carry: {
1450 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1451
1452 emit(ADDC(dst_null_ud(), op[0], op[1]));
1453 emit(MOV(result_dst, src_reg(acc)));
1454 break;
1455 }
1456 case ir_binop_borrow: {
1457 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1458
1459 emit(SUBB(dst_null_ud(), op[0], op[1]));
1460 emit(MOV(result_dst, src_reg(acc)));
1461 break;
1462 }
1463 case ir_binop_mod:
1464 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1465 assert(ir->type->is_integer());
1466 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1467 break;
1468
1469 case ir_binop_less:
1470 case ir_binop_greater:
1471 case ir_binop_lequal:
1472 case ir_binop_gequal:
1473 case ir_binop_equal:
1474 case ir_binop_nequal: {
1475 emit(CMP(result_dst, op[0], op[1],
1476 brw_conditional_for_comparison(ir->operation)));
1477 emit(AND(result_dst, result_src, src_reg(0x1)));
1478 break;
1479 }
1480
1481 case ir_binop_all_equal:
1482 /* "==" operator producing a scalar boolean. */
1483 if (ir->operands[0]->type->is_vector() ||
1484 ir->operands[1]->type->is_vector()) {
1485 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1486 emit(MOV(result_dst, src_reg(0)));
1487 inst = emit(MOV(result_dst, src_reg(1)));
1488 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1489 } else {
1490 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1491 emit(AND(result_dst, result_src, src_reg(0x1)));
1492 }
1493 break;
1494 case ir_binop_any_nequal:
1495 /* "!=" operator producing a scalar boolean. */
1496 if (ir->operands[0]->type->is_vector() ||
1497 ir->operands[1]->type->is_vector()) {
1498 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1499
1500 emit(MOV(result_dst, src_reg(0)));
1501 inst = emit(MOV(result_dst, src_reg(1)));
1502 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1503 } else {
1504 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1505 emit(AND(result_dst, result_src, src_reg(0x1)));
1506 }
1507 break;
1508
1509 case ir_unop_any:
1510 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1511 emit(MOV(result_dst, src_reg(0)));
1512
1513 inst = emit(MOV(result_dst, src_reg(1)));
1514 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1515 break;
1516
1517 case ir_binop_logic_xor:
1518 emit(XOR(result_dst, op[0], op[1]));
1519 break;
1520
1521 case ir_binop_logic_or:
1522 emit(OR(result_dst, op[0], op[1]));
1523 break;
1524
1525 case ir_binop_logic_and:
1526 emit(AND(result_dst, op[0], op[1]));
1527 break;
1528
1529 case ir_binop_dot:
1530 assert(ir->operands[0]->type->is_vector());
1531 assert(ir->operands[0]->type == ir->operands[1]->type);
1532 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1533 break;
1534
1535 case ir_unop_sqrt:
1536 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1537 break;
1538 case ir_unop_rsq:
1539 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1540 break;
1541
1542 case ir_unop_bitcast_i2f:
1543 case ir_unop_bitcast_u2f:
1544 this->result = op[0];
1545 this->result.type = BRW_REGISTER_TYPE_F;
1546 break;
1547
1548 case ir_unop_bitcast_f2i:
1549 this->result = op[0];
1550 this->result.type = BRW_REGISTER_TYPE_D;
1551 break;
1552
1553 case ir_unop_bitcast_f2u:
1554 this->result = op[0];
1555 this->result.type = BRW_REGISTER_TYPE_UD;
1556 break;
1557
1558 case ir_unop_i2f:
1559 case ir_unop_i2u:
1560 case ir_unop_u2i:
1561 case ir_unop_u2f:
1562 case ir_unop_b2f:
1563 case ir_unop_b2i:
1564 case ir_unop_f2i:
1565 case ir_unop_f2u:
1566 emit(MOV(result_dst, op[0]));
1567 break;
1568 case ir_unop_f2b:
1569 case ir_unop_i2b: {
1570 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1571 emit(AND(result_dst, result_src, src_reg(1)));
1572 break;
1573 }
1574
1575 case ir_unop_trunc:
1576 emit(RNDZ(result_dst, op[0]));
1577 break;
1578 case ir_unop_ceil:
1579 op[0].negate = !op[0].negate;
1580 inst = emit(RNDD(result_dst, op[0]));
1581 this->result.negate = true;
1582 break;
1583 case ir_unop_floor:
1584 inst = emit(RNDD(result_dst, op[0]));
1585 break;
1586 case ir_unop_fract:
1587 inst = emit(FRC(result_dst, op[0]));
1588 break;
1589 case ir_unop_round_even:
1590 emit(RNDE(result_dst, op[0]));
1591 break;
1592
1593 case ir_binop_min:
1594 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1595 break;
1596 case ir_binop_max:
1597 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1598 break;
1599
1600 case ir_binop_pow:
1601 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1602 break;
1603
1604 case ir_unop_bit_not:
1605 inst = emit(NOT(result_dst, op[0]));
1606 break;
1607 case ir_binop_bit_and:
1608 inst = emit(AND(result_dst, op[0], op[1]));
1609 break;
1610 case ir_binop_bit_xor:
1611 inst = emit(XOR(result_dst, op[0], op[1]));
1612 break;
1613 case ir_binop_bit_or:
1614 inst = emit(OR(result_dst, op[0], op[1]));
1615 break;
1616
1617 case ir_binop_lshift:
1618 inst = emit(SHL(result_dst, op[0], op[1]));
1619 break;
1620
1621 case ir_binop_rshift:
1622 if (ir->type->base_type == GLSL_TYPE_INT)
1623 inst = emit(ASR(result_dst, op[0], op[1]));
1624 else
1625 inst = emit(SHR(result_dst, op[0], op[1]));
1626 break;
1627
1628 case ir_binop_bfm:
1629 emit(BFI1(result_dst, op[0], op[1]));
1630 break;
1631
1632 case ir_binop_ubo_load: {
1633 ir_constant *uniform_block = ir->operands[0]->as_constant();
1634 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1635 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1636 src_reg offset;
1637
1638 /* Now, load the vector from that offset. */
1639 assert(ir->type->is_vector() || ir->type->is_scalar());
1640
1641 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1642 packed_consts.type = result.type;
1643 src_reg surf_index =
1644 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1645 if (const_offset_ir) {
1646 if (brw->gen >= 8) {
1647 /* Store the offset in a GRF so we can send-from-GRF. */
1648 offset = src_reg(this, glsl_type::int_type);
1649 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1650 } else {
1651 /* Immediates are fine on older generations since they'll be moved
1652 * to a (potentially fake) MRF at the generator level.
1653 */
1654 offset = src_reg(const_offset / 16);
1655 }
1656 } else {
1657 offset = src_reg(this, glsl_type::uint_type);
1658 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1659 }
1660
1661 if (brw->gen >= 7) {
1662 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1663 grf_offset.type = offset.type;
1664
1665 emit(MOV(grf_offset, offset));
1666
1667 emit(new(mem_ctx) vec4_instruction(this,
1668 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1669 dst_reg(packed_consts),
1670 surf_index,
1671 src_reg(grf_offset)));
1672 } else {
1673 vec4_instruction *pull =
1674 emit(new(mem_ctx) vec4_instruction(this,
1675 VS_OPCODE_PULL_CONSTANT_LOAD,
1676 dst_reg(packed_consts),
1677 surf_index,
1678 offset));
1679 pull->base_mrf = 14;
1680 pull->mlen = 1;
1681 }
1682
1683 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1684 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1685 const_offset % 16 / 4,
1686 const_offset % 16 / 4,
1687 const_offset % 16 / 4);
1688
1689 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1690 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1691 emit(CMP(result_dst, packed_consts, src_reg(0u),
1692 BRW_CONDITIONAL_NZ));
1693 emit(AND(result_dst, result, src_reg(0x1)));
1694 } else {
1695 emit(MOV(result_dst, packed_consts));
1696 }
1697 break;
1698 }
1699
1700 case ir_binop_vector_extract:
1701 unreachable("should have been lowered by vec_index_to_cond_assign");
1702
1703 case ir_triop_fma:
1704 op[0] = fix_3src_operand(op[0]);
1705 op[1] = fix_3src_operand(op[1]);
1706 op[2] = fix_3src_operand(op[2]);
1707 /* Note that the instruction's argument order is reversed from GLSL
1708 * and the IR.
1709 */
1710 emit(MAD(result_dst, op[2], op[1], op[0]));
1711 break;
1712
1713 case ir_triop_lrp:
1714 emit_lrp(result_dst, op[0], op[1], op[2]);
1715 break;
1716
1717 case ir_triop_csel:
1718 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1719 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1720 inst->predicate = BRW_PREDICATE_NORMAL;
1721 break;
1722
1723 case ir_triop_bfi:
1724 op[0] = fix_3src_operand(op[0]);
1725 op[1] = fix_3src_operand(op[1]);
1726 op[2] = fix_3src_operand(op[2]);
1727 emit(BFI2(result_dst, op[0], op[1], op[2]));
1728 break;
1729
1730 case ir_triop_bitfield_extract:
1731 op[0] = fix_3src_operand(op[0]);
1732 op[1] = fix_3src_operand(op[1]);
1733 op[2] = fix_3src_operand(op[2]);
1734 /* Note that the instruction's argument order is reversed from GLSL
1735 * and the IR.
1736 */
1737 emit(BFE(result_dst, op[2], op[1], op[0]));
1738 break;
1739
1740 case ir_triop_vector_insert:
1741 unreachable("should have been lowered by lower_vector_insert");
1742
1743 case ir_quadop_bitfield_insert:
1744 unreachable("not reached: should be handled by "
1745 "bitfield_insert_to_bfm_bfi\n");
1746
1747 case ir_quadop_vector:
1748 unreachable("not reached: should be handled by lower_quadop_vector");
1749
1750 case ir_unop_pack_half_2x16:
1751 emit_pack_half_2x16(result_dst, op[0]);
1752 break;
1753 case ir_unop_unpack_half_2x16:
1754 emit_unpack_half_2x16(result_dst, op[0]);
1755 break;
1756 case ir_unop_pack_snorm_2x16:
1757 case ir_unop_pack_snorm_4x8:
1758 case ir_unop_pack_unorm_2x16:
1759 case ir_unop_pack_unorm_4x8:
1760 case ir_unop_unpack_snorm_2x16:
1761 case ir_unop_unpack_snorm_4x8:
1762 case ir_unop_unpack_unorm_2x16:
1763 case ir_unop_unpack_unorm_4x8:
1764 unreachable("not reached: should be handled by lower_packing_builtins");
1765 case ir_unop_unpack_half_2x16_split_x:
1766 case ir_unop_unpack_half_2x16_split_y:
1767 case ir_binop_pack_half_2x16_split:
1768 unreachable("not reached: should not occur in vertex shader");
1769 case ir_binop_ldexp:
1770 unreachable("not reached: should be handled by ldexp_to_arith()");
1771 }
1772 }
1773
1774
1775 void
1776 vec4_visitor::visit(ir_swizzle *ir)
1777 {
1778 src_reg src;
1779 int i = 0;
1780 int swizzle[4];
1781
1782 /* Note that this is only swizzles in expressions, not those on the left
1783 * hand side of an assignment, which do write masking. See ir_assignment
1784 * for that.
1785 */
1786
1787 ir->val->accept(this);
1788 src = this->result;
1789 assert(src.file != BAD_FILE);
1790
1791 for (i = 0; i < ir->type->vector_elements; i++) {
1792 switch (i) {
1793 case 0:
1794 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1795 break;
1796 case 1:
1797 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1798 break;
1799 case 2:
1800 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1801 break;
1802 case 3:
1803 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1804 break;
1805 }
1806 }
1807 for (; i < 4; i++) {
1808 /* Replicate the last channel out. */
1809 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1810 }
1811
1812 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1813
1814 this->result = src;
1815 }
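
/* Example (illustration only): for "v.zy" where v resolved with the identity
 * swizzle, the loop above picks channels (z, y) and then replicates the last
 * channel, so the result carries swizzle (z, y, y, y).
 */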
1816
1817 void
1818 vec4_visitor::visit(ir_dereference_variable *ir)
1819 {
1820 const struct glsl_type *type = ir->type;
1821 dst_reg *reg = variable_storage(ir->var);
1822
1823 if (!reg) {
1824 fail("Failed to find variable storage for %s\n", ir->var->name);
1825 this->result = src_reg(brw_null_reg());
1826 return;
1827 }
1828
1829 this->result = src_reg(*reg);
1830
1831 /* System values get their swizzle from the dst_reg writemask */
1832 if (ir->var->data.mode == ir_var_system_value)
1833 return;
1834
1835 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1836 this->result.swizzle = swizzle_for_size(type->vector_elements);
1837 }
1838
1839
1840 int
1841 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1842 {
1843 /* Under normal circumstances array elements are stored consecutively, so
1844 * the stride is equal to the size of the array element.
1845 */
1846 return type_size(ir->type);
1847 }
1848
1849
1850 void
1851 vec4_visitor::visit(ir_dereference_array *ir)
1852 {
1853 ir_constant *constant_index;
1854 src_reg src;
1855 int array_stride = compute_array_stride(ir);
1856
1857 constant_index = ir->array_index->constant_expression_value();
1858
1859 ir->array->accept(this);
1860 src = this->result;
1861
1862 if (constant_index) {
1863 src.reg_offset += constant_index->value.i[0] * array_stride;
1864 } else {
1865 /* Variable index array dereference. It eats the "vec4" of the
1866 * base of the array and an index that offsets the Mesa register
1867 * index.
1868 */
1869 ir->array_index->accept(this);
1870
1871 src_reg index_reg;
1872
1873 if (array_stride == 1) {
1874 index_reg = this->result;
1875 } else {
1876 index_reg = src_reg(this, glsl_type::int_type);
1877
1878 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1879 }
1880
1881 if (src.reladdr) {
1882 src_reg temp = src_reg(this, glsl_type::int_type);
1883
1884 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1885
1886 index_reg = temp;
1887 }
1888
1889 src.reladdr = ralloc(mem_ctx, src_reg);
1890 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1891 }
1892
1893 /* If the type is smaller than a vec4, replicate the last channel out. */
1894 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1895 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1896 else
1897 src.swizzle = BRW_SWIZZLE_NOOP;
1898 src.type = brw_type_for_base_type(ir->type);
1899
1900 this->result = src;
1901 }
1902
1903 void
1904 vec4_visitor::visit(ir_dereference_record *ir)
1905 {
1906 unsigned int i;
1907 const glsl_type *struct_type = ir->record->type;
1908 int offset = 0;
1909
1910 ir->record->accept(this);
1911
1912 for (i = 0; i < struct_type->length; i++) {
1913 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1914 break;
1915 offset += type_size(struct_type->fields.structure[i].type);
1916 }
1917
1918 /* If the type is smaller than a vec4, replicate the last channel out. */
1919 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1920 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1921 else
1922 this->result.swizzle = BRW_SWIZZLE_NOOP;
1923 this->result.type = brw_type_for_base_type(ir->type);
1924
1925 this->result.reg_offset += offset;
1926 }
1927
1928 /**
1929 * We want to be careful in assignment setup to hit the actual storage
1930 * instead of potentially using a temporary like we might with the
1931 * ir_dereference handler.
1932 */
1933 static dst_reg
1934 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1935 {
1936 /* The LHS must be a dereference. If the LHS is a variable indexed array
1937     * access of a vector, it must be separated into a series of conditional moves
1938 * before reaching this point (see ir_vec_index_to_cond_assign).
1939 */
1940 assert(ir->as_dereference());
1941 ir_dereference_array *deref_array = ir->as_dereference_array();
1942 if (deref_array) {
1943 assert(!deref_array->array->type->is_vector());
1944 }
1945
1946 /* Use the rvalue deref handler for the most part. We'll ignore
1947 * swizzles in it and write swizzles using writemask, though.
1948 */
1949 ir->accept(v);
1950 return dst_reg(v->result);
1951 }
1952
1953 void
1954 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1955 const struct glsl_type *type, uint32_t predicate)
1956 {
1957 if (type->base_type == GLSL_TYPE_STRUCT) {
1958 for (unsigned int i = 0; i < type->length; i++) {
1959 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1960 }
1961 return;
1962 }
1963
1964 if (type->is_array()) {
1965 for (unsigned int i = 0; i < type->length; i++) {
1966 emit_block_move(dst, src, type->fields.array, predicate);
1967 }
1968 return;
1969 }
1970
1971 if (type->is_matrix()) {
1972 const struct glsl_type *vec_type;
1973
1974 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1975 type->vector_elements, 1);
1976
1977 for (int i = 0; i < type->matrix_columns; i++) {
1978 emit_block_move(dst, src, vec_type, predicate);
1979 }
1980 return;
1981 }
1982
1983 assert(type->is_scalar() || type->is_vector());
1984
1985 dst->type = brw_type_for_base_type(type);
1986 src->type = dst->type;
1987
1988 dst->writemask = (1 << type->vector_elements) - 1;
1989
1990 src->swizzle = swizzle_for_size(type->vector_elements);
1991
1992 vec4_instruction *inst = emit(MOV(*dst, *src));
1993 inst->predicate = predicate;
1994
1995 dst->reg_offset++;
1996 src->reg_offset++;
1997 }
1998
1999
2000 /* If the RHS processing resulted in an instruction generating a
2001 * temporary value, and it would be easy to rewrite the instruction to
2002 * generate its result right into the LHS instead, do so. This ends
2003 * up reliably removing instructions where it can be tricky to do so
2004 * later without real UD chain information.
2005 */
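/* For example, for an assignment like `v = a + b;` (names illustrative) the
 * ADD that computed the RHS into a temporary GRF is retargeted to write `v`
 * directly, and visit(ir_assignment) then skips emitting the copy MOV.
 */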
2006 bool
2007 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2008 dst_reg dst,
2009 src_reg src,
2010 vec4_instruction *pre_rhs_inst,
2011 vec4_instruction *last_rhs_inst)
2012 {
2013 /* This could be supported, but it would take more smarts. */
2014 if (ir->condition)
2015 return false;
2016
2017 if (pre_rhs_inst == last_rhs_inst)
2018 return false; /* No instructions generated to work with. */
2019
2020 /* Make sure the last instruction generated our source reg. */
2021 if (src.file != GRF ||
2022 src.file != last_rhs_inst->dst.file ||
2023 src.reg != last_rhs_inst->dst.reg ||
2024 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2025 src.reladdr ||
2026 src.abs ||
2027 src.negate ||
2028 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2029 return false;
2030
2031    /* Check that the last instruction fully initialized the channels
2032 * we want to use, in the order we want to use them. We could
2033 * potentially reswizzle the operands of many instructions so that
2034 * we could handle out of order channels, but don't yet.
2035 */
2036
2037 for (unsigned i = 0; i < 4; i++) {
2038 if (dst.writemask & (1 << i)) {
2039 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2040 return false;
2041
2042 if (BRW_GET_SWZ(src.swizzle, i) != i)
2043 return false;
2044 }
2045 }
2046
2047 /* Success! Rewrite the instruction. */
2048 last_rhs_inst->dst.file = dst.file;
2049 last_rhs_inst->dst.reg = dst.reg;
2050 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2051 last_rhs_inst->dst.reladdr = dst.reladdr;
2052 last_rhs_inst->dst.writemask &= dst.writemask;
2053
2054 return true;
2055 }
2056
2057 void
2058 vec4_visitor::visit(ir_assignment *ir)
2059 {
2060 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2061 uint32_t predicate = BRW_PREDICATE_NONE;
2062
2063 if (!ir->lhs->type->is_scalar() &&
2064 !ir->lhs->type->is_vector()) {
2065 ir->rhs->accept(this);
2066 src_reg src = this->result;
2067
2068 if (ir->condition) {
2069 emit_bool_to_cond_code(ir->condition, &predicate);
2070 }
2071
2072 /* emit_block_move doesn't account for swizzles in the source register.
2073 * This should be ok, since the source register is a structure or an
2074 * array, and those can't be swizzled. But double-check to be sure.
2075 */
2076 assert(src.swizzle ==
2077 (ir->rhs->type->is_matrix()
2078 ? swizzle_for_size(ir->rhs->type->vector_elements)
2079 : BRW_SWIZZLE_NOOP));
2080
2081 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2082 return;
2083 }
2084
2085 /* Now we're down to just a scalar/vector with writemasks. */
2086 int i;
2087
2088 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2089 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2090
2091 ir->rhs->accept(this);
2092
2093 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2094
2095 src_reg src = this->result;
2096
2097 int swizzles[4];
2098 int first_enabled_chan = 0;
2099 int src_chan = 0;
2100
2101 assert(ir->lhs->type->is_vector() ||
2102 ir->lhs->type->is_scalar());
2103 dst.writemask = ir->write_mask;
2104
2105 for (int i = 0; i < 4; i++) {
2106 if (dst.writemask & (1 << i)) {
2107 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2108 break;
2109 }
2110 }
2111
2112 /* Swizzle a small RHS vector into the channels being written.
2113 *
2114 * glsl ir treats write_mask as dictating how many channels are
2115 * present on the RHS while in our instructions we need to make
2116 * those channels appear in the slots of the vec4 they're written to.
2117 */
2118 for (int i = 0; i < 4; i++) {
2119 if (dst.writemask & (1 << i))
2120 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2121 else
2122 swizzles[i] = first_enabled_chan;
2123 }
2124 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2125 swizzles[2], swizzles[3]);
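   /* Worked example (illustrative): writing a vec2 RHS into the .zw channels
    * of a vec4 LHS starts from the RHS swizzle .xyyy and yields .yyxy here,
    * so .z reads the RHS .x, .w reads the RHS .y, and the unwritten channels
    * simply repeat a live channel.
    */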
2126
2127 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2128 return;
2129 }
2130
2131 if (ir->condition) {
2132 emit_bool_to_cond_code(ir->condition, &predicate);
2133 }
2134
2135 for (i = 0; i < type_size(ir->lhs->type); i++) {
2136 vec4_instruction *inst = emit(MOV(dst, src));
2137 inst->predicate = predicate;
2138
2139 dst.reg_offset++;
2140 src.reg_offset++;
2141 }
2142 }
2143
2144 void
2145 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2146 {
2147 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2148 foreach_in_list(ir_constant, field_value, &ir->components) {
2149 emit_constant_values(dst, field_value);
2150 }
2151 return;
2152 }
2153
2154 if (ir->type->is_array()) {
2155 for (unsigned int i = 0; i < ir->type->length; i++) {
2156 emit_constant_values(dst, ir->array_elements[i]);
2157 }
2158 return;
2159 }
2160
2161 if (ir->type->is_matrix()) {
2162 for (int i = 0; i < ir->type->matrix_columns; i++) {
2163 float *vec = &ir->value.f[i * ir->type->vector_elements];
2164
2165 for (int j = 0; j < ir->type->vector_elements; j++) {
2166 dst->writemask = 1 << j;
2167 dst->type = BRW_REGISTER_TYPE_F;
2168
2169 emit(MOV(*dst, src_reg(vec[j])));
2170 }
2171 dst->reg_offset++;
2172 }
2173 return;
2174 }
2175
2176 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2177
2178 for (int i = 0; i < ir->type->vector_elements; i++) {
2179 if (!(remaining_writemask & (1 << i)))
2180 continue;
2181
2182 dst->writemask = 1 << i;
2183 dst->type = brw_type_for_base_type(ir->type);
2184
2185 /* Find other components that match the one we're about to
2186 * write. Emits fewer instructions for things like vec4(0.5,
2187 * 1.5, 1.5, 1.5).
2188 */
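      /* For that example this emits two MOVs in total instead of four: one
       * with writemask .x for 0.5 and one with writemask .yzw for 1.5.
       */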
2189 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2190 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2191 if (ir->value.b[i] == ir->value.b[j])
2192 dst->writemask |= (1 << j);
2193 } else {
2194 /* u, i, and f storage all line up, so no need for a
2195 * switch case for comparing each type.
2196 */
2197 if (ir->value.u[i] == ir->value.u[j])
2198 dst->writemask |= (1 << j);
2199 }
2200 }
2201
2202 switch (ir->type->base_type) {
2203 case GLSL_TYPE_FLOAT:
2204 emit(MOV(*dst, src_reg(ir->value.f[i])));
2205 break;
2206 case GLSL_TYPE_INT:
2207 emit(MOV(*dst, src_reg(ir->value.i[i])));
2208 break;
2209 case GLSL_TYPE_UINT:
2210 emit(MOV(*dst, src_reg(ir->value.u[i])));
2211 break;
2212 case GLSL_TYPE_BOOL:
2213 emit(MOV(*dst, src_reg(ir->value.b[i])));
2214 break;
2215 default:
2216 unreachable("Non-float/uint/int/bool constant");
2217 }
2218
2219 remaining_writemask &= ~dst->writemask;
2220 }
2221 dst->reg_offset++;
2222 }
2223
2224 void
2225 vec4_visitor::visit(ir_constant *ir)
2226 {
2227 dst_reg dst = dst_reg(this, ir->type);
2228 this->result = src_reg(dst);
2229
2230 emit_constant_values(&dst, ir);
2231 }
2232
2233 void
2234 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2235 {
2236 ir_dereference *deref = static_cast<ir_dereference *>(
2237 ir->actual_parameters.get_head());
2238 ir_variable *location = deref->variable_referenced();
2239 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2240 location->data.atomic.buffer_index);
2241
2242 /* Calculate the surface offset */
2243 src_reg offset(this, glsl_type::uint_type);
2244 ir_dereference_array *deref_array = deref->as_dereference_array();
2245 if (deref_array) {
2246 deref_array->array_index->accept(this);
2247
2248 src_reg tmp(this, glsl_type::uint_type);
2249 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2250 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2251 } else {
2252 offset = location->data.atomic.offset;
2253 }
2254
2255 /* Emit the appropriate machine instruction */
2256 const char *callee = ir->callee->function_name();
2257 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2258
2259 if (!strcmp("__intrinsic_atomic_read", callee)) {
2260 emit_untyped_surface_read(surf_index, dst, offset);
2261
2262 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2263 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2264 src_reg(), src_reg());
2265
2266 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2267 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2268 src_reg(), src_reg());
2269 }
2270 }
2271
2272 void
2273 vec4_visitor::visit(ir_call *ir)
2274 {
2275 const char *callee = ir->callee->function_name();
2276
2277 if (!strcmp("__intrinsic_atomic_read", callee) ||
2278 !strcmp("__intrinsic_atomic_increment", callee) ||
2279 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2280 visit_atomic_counter_intrinsic(ir);
2281 } else {
2282 unreachable("Unsupported intrinsic.");
2283 }
2284 }
2285
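/**
 * Fetch the multisample control surface (MCS) data for the given coordinate
 * with a TXF_MCS message.  The result is later packed into the .y channel of
 * the second parameter vec4 of the compressed-multisample texel fetch payload
 * (see the ir_txf_ms handling in visit(ir_texture)).
 */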
2286 src_reg
2287 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2288 {
2289 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2290 inst->base_mrf = 2;
2291 inst->mlen = 1;
2292 inst->sampler = sampler;
2293 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2294 inst->dst.writemask = WRITEMASK_XYZW;
2295
2296 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2297 int param_base = inst->base_mrf;
2298 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2299 int zero_mask = 0xf & ~coord_mask;
2300
2301 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2302 coordinate));
2303
2304 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2305 src_reg(0)));
2306
2307 emit(inst);
2308 return src_reg(inst->dst);
2309 }
2310
2311 void
2312 vec4_visitor::visit(ir_texture *ir)
2313 {
2314 int sampler =
2315 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2316
2317 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2318 * emitting anything other than setting up the constant result.
2319 */
2320 if (ir->op == ir_tg4) {
2321 ir_constant *chan = ir->lod_info.component->as_constant();
2322 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2323 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2324 dst_reg result(this, ir->type);
2325 this->result = src_reg(result);
2326 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2327 return;
2328 }
2329 }
2330
2331 /* Should be lowered by do_lower_texture_projection */
2332 assert(!ir->projector);
2333
2334 /* Should be lowered */
2335 assert(!ir->offset || !ir->offset->type->is_array());
2336
2337 /* Generate code to compute all the subexpression trees. This has to be
2338 * done before loading any values into MRFs for the sampler message since
2339 * generating these values may involve SEND messages that need the MRFs.
2340 */
2341 src_reg coordinate;
2342 if (ir->coordinate) {
2343 ir->coordinate->accept(this);
2344 coordinate = this->result;
2345 }
2346
2347 src_reg shadow_comparitor;
2348 if (ir->shadow_comparitor) {
2349 ir->shadow_comparitor->accept(this);
2350 shadow_comparitor = this->result;
2351 }
2352
2353 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2354 src_reg offset_value;
2355 if (has_nonconstant_offset) {
2356 ir->offset->accept(this);
2357 offset_value = src_reg(this->result);
2358 }
2359
2360 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2361 src_reg lod, dPdx, dPdy, sample_index, mcs;
2362 switch (ir->op) {
2363 case ir_tex:
2364 lod = src_reg(0.0f);
2365 lod_type = glsl_type::float_type;
2366 break;
2367 case ir_txf:
2368 case ir_txl:
2369 case ir_txs:
2370 ir->lod_info.lod->accept(this);
2371 lod = this->result;
2372 lod_type = ir->lod_info.lod->type;
2373 break;
2374 case ir_query_levels:
2375 lod = src_reg(0);
2376 lod_type = glsl_type::int_type;
2377 break;
2378 case ir_txf_ms:
2379 ir->lod_info.sample_index->accept(this);
2380 sample_index = this->result;
2381 sample_index_type = ir->lod_info.sample_index->type;
2382
2383 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2384 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2385 else
2386 mcs = src_reg(0u);
2387 break;
2388 case ir_txd:
2389 ir->lod_info.grad.dPdx->accept(this);
2390 dPdx = this->result;
2391
2392 ir->lod_info.grad.dPdy->accept(this);
2393 dPdy = this->result;
2394
2395 lod_type = ir->lod_info.grad.dPdx->type;
2396 break;
2397 case ir_txb:
2398 case ir_lod:
2399 case ir_tg4:
2400 break;
2401 }
2402
2403 vec4_instruction *inst = NULL;
2404 switch (ir->op) {
2405 case ir_tex:
2406 case ir_txl:
2407 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2408 break;
2409 case ir_txd:
2410 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2411 break;
2412 case ir_txf:
2413 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2414 break;
2415 case ir_txf_ms:
2416 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2417 break;
2418 case ir_txs:
2419 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2420 break;
2421 case ir_tg4:
2422 if (has_nonconstant_offset)
2423 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2424 else
2425 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2426 break;
2427 case ir_query_levels:
2428 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2429 break;
2430 case ir_txb:
2431 unreachable("TXB is not valid for vertex shaders.");
2432 case ir_lod:
2433 unreachable("LOD is not valid for vertex shaders.");
2434 default:
2435 unreachable("Unrecognized tex op");
2436 }
2437
2438 if (ir->offset != NULL && ir->op != ir_txf)
2439 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2440
2441 /* Stuff the channel select bits in the top of the texture offset */
2442 if (ir->op == ir_tg4)
2443 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2444
2445 /* The message header is necessary for:
2446 * - Gen4 (always)
2447 * - Texel offsets
2448 * - Gather channel selection
2449 * - Sampler indices too large to fit in a 4-bit value.
2450 */
2451 inst->header_present =
2452 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2453 sampler >= 16;
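   /* For example, on gen5+ a plain texture() call with no texel offset and a
    * sampler index below 16 can be sent without a header.
    */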
2454 inst->base_mrf = 2;
2455 inst->mlen = inst->header_present + 1; /* always at least one */
2456 inst->sampler = sampler;
2457 inst->dst = dst_reg(this, ir->type);
2458 inst->dst.writemask = WRITEMASK_XYZW;
2459 inst->shadow_compare = ir->shadow_comparitor != NULL;
2460
2461 /* MRF for the first parameter */
2462 int param_base = inst->base_mrf + inst->header_present;
2463
2464 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2465 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2466 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2467 } else {
2468 /* Load the coordinate */
2469 /* FINISHME: gl_clamp_mask and saturate */
2470 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2471 int zero_mask = 0xf & ~coord_mask;
2472
2473 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2474 coordinate));
2475
2476 if (zero_mask != 0) {
2477 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2478 src_reg(0)));
2479 }
2480 /* Load the shadow comparitor */
2481 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2482 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2483 WRITEMASK_X),
2484 shadow_comparitor));
2485 inst->mlen++;
2486 }
2487
2488 /* Load the LOD info */
2489 if (ir->op == ir_tex || ir->op == ir_txl) {
2490 int mrf, writemask;
2491 if (brw->gen >= 5) {
2492 mrf = param_base + 1;
2493 if (ir->shadow_comparitor) {
2494 writemask = WRITEMASK_Y;
2495 /* mlen already incremented */
2496 } else {
2497 writemask = WRITEMASK_X;
2498 inst->mlen++;
2499 }
2500 } else /* brw->gen == 4 */ {
2501 mrf = param_base;
2502 writemask = WRITEMASK_W;
2503 }
2504 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2505 } else if (ir->op == ir_txf) {
2506 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2507 } else if (ir->op == ir_txf_ms) {
2508 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2509 sample_index));
2510 if (brw->gen >= 7)
2511 /* MCS data is in the first channel of `mcs`, but we need to get it into
2512 * the .y channel of the second vec4 of params, so replicate .x across
2513 * the whole vec4 and then mask off everything except .y
2514 */
2515 mcs.swizzle = BRW_SWIZZLE_XXXX;
2516 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2517 mcs));
2518 inst->mlen++;
2519 } else if (ir->op == ir_txd) {
2520 const glsl_type *type = lod_type;
2521
2522 if (brw->gen >= 5) {
2523 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2524 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2525 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2526 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2527 inst->mlen++;
2528
2529 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2530 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2531 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2532 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2533 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2534 inst->mlen++;
2535
2536 if (ir->shadow_comparitor) {
2537 emit(MOV(dst_reg(MRF, param_base + 2,
2538 ir->shadow_comparitor->type, WRITEMASK_Z),
2539 shadow_comparitor));
2540 }
2541 }
2542 } else /* brw->gen == 4 */ {
2543 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2544 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2545 inst->mlen += 2;
2546 }
2547 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2548 if (ir->shadow_comparitor) {
2549 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2550 shadow_comparitor));
2551 }
2552
2553 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2554 offset_value));
2555 inst->mlen++;
2556 }
2557 }
2558
2559 emit(inst);
2560
2561 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2562 * spec requires layers.
2563 */
2564 if (ir->op == ir_txs) {
2565 glsl_type const *type = ir->sampler->type;
2566 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2567 type->sampler_array) {
2568 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2569 writemask(inst->dst, WRITEMASK_Z),
2570 src_reg(inst->dst), src_reg(6));
2571 }
2572 }
2573
2574 if (brw->gen == 6 && ir->op == ir_tg4) {
2575 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2576 }
2577
2578 swizzle_result(ir, src_reg(inst->dst), sampler);
2579 }
2580
2581 /**
2582 * Apply workarounds for Gen6 gather with UINT/SINT
2583 */
2584 void
2585 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2586 {
2587 if (!wa)
2588 return;
2589
2590 int width = (wa & WA_8BIT) ? 8 : 16;
2591 dst_reg dst_f = dst;
2592 dst_f.type = BRW_REGISTER_TYPE_F;
2593
2594 /* Convert from UNORM to UINT */
2595 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2596 emit(MOV(dst, src_reg(dst_f)));
2597
2598 if (wa & WA_SIGN) {
2599 /* Reinterpret the UINT value as a signed INT value by
2600 * shifting the sign bit into place, then shifting back
2601 * preserving sign.
2602 */
2603 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2604 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
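      /* For example, for an 8-bit format (width == 8) both shifts are by 24,
       * replicating bit 7 of the value across bits 8..31.
       */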
2605 }
2606 }
2607
2608 /**
2609 * Set up the gather channel based on the swizzle, for gather4.
2610 */
2611 uint32_t
2612 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2613 {
2614 ir_constant *chan = ir->lod_info.component->as_constant();
2615 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2616 switch (swiz) {
2617 case SWIZZLE_X: return 0;
2618 case SWIZZLE_Y:
2619 /* gather4 sampler is broken for green channel on RG32F --
2620 * we must ask for blue instead.
2621 */
2622 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2623 return 2;
2624 return 1;
2625 case SWIZZLE_Z: return 2;
2626 case SWIZZLE_W: return 3;
2627 default:
2628 unreachable("Not reached"); /* zero, one swizzles handled already */
2629 }
2630 }
2631
2632 void
2633 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2634 {
2635 int s = key->tex.swizzles[sampler];
2636
2637 this->result = src_reg(this, ir->type);
2638 dst_reg swizzled_result(this->result);
2639
2640 if (ir->op == ir_query_levels) {
2641 /* # levels is in .w */
2642 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2643 emit(MOV(swizzled_result, orig_val));
2644 return;
2645 }
2646
2647 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2648 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2649 emit(MOV(swizzled_result, orig_val));
2650 return;
2651 }
2652
2653
2654 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2655 int swizzle[4] = {0};
2656
2657 for (int i = 0; i < 4; i++) {
2658 switch (GET_SWZ(s, i)) {
2659 case SWIZZLE_ZERO:
2660 zero_mask |= (1 << i);
2661 break;
2662 case SWIZZLE_ONE:
2663 one_mask |= (1 << i);
2664 break;
2665 default:
2666 copy_mask |= (1 << i);
2667 swizzle[i] = GET_SWZ(s, i);
2668 break;
2669 }
2670 }
2671
2672 if (copy_mask) {
2673 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2674 swizzled_result.writemask = copy_mask;
2675 emit(MOV(swizzled_result, orig_val));
2676 }
2677
2678 if (zero_mask) {
2679 swizzled_result.writemask = zero_mask;
2680 emit(MOV(swizzled_result, src_reg(0.0f)));
2681 }
2682
2683 if (one_mask) {
2684 swizzled_result.writemask = one_mask;
2685 emit(MOV(swizzled_result, src_reg(1.0f)));
2686 }
2687 }
2688
2689 void
2690 vec4_visitor::visit(ir_return *)
2691 {
2692 unreachable("not reached");
2693 }
2694
2695 void
2696 vec4_visitor::visit(ir_discard *)
2697 {
2698 unreachable("not reached");
2699 }
2700
2701 void
2702 vec4_visitor::visit(ir_if *ir)
2703 {
2704 /* Don't point the annotation at the if statement, because then it plus
2705 * the then and else blocks get printed.
2706 */
2707 this->base_ir = ir->condition;
2708
2709 if (brw->gen == 6) {
2710 emit_if_gen6(ir);
2711 } else {
2712 uint32_t predicate;
2713 emit_bool_to_cond_code(ir->condition, &predicate);
2714 emit(IF(predicate));
2715 }
2716
2717 visit_instructions(&ir->then_instructions);
2718
2719 if (!ir->else_instructions.is_empty()) {
2720 this->base_ir = ir->condition;
2721 emit(BRW_OPCODE_ELSE);
2722
2723 visit_instructions(&ir->else_instructions);
2724 }
2725
2726 this->base_ir = ir->condition;
2727 emit(BRW_OPCODE_ENDIF);
2728 }
2729
2730 void
2731 vec4_visitor::visit(ir_emit_vertex *)
2732 {
2733 unreachable("not reached");
2734 }
2735
2736 void
2737 vec4_visitor::visit(ir_end_primitive *)
2738 {
2739 unreachable("not reached");
2740 }
2741
2742 void
2743 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2744 dst_reg dst, src_reg offset,
2745 src_reg src0, src_reg src1)
2746 {
2747 unsigned mlen = 0;
2748
2749 /* Set the atomic operation offset. */
2750 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2751 mlen++;
2752
2753 /* Set the atomic operation arguments. */
2754 if (src0.file != BAD_FILE) {
2755 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2756 mlen++;
2757 }
2758
2759 if (src1.file != BAD_FILE) {
2760 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2761 mlen++;
2762 }
2763
2764 /* Emit the instruction. Note that this maps to the normal SIMD8
2765 * untyped atomic message on Ivy Bridge, but that's OK because
2766 * unused channels will be masked out.
2767 */
2768 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2769 src_reg(atomic_op), src_reg(surf_index));
2770 inst->base_mrf = 0;
2771 inst->mlen = mlen;
2772 }
2773
2774 void
2775 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2776 src_reg offset)
2777 {
2778 /* Set the surface read offset. */
2779 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2780
2781 /* Emit the instruction. Note that this maps to the normal SIMD8
2782 * untyped surface read message, but that's OK because unused
2783 * channels will be masked out.
2784 */
2785 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2786 dst, src_reg(surf_index));
2787 inst->base_mrf = 0;
2788 inst->mlen = 1;
2789 }
2790
2791 void
2792 vec4_visitor::emit_ndc_computation()
2793 {
2794 /* Get the position */
2795 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2796
2797 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2798 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2799 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2800
2801 current_annotation = "NDC";
2802 dst_reg ndc_w = ndc;
2803 ndc_w.writemask = WRITEMASK_W;
2804 src_reg pos_w = pos;
2805 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2806 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2807
2808 dst_reg ndc_xyz = ndc;
2809 ndc_xyz.writemask = WRITEMASK_XYZ;
2810
2811 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2812 }
2813
2814 void
2815 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2816 {
2817 if (brw->gen < 6 &&
2818 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2819 key->userclip_active || brw->has_negative_rhw_bug)) {
2820 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2821 dst_reg header1_w = header1;
2822 header1_w.writemask = WRITEMASK_W;
2823
2824 emit(MOV(header1, 0u));
2825
2826 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2827 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2828
2829 current_annotation = "Point size";
2830 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2831 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
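         /* The net effect is the point size expressed in 1/8ths (an 11-bit
          * 8.3 fixed-point value) placed in bits 8..18 of the header dword.
          */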
2832 }
2833
2834 if (key->userclip_active) {
2835 current_annotation = "Clipping flags";
2836 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2837 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2838
2839 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2840 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2841 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2842
2843 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2844 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2845 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2846 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2847 }
2848
2849 /* i965 clipping workaround:
2850 * 1) Test for -ve rhw
2851 * 2) If set,
2852 * set ndc = (0,0,0,0)
2853 * set ucp[6] = 1
2854 *
2855 * Later, clipping will detect ucp[6] and ensure the primitive is
2856 * clipped against all fixed planes.
2857 */
2858 if (brw->has_negative_rhw_bug) {
2859 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2860 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2861 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2862 vec4_instruction *inst;
2863 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2864 inst->predicate = BRW_PREDICATE_NORMAL;
2865 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2866 inst->predicate = BRW_PREDICATE_NORMAL;
2867 }
2868
2869 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2870 } else if (brw->gen < 6) {
2871 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2872 } else {
2873 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2874 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2875 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2876 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2877 }
2878 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2879 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2880 src_reg(output_reg[VARYING_SLOT_LAYER])));
2881 }
2882 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2883 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2884 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2885 }
2886 }
2887 }
2888
2889 void
2890 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2891 {
2892 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2893 *
2894 * "If a linked set of shaders forming the vertex stage contains no
2895 * static write to gl_ClipVertex or gl_ClipDistance, but the
2896 * application has requested clipping against user clip planes through
2897 * the API, then the coordinate written to gl_Position is used for
2898 * comparison against the user clip planes."
2899 *
2900 * This function is only called if the shader didn't write to
2901 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2902 * if the user wrote to it; otherwise we use gl_Position.
2903 */
2904 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2905 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2906 clip_vertex = VARYING_SLOT_POS;
2907 }
2908
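   /* Each DP4 below computes one clip distance as dot(clip_vertex, plane) and
    * writes it to a single component of the destination register.
    */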
2909 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2910 ++i) {
2911 reg.writemask = 1 << i;
2912 emit(DP4(reg,
2913 src_reg(output_reg[clip_vertex]),
2914 src_reg(this->userplane[i + offset])));
2915 }
2916 }
2917
2918 void
2919 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2920 {
2921 assert (varying < VARYING_SLOT_MAX);
2922 reg.type = output_reg[varying].type;
2923 current_annotation = output_reg_annotation[varying];
2924 /* Copy the register, saturating if necessary */
2925 vec4_instruction *inst = emit(MOV(reg,
2926 src_reg(output_reg[varying])));
2927 if ((varying == VARYING_SLOT_COL0 ||
2928 varying == VARYING_SLOT_COL1 ||
2929 varying == VARYING_SLOT_BFC0 ||
2930 varying == VARYING_SLOT_BFC1) &&
2931 key->clamp_vertex_color) {
2932 inst->saturate = true;
2933 }
2934 }
2935
2936 void
2937 vec4_visitor::emit_urb_slot(int mrf, int varying)
2938 {
2939 struct brw_reg hw_reg = brw_message_reg(mrf);
2940 dst_reg reg = dst_reg(MRF, mrf);
2941 reg.type = BRW_REGISTER_TYPE_F;
2942
2943 switch (varying) {
2944 case VARYING_SLOT_PSIZ:
2945 /* PSIZ is always in slot 0, and is coupled with other flags. */
2946 current_annotation = "indices, point width, clip flags";
2947 emit_psiz_and_flags(hw_reg);
2948 break;
2949 case BRW_VARYING_SLOT_NDC:
2950 current_annotation = "NDC";
2951 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2952 break;
2953 case VARYING_SLOT_POS:
2954 current_annotation = "gl_Position";
2955 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2956 break;
2957 case VARYING_SLOT_EDGE:
2958 /* This is present when doing unfilled polygons. We're supposed to copy
2959 * the edge flag from the user-provided vertex array
2960 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2961 * of that attribute (starts as 1.0f). This is then used in clipping to
2962 * determine which edges should be drawn as wireframe.
2963 */
2964 current_annotation = "edge flag";
2965 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2966 glsl_type::float_type, WRITEMASK_XYZW))));
2967 break;
2968 case BRW_VARYING_SLOT_PAD:
2969 /* No need to write to this slot */
2970 break;
2971 default:
2972 emit_generic_urb_slot(reg, varying);
2973 break;
2974 }
2975 }
2976
2977 static int
2978 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2979 {
2980 if (brw->gen >= 6) {
2981 /* URB data written (does not include the message header reg) must
2982 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2983 * section 5.4.3.2.2: URB_INTERLEAVED.
2984 *
2985 * URB entries are allocated on a multiple of 1024 bits, so an
2986 * extra 128 bits written here to make the end align to 256 is
2987 * no problem.
2988 */
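      /* For example, a header plus 9 slot registers arrives here as
       * mlen == 10; that would write an odd 9 data registers, so mlen is
       * bumped to 11 and an even 10 data registers go out.
       */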
2989 if ((mlen % 2) != 1)
2990 mlen++;
2991 }
2992
2993 return mlen;
2994 }
2995
2996
2997 /**
2998 * Generates the VUE payload plus the necessary URB write instructions to
2999 * output it.
3000 *
3001 * The VUE layout is documented in Volume 2a.
3002 */
3003 void
3004 vec4_visitor::emit_vertex()
3005 {
3006 /* MRF 0 is reserved for the debugger, so start with message header
3007 * in MRF 1.
3008 */
3009 int base_mrf = 1;
3010 int mrf = base_mrf;
3011 /* In the process of generating our URB write message contents, we
3012 * may need to unspill a register or load from an array. Those
3013 * reads would use MRFs 14-15.
3014 */
3015 int max_usable_mrf = 13;
3016
3017 /* The following assertion verifies that max_usable_mrf causes an
3018 * even-numbered amount of URB write data, which will meet gen6's
3019 * requirements for length alignment.
3020 */
3021 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3022
3023 /* First mrf is the g0-based message header containing URB handles and
3024 * such.
3025 */
3026 emit_urb_write_header(mrf++);
3027
3028 if (brw->gen < 6) {
3029 emit_ndc_computation();
3030 }
3031
3032 /* Lower legacy ff and ClipVertex clipping to clip distances */
3033 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3034 current_annotation = "user clip distances";
3035
3036 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3037 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3038
3039 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3040 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3041 }
3042
3043 /* We may need to split this up into several URB writes, so do them in a
3044 * loop.
3045 */
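   /* With base_mrf == 1 and max_usable_mrf == 13, each write carries at most
    * 12 slot registers (MRFs 2..13) after the header; shaders with more VUE
    * slots issue additional writes at increasing URB offsets.
    */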
3046 int slot = 0;
3047 bool complete = false;
3048 do {
3049 /* URB offset is in URB row increments, and each of our MRFs is half of
3050 * one of those, since we're doing interleaved writes.
3051 */
3052 int offset = slot / 2;
3053
3054 mrf = base_mrf + 1;
3055 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3056 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3057
3058 /* If this was max_usable_mrf, we can't fit anything more into this
3059 * URB WRITE.
3060 */
3061 if (mrf > max_usable_mrf) {
3062 slot++;
3063 break;
3064 }
3065 }
3066
3067 complete = slot >= prog_data->vue_map.num_slots;
3068 current_annotation = "URB write";
3069 vec4_instruction *inst = emit_urb_write_opcode(complete);
3070 inst->base_mrf = base_mrf;
3071 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3072 inst->offset += offset;
3073 } while(!complete);
3074 }
3075
3076
3077 src_reg
3078 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3079 src_reg *reladdr, int reg_offset)
3080 {
3081 /* Because we store the values to scratch interleaved like our
3082 * vertex data, we need to scale the vec4 index by 2.
3083 */
3084 int message_header_scale = 2;
3085
3086 /* Pre-gen6, the message header uses byte offsets instead of vec4
3087 * (16-byte) offset units.
3088 */
3089 if (brw->gen < 6)
3090 message_header_scale *= 16;
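   /* For example, the vec4 at reg_offset 3 becomes message offset 6 on gen6+,
    * or byte offset 96 on gen4-5.
    */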
3091
3092 if (reladdr) {
3093 src_reg index = src_reg(this, glsl_type::int_type);
3094
3095 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3096 emit_before(inst, MUL(dst_reg(index),
3097 index, src_reg(message_header_scale)));
3098
3099 return index;
3100 } else {
3101 return src_reg(reg_offset * message_header_scale);
3102 }
3103 }
3104
3105 src_reg
3106 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3107 src_reg *reladdr, int reg_offset)
3108 {
3109 if (reladdr) {
3110 src_reg index = src_reg(this, glsl_type::int_type);
3111
3112 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3113
3114 /* Pre-gen6, the message header uses byte offsets instead of vec4
3115 * (16-byte) offset units.
3116 */
3117 if (brw->gen < 6) {
3118 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3119 }
3120
3121 return index;
3122 } else if (brw->gen >= 8) {
3123 /* Store the offset in a GRF so we can send-from-GRF. */
3124 src_reg offset = src_reg(this, glsl_type::int_type);
3125 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3126 return offset;
3127 } else {
3128 int message_header_scale = brw->gen < 6 ? 16 : 1;
3129 return src_reg(reg_offset * message_header_scale);
3130 }
3131 }
3132
3133 /**
3134 * Emits an instruction before @inst to load the value named by @orig_src
3135 * from scratch space at @base_offset to @temp.
3136 *
3137 * @base_offset is measured in 32-byte units (the size of a register).
3138 */
3139 void
3140 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3141 dst_reg temp, src_reg orig_src,
3142 int base_offset)
3143 {
3144 int reg_offset = base_offset + orig_src.reg_offset;
3145 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3146
3147 emit_before(inst, SCRATCH_READ(temp, index));
3148 }
3149
3150 /**
3151 * Emits an instruction after @inst to store the value to be written
3152 * to @orig_dst to scratch space at @base_offset, from @temp.
3153 *
3154 * @base_offset is measured in 32-byte units (the size of a register).
3155 */
3156 void
3157 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3158 {
3159 int reg_offset = base_offset + inst->dst.reg_offset;
3160 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3161
3162 /* Create a temporary register to store *inst's result in.
3163 *
3164 * We have to be careful in MOVing from our temporary result register in
3165 * the scratch write. If we swizzle from channels of the temporary that
3166 * weren't initialized, it will confuse live interval analysis, which will
3167 * make spilling fail to make progress.
3168 */
3169 src_reg temp = src_reg(this, glsl_type::vec4_type);
3170 temp.type = inst->dst.type;
3171 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3172 int swizzles[4];
3173 for (int i = 0; i < 4; i++)
3174 if (inst->dst.writemask & (1 << i))
3175 swizzles[i] = i;
3176 else
3177 swizzles[i] = first_writemask_chan;
3178 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3179 swizzles[2], swizzles[3]);
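   /* For example, a destination writemask of .xz yields temp.swizzle == .xxzx,
    * so the scratch write never reads the uninitialized .y/.w channels.
    */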
3180
3181 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3182 inst->dst.writemask));
3183 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3184 write->predicate = inst->predicate;
3185 write->ir = inst->ir;
3186 write->annotation = inst->annotation;
3187 inst->insert_after(write);
3188
3189 inst->dst.file = temp.file;
3190 inst->dst.reg = temp.reg;
3191 inst->dst.reg_offset = temp.reg_offset;
3192 inst->dst.reladdr = NULL;
3193 }
3194
3195 /**
3196 * We can't generally support array access in GRF space, because a
3197 * single instruction's destination can only span 2 contiguous
3198 * registers. So, we send all GRF arrays that get variable index
3199 * access to scratch space.
3200 */
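/* For example (illustrative GLSL), a local `vec4 a[4]` indexed by a dynamic
 * loop counter is relocated to scratch, and each of its accesses below is
 * rewritten into a scratch read or write.
 */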
3201 void
3202 vec4_visitor::move_grf_array_access_to_scratch()
3203 {
3204 int scratch_loc[this->virtual_grf_count];
3205
3206 for (int i = 0; i < this->virtual_grf_count; i++) {
3207 scratch_loc[i] = -1;
3208 }
3209
3210 /* First, calculate the set of virtual GRFs that need to be punted
3211 * to scratch due to having any array access on them, and where in
3212 * scratch.
3213 */
3214 foreach_in_list(vec4_instruction, inst, &instructions) {
3215 if (inst->dst.file == GRF && inst->dst.reladdr &&
3216 scratch_loc[inst->dst.reg] == -1) {
3217 scratch_loc[inst->dst.reg] = c->last_scratch;
3218 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3219 }
3220
3221 for (int i = 0 ; i < 3; i++) {
3222 src_reg *src = &inst->src[i];
3223
3224 if (src->file == GRF && src->reladdr &&
3225 scratch_loc[src->reg] == -1) {
3226 scratch_loc[src->reg] = c->last_scratch;
3227 c->last_scratch += this->virtual_grf_sizes[src->reg];
3228 }
3229 }
3230 }
3231
3232 /* Now, for anything that will be accessed through scratch, rewrite
3233 * it to load/store. Note that this is a _safe list walk, because
3234 * we may generate a new scratch_write instruction after the one
3235 * we're processing.
3236 */
3237 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3238 /* Set up the annotation tracking for new generated instructions. */
3239 base_ir = inst->ir;
3240 current_annotation = inst->annotation;
3241
3242 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3243 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3244 }
3245
3246 for (int i = 0 ; i < 3; i++) {
3247 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3248 continue;
3249
3250 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3251
3252 emit_scratch_read(inst, temp, inst->src[i],
3253 scratch_loc[inst->src[i].reg]);
3254
3255 inst->src[i].file = temp.file;
3256 inst->src[i].reg = temp.reg;
3257 inst->src[i].reg_offset = temp.reg_offset;
3258 inst->src[i].reladdr = NULL;
3259 }
3260 }
3261 }
3262
3263 /**
3264 * Emits an instruction before @inst to load the value named by @orig_src
3265 * from the pull constant buffer (surface) at @base_offset to @temp.
3266 */
3267 void
3268 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3269 dst_reg temp, src_reg orig_src,
3270 int base_offset)
3271 {
3272 int reg_offset = base_offset + orig_src.reg_offset;
3273 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3274 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3275 vec4_instruction *load;
3276
3277 if (brw->gen >= 7) {
3278 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3279 grf_offset.type = offset.type;
3280 emit_before(inst, MOV(grf_offset, offset));
3281
3282 load = new(mem_ctx) vec4_instruction(this,
3283 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3284 temp, index, src_reg(grf_offset));
3285 } else {
3286 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3287 temp, index, offset);
3288 load->base_mrf = 14;
3289 load->mlen = 1;
3290 }
3291 emit_before(inst, load);
3292 }
3293
3294 /**
3295 * Implements array access of uniforms by inserting a
3296 * PULL_CONSTANT_LOAD instruction.
3297 *
3298  * Unlike temporary GRF array access (which we don't support, due to
3299 * the difficulty of doing relative addressing on instruction
3300 * destinations), we could potentially do array access of uniforms
3301 * that were loaded in GRF space as push constants. In real-world
3302 * usage we've seen, though, the arrays being used are always larger
3303 * than we could load as push constants, so just always move all
3304 * uniform array access out to a pull constant buffer.
3305 */
3306 void
3307 vec4_visitor::move_uniform_array_access_to_pull_constants()
3308 {
3309 int pull_constant_loc[this->uniforms];
3310
3311 for (int i = 0; i < this->uniforms; i++) {
3312 pull_constant_loc[i] = -1;
3313 }
3314
3315 /* Walk through and find array access of uniforms. Put a copy of that
3316 * uniform in the pull constant buffer.
3317 *
3318 * Note that we don't move constant-indexed accesses to arrays. No
3319 * testing has been done of the performance impact of this choice.
3320 */
3321 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3322 for (int i = 0 ; i < 3; i++) {
3323 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3324 continue;
3325
3326 int uniform = inst->src[i].reg;
3327
3328 /* If this array isn't already present in the pull constant buffer,
3329 * add it.
3330 */
3331 if (pull_constant_loc[uniform] == -1) {
3332 const float **values = &stage_prog_data->param[uniform * 4];
3333
3334 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3335
3336 assert(uniform < uniform_array_size);
3337 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3338 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3339 = values[j];
3340 }
3341 }
3342
3343 /* Set up the annotation tracking for new generated instructions. */
3344 base_ir = inst->ir;
3345 current_annotation = inst->annotation;
3346
3347 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3348
3349 emit_pull_constant_load(inst, temp, inst->src[i],
3350 pull_constant_loc[uniform]);
3351
3352 inst->src[i].file = temp.file;
3353 inst->src[i].reg = temp.reg;
3354 inst->src[i].reg_offset = temp.reg_offset;
3355 inst->src[i].reladdr = NULL;
3356 }
3357 }
3358
3359 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3360 * no need to track them as larger-than-vec4 objects. This will be
3361 * relied on in cutting out unused uniform vectors from push
3362 * constants.
3363 */
3364 split_uniform_registers();
3365 }
3366
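/**
 * If an unsigned (UD) source has the negate modifier set, materialize the
 * negated value through an explicit MOV into a temporary and use the
 * temporary instead of relying on the source modifier.
 */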
3367 void
3368 vec4_visitor::resolve_ud_negate(src_reg *reg)
3369 {
3370 if (reg->type != BRW_REGISTER_TYPE_UD ||
3371 !reg->negate)
3372 return;
3373
3374 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3375 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3376 *reg = temp;
3377 }
3378
3379 vec4_visitor::vec4_visitor(struct brw_context *brw,
3380 struct brw_vec4_compile *c,
3381 struct gl_program *prog,
3382 const struct brw_vec4_prog_key *key,
3383 struct brw_vec4_prog_data *prog_data,
3384 struct gl_shader_program *shader_prog,
3385 gl_shader_stage stage,
3386 void *mem_ctx,
3387 bool debug_flag,
3388 bool no_spills,
3389 shader_time_shader_type st_base,
3390 shader_time_shader_type st_written,
3391 shader_time_shader_type st_reset)
3392 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3393 c(c),
3394 key(key),
3395 prog_data(prog_data),
3396 sanity_param_count(0),
3397 fail_msg(NULL),
3398 first_non_payload_grf(0),
3399 need_all_constants_in_pull_buffer(false),
3400 debug_flag(debug_flag),
3401 no_spills(no_spills),
3402 st_base(st_base),
3403 st_written(st_written),
3404 st_reset(st_reset)
3405 {
3406 this->mem_ctx = mem_ctx;
3407 this->failed = false;
3408
3409 this->base_ir = NULL;
3410 this->current_annotation = NULL;
3411 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3412
3413 this->variable_ht = hash_table_ctor(0,
3414 hash_table_pointer_hash,
3415 hash_table_pointer_compare);
3416
3417 this->virtual_grf_start = NULL;
3418 this->virtual_grf_end = NULL;
3419 this->virtual_grf_sizes = NULL;
3420 this->virtual_grf_count = 0;
3421 this->virtual_grf_reg_map = NULL;
3422 this->virtual_grf_reg_count = 0;
3423 this->virtual_grf_array_size = 0;
3424 this->live_intervals_valid = false;
3425
3426 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3427
3428 this->uniforms = 0;
3429
3430 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3431 * at least one. See setup_uniforms() in brw_vec4.cpp.
3432 */
3433 this->uniform_array_size = 1;
3434 if (prog_data) {
3435 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3436 }
3437
3438 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3439 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3440 }
3441
3442 vec4_visitor::~vec4_visitor()
3443 {
3444 hash_table_dtor(this->variable_ht);
3445 }
3446
3447
3448 void
3449 vec4_visitor::fail(const char *format, ...)
3450 {
3451 va_list va;
3452 char *msg;
3453
3454 if (failed)
3455 return;
3456
3457 failed = true;
3458
3459 va_start(va, format);
3460 msg = ralloc_vasprintf(mem_ctx, format, va);
3461 va_end(va);
3462 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3463
3464 this->fail_msg = msg;
3465
3466 if (debug_flag) {
3467 fprintf(stderr, "%s", msg);
3468 }
3469 }
3470
3471 } /* namespace brw */