i965: Use typed foreach_in_list instead of foreach_list.
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->sampler = 0;
49 this->texture_offset = 0;
50 this->target = 0;
51 this->shadow_compare = false;
52 this->ir = v->base_ir;
53 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
54 this->header_present = false;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = v->current_annotation;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 this->instructions.push_tail(inst);
65
66 return inst;
67 }
68
69 vec4_instruction *
70 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
71 {
72 new_inst->ir = inst->ir;
73 new_inst->annotation = inst->annotation;
74
75 inst->insert_before(new_inst);
76
77 return inst;
78 }
79
80 vec4_instruction *
81 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
82 src_reg src0, src_reg src1, src_reg src2)
83 {
84 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
85 src0, src1, src2));
86 }
87
88
89 vec4_instruction *
90 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
91 {
92 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
93 }
94
95 vec4_instruction *
96 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
97 {
98 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
99 }
100
101 vec4_instruction *
102 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
103 {
104 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode)
109 {
110 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
111 }
112
113 #define ALU1(op) \
114 vec4_instruction * \
115 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
116 { \
117 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
118 src0); \
119 }
120
121 #define ALU2(op) \
122 vec4_instruction * \
123 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
124 const src_reg &src1) \
125 { \
126 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
127 src0, src1); \
128 }
129
130 #define ALU2_ACC(op) \
131 vec4_instruction * \
132 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
133 const src_reg &src1) \
134 { \
135 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
136 BRW_OPCODE_##op, dst, src0, src1); \
137 inst->writes_accumulator = true; \
138 return inst; \
139 }
140
141 #define ALU3(op) \
142 vec4_instruction * \
143 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
144 const src_reg &src1, const src_reg &src2) \
145 { \
146 assert(brw->gen >= 6); \
147 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
148 src0, src1, src2); \
149 }
150
151 ALU1(NOT)
152 ALU1(MOV)
153 ALU1(FRC)
154 ALU1(RNDD)
155 ALU1(RNDE)
156 ALU1(RNDZ)
157 ALU1(F32TO16)
158 ALU1(F16TO32)
159 ALU2(ADD)
160 ALU2(MUL)
161 ALU2_ACC(MACH)
162 ALU2(AND)
163 ALU2(OR)
164 ALU2(XOR)
165 ALU2(DP3)
166 ALU2(DP4)
167 ALU2(DPH)
168 ALU2(SHL)
169 ALU2(SHR)
170 ALU2(ASR)
171 ALU3(LRP)
172 ALU1(BFREV)
173 ALU3(BFE)
174 ALU2(BFI1)
175 ALU3(BFI2)
176 ALU1(FBH)
177 ALU1(FBL)
178 ALU1(CBIT)
179 ALU3(MAD)
180 ALU2_ACC(ADDC)
181 ALU2_ACC(SUBB)
182 ALU2(MAC)
183
184 /** Gen4 predicated IF. */
185 vec4_instruction *
186 vec4_visitor::IF(uint32_t predicate)
187 {
188 vec4_instruction *inst;
189
190 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
191 inst->predicate = predicate;
192
193 return inst;
194 }
195
196 /** Gen6 IF with embedded comparison. */
197 vec4_instruction *
198 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
221 {
222 vec4_instruction *inst;
223
224 /* original gen4 does type conversion to the destination type
225 * before comparison, producing garbage results for floating
226 * point comparisons.
227 */
228 if (brw->gen == 4) {
229 dst.type = src0.type;
230 if (dst.file == HW_REG)
231 dst.fixed_hw_reg.type = dst.type;
232 }
233
234 resolve_ud_negate(&src0);
235 resolve_ud_negate(&src1);
236
237 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
238 inst->conditional_mod = condition;
239
240 return inst;
241 }
242
243 vec4_instruction *
244 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
245 {
246 vec4_instruction *inst;
247
248 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
249 dst, index);
250 inst->base_mrf = 14;
251 inst->mlen = 2;
252
253 return inst;
254 }
255
256 vec4_instruction *
257 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
258 const src_reg &index)
259 {
260 vec4_instruction *inst;
261
262 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
263 dst, src, index);
264 inst->base_mrf = 13;
265 inst->mlen = 3;
266
267 return inst;
268 }
269
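/** Emit a dot product as DP2, DP3 or DP4 according to the number of components. */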
270 void
271 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
272 {
273 static enum opcode dot_opcodes[] = {
274 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
275 };
276
277 emit(dot_opcodes[elements - 2], dst, src0, src1);
278 }
279
280 src_reg
281 vec4_visitor::fix_3src_operand(src_reg src)
282 {
283 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
284 * able to use vertical stride of zero to replicate the vec4 uniform, like
285 *
286 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
287 *
288 * But you can't, since vertical stride is always four in three-source
289 * instructions. Instead, insert a MOV instruction to do the replication so
290 * that the three-source instruction can consume it.
291 */
292
293 /* The MOV is only needed if the source is a uniform or immediate. */
294 if (src.file != UNIFORM && src.file != IMM)
295 return src;
296
297 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
298 return src;
299
300 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
301 expanded.type = src.type;
302 emit(MOV(expanded, src));
303 return src_reg(expanded);
304 }
305
306 src_reg
307 vec4_visitor::fix_math_operand(src_reg src)
308 {
309 /* The gen6 math instruction ignores the source modifiers --
310 * swizzle, abs, negate, and at least some parts of the register
311 * region description.
312 *
313 * Rather than trying to enumerate all these cases, *always* expand the
314 * operand to a temp GRF for gen6.
315 *
316 * For gen7, keep the operand as-is, except if immediate, which gen7 still
317 * can't use.
318 */
319
320 if (brw->gen == 7 && src.file != IMM)
321 return src;
322
323 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
324 expanded.type = src.type;
325 emit(MOV(expanded, src));
326 return src_reg(expanded);
327 }
328
329 void
330 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
331 {
332 src = fix_math_operand(src);
333
334 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
335 /* The gen6 math instruction must be align1, so we can't do
336 * writemasks.
337 */
338 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
339
340 emit(opcode, temp_dst, src);
341
342 emit(MOV(dst, src_reg(temp_dst)));
343 } else {
344 emit(opcode, dst, src);
345 }
346 }
347
348 void
349 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
350 {
351 vec4_instruction *inst = emit(opcode, dst, src);
352 inst->base_mrf = 1;
353 inst->mlen = 1;
354 }
355
356 void
357 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
358 {
359 switch (opcode) {
360 case SHADER_OPCODE_RCP:
361 case SHADER_OPCODE_RSQ:
362 case SHADER_OPCODE_SQRT:
363 case SHADER_OPCODE_EXP2:
364 case SHADER_OPCODE_LOG2:
365 case SHADER_OPCODE_SIN:
366 case SHADER_OPCODE_COS:
367 break;
368 default:
369 assert(!"not reached: bad math opcode");
370 return;
371 }
372
373 if (brw->gen >= 8) {
374 emit(opcode, dst, src);
375 } else if (brw->gen >= 6) {
376 emit_math1_gen6(opcode, dst, src);
377 } else {
378 emit_math1_gen4(opcode, dst, src);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen6(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 src0 = fix_math_operand(src0);
387 src1 = fix_math_operand(src1);
388
389 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
390 /* The gen6 math instruction must be align1, so we can't do
391 * writemasks.
392 */
393 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
394 temp_dst.type = dst.type;
395
396 emit(opcode, temp_dst, src0, src1);
397
398 emit(MOV(dst, src_reg(temp_dst)));
399 } else {
400 emit(opcode, dst, src0, src1);
401 }
402 }
403
404 void
405 vec4_visitor::emit_math2_gen4(enum opcode opcode,
406 dst_reg dst, src_reg src0, src_reg src1)
407 {
408 vec4_instruction *inst = emit(opcode, dst, src0, src1);
409 inst->base_mrf = 1;
410 inst->mlen = 2;
411 }
412
413 void
414 vec4_visitor::emit_math(enum opcode opcode,
415 dst_reg dst, src_reg src0, src_reg src1)
416 {
417 switch (opcode) {
418 case SHADER_OPCODE_POW:
419 case SHADER_OPCODE_INT_QUOTIENT:
420 case SHADER_OPCODE_INT_REMAINDER:
421 break;
422 default:
423 assert(!"not reached: unsupported binary math opcode");
424 return;
425 }
426
427 if (brw->gen >= 8) {
428 emit(opcode, dst, src0, src1);
429 } else if (brw->gen >= 6) {
430 emit_math2_gen6(opcode, dst, src0, src1);
431 } else {
432 emit_math2_gen4(opcode, dst, src0, src1);
433 }
434 }
435
436 void
437 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
438 {
439 if (brw->gen < 7)
440 assert(!"ir_unop_pack_half_2x16 should be lowered");
441
442 assert(dst.type == BRW_REGISTER_TYPE_UD);
443 assert(src0.type == BRW_REGISTER_TYPE_F);
444
445 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
446 *
447 * Because this instruction does not have a 16-bit floating-point type,
448 * the destination data type must be Word (W).
449 *
450 * The destination must be DWord-aligned and specify a horizontal stride
451 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
452 * each destination channel and the upper word is not modified.
453 *
454 * The above restriction implies that the f32to16 instruction must use
455 * align1 mode, because only in align1 mode is it possible to specify
456 * horizontal stride. We choose here to defy the hardware docs and emit
457 * align16 instructions.
458 *
459 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
460 * instructions. I was partially successful in that the code passed all
461 * tests. However, the code was dubiously correct and fragile, and the
462 * tests were not harsh enough to probe that frailty. Not trusting the
463 * code, I chose instead to remain in align16 mode in defiance of the hw
464 * docs).
465 *
466 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
467 * simulator, emitting a f32to16 in align16 mode with UD as destination
468 * data type is safe. The behavior differs from that specified in the PRM
469 * in that the upper word of each destination channel is cleared to 0.
470 */
471
472 dst_reg tmp_dst(this, glsl_type::uvec2_type);
473 src_reg tmp_src(tmp_dst);
474
475 #if 0
476 /* Verify the undocumented behavior on which the following instructions
477 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
478 * then the result of the bit-or instruction below will be incorrect.
479 *
480 * You should inspect the disasm output in order to verify that the MOV is
481 * not optimized away.
482 */
483 emit(MOV(tmp_dst, src_reg(0x12345678u)));
484 #endif
485
486 /* Give tmp the form below, where "." means untouched.
487 *
488 * w z y x w z y x
489 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
490 *
491 * That the upper word of each write-channel be 0 is required for the
492 * following bit-shift and bit-or instructions to work. Note that this
493 * relies on the undocumented hardware behavior mentioned above.
494 */
495 tmp_dst.writemask = WRITEMASK_XY;
496 emit(F32TO16(tmp_dst, src0));
497
498 /* Give the write-channels of dst the form:
499 * 0xhhhh0000
500 */
501 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
502 emit(SHL(dst, tmp_src, src_reg(16u)));
503
504 /* Finally, give the write-channels of dst the form of packHalf2x16's
505 * output:
506 * 0xhhhhllll
507 */
508 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
509 emit(OR(dst, src_reg(dst), tmp_src));
510 }
511
512 void
513 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
514 {
515 if (brw->gen < 7)
516 assert(!"ir_unop_unpack_half_2x16 should be lowered");
517
518 assert(dst.type == BRW_REGISTER_TYPE_F);
519 assert(src0.type == BRW_REGISTER_TYPE_UD);
520
521 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
522 *
523 * Because this instruction does not have a 16-bit floating-point type,
524 * the source data type must be Word (W). The destination type must be
525 * F (Float).
526 *
527 * To use W as the source data type, we must adjust horizontal strides,
528 * which is only possible in align1 mode. All my [chadv] attempts at
529 * emitting align1 instructions for unpackHalf2x16 failed to pass the
530 * Piglit tests, so I gave up.
531 *
532 * I've verified that, on gen7 hardware and the simulator, it is safe to
533 * emit f16to32 in align16 mode with UD as source data type.
534 */
535
536 dst_reg tmp_dst(this, glsl_type::uvec2_type);
537 src_reg tmp_src(tmp_dst);
538
539 tmp_dst.writemask = WRITEMASK_X;
540 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
541
542 tmp_dst.writemask = WRITEMASK_Y;
543 emit(SHR(tmp_dst, src0, src_reg(16u)));
544
545 dst.writemask = WRITEMASK_XY;
546 emit(F16TO32(dst, tmp_src));
547 }
548
549 void
550 vec4_visitor::visit_instructions(const exec_list *list)
551 {
552 foreach_in_list(ir_instruction, ir, list) {
553 base_ir = ir;
554 ir->accept(this);
555 }
556 }
557
558
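/**
 * Returns the number of vec4 registers a value of the given GLSL type
 * occupies in this backend: one per matrix column, one per (possibly
 * padded) scalar or vector, summed over array and struct members.
 */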
559 static int
560 type_size(const struct glsl_type *type)
561 {
562 unsigned int i;
563 int size;
564
565 switch (type->base_type) {
566 case GLSL_TYPE_UINT:
567 case GLSL_TYPE_INT:
568 case GLSL_TYPE_FLOAT:
569 case GLSL_TYPE_BOOL:
570 if (type->is_matrix()) {
571 return type->matrix_columns;
572 } else {
573 /* Regardless of size of vector, it gets a vec4. This is bad
574 * packing for things like floats, but otherwise arrays become a
575 * mess. Hopefully a later pass over the code can pack scalars
576 * down if appropriate.
577 */
578 return 1;
579 }
580 case GLSL_TYPE_ARRAY:
581 assert(type->length > 0);
582 return type_size(type->fields.array) * type->length;
583 case GLSL_TYPE_STRUCT:
584 size = 0;
585 for (i = 0; i < type->length; i++) {
586 size += type_size(type->fields.structure[i].type);
587 }
588 return size;
589 case GLSL_TYPE_SAMPLER:
590 /* Samplers take up one slot in UNIFORMS[], but they're baked in
591 * at link time.
592 */
593 return 1;
594 case GLSL_TYPE_ATOMIC_UINT:
595 return 0;
596 case GLSL_TYPE_IMAGE:
597 case GLSL_TYPE_VOID:
598 case GLSL_TYPE_ERROR:
599 case GLSL_TYPE_INTERFACE:
600 assert(0);
601 break;
602 }
603
604 return 0;
605 }
606
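/**
 * Allocate a virtual GRF of the given size in vec4 registers, growing the
 * size and register-map tracking arrays as needed, and return its index.
 */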
607 int
608 vec4_visitor::virtual_grf_alloc(int size)
609 {
610 if (virtual_grf_array_size <= virtual_grf_count) {
611 if (virtual_grf_array_size == 0)
612 virtual_grf_array_size = 16;
613 else
614 virtual_grf_array_size *= 2;
615 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
616 virtual_grf_array_size);
617 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
618 virtual_grf_array_size);
619 }
620 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
621 virtual_grf_reg_count += size;
622 virtual_grf_sizes[virtual_grf_count] = size;
623 return virtual_grf_count++;
624 }
625
626 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
627 {
628 init();
629
630 this->file = GRF;
631 this->reg = v->virtual_grf_alloc(type_size(type));
632
633 if (type->is_array() || type->is_record()) {
634 this->swizzle = BRW_SWIZZLE_NOOP;
635 } else {
636 this->swizzle = swizzle_for_size(type->vector_elements);
637 }
638
639 this->type = brw_type_for_base_type(type);
640 }
641
642 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
643 {
644 init();
645
646 this->file = GRF;
647 this->reg = v->virtual_grf_alloc(type_size(type));
648
649 if (type->is_array() || type->is_record()) {
650 this->writemask = WRITEMASK_XYZW;
651 } else {
652 this->writemask = (1 << type->vector_elements) - 1;
653 }
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 /* Our support for uniforms is piggy-backed on the struct
659 * gl_fragment_program, because that's where the values actually
660 * get stored, rather than in some global gl_shader_program uniform
661 * store.
662 */
663 void
664 vec4_visitor::setup_uniform_values(ir_variable *ir)
665 {
666 int namelen = strlen(ir->name);
667
668 /* The data for our (non-builtin) uniforms is stored in a series of
669 * gl_uniform_driver_storage structs for each subcomponent that
670 * glGetUniformLocation() could name. We know it's been set up in the same
671 * order we'd walk the type, so walk the list of storage and find anything
672 * with our name, or the prefix of a component that starts with our name.
673 */
674 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
675 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
676
677 if (strncmp(ir->name, storage->name, namelen) != 0 ||
678 (storage->name[namelen] != 0 &&
679 storage->name[namelen] != '.' &&
680 storage->name[namelen] != '[')) {
681 continue;
682 }
683
684 gl_constant_value *components = storage->storage;
685 unsigned vector_count = (MAX2(storage->array_elements, 1) *
686 storage->type->matrix_columns);
687
688 for (unsigned s = 0; s < vector_count; s++) {
689 assert(uniforms < uniform_array_size);
690 uniform_vector_size[uniforms] = storage->type->vector_elements;
691
692 int i;
693 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
694 stage_prog_data->param[uniforms * 4 + i] = &components->f;
695 components++;
696 }
697 for (; i < 4; i++) {
698 static float zero = 0;
699 stage_prog_data->param[uniforms * 4 + i] = &zero;
700 }
701
702 uniforms++;
703 }
704 }
705 }
706
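/**
 * Set up one vec4 uniform slot for each user clip plane requested by the
 * compile key, pointing the parameters at the current clip plane values.
 */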
707 void
708 vec4_visitor::setup_uniform_clipplane_values()
709 {
710 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
711
712 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
713 assert(this->uniforms < uniform_array_size);
714 this->uniform_vector_size[this->uniforms] = 4;
715 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
716 this->userplane[i].type = BRW_REGISTER_TYPE_F;
717 for (int j = 0; j < 4; ++j) {
718 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
719 }
720 ++this->uniforms;
721 }
722 }
723
724 /* Our support for builtin uniforms is even scarier than non-builtin.
725 * It sits on top of the PROG_STATE_VAR parameters that are
726 * automatically updated from GL context state.
727 */
728 void
729 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
730 {
731 const ir_state_slot *const slots = ir->state_slots;
732 assert(ir->state_slots != NULL);
733
734 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
735 /* This state reference has already been set up by ir_to_mesa,
736 * but we'll get the same index back here. We can reference
737 * ParameterValues directly, since unlike brw_fs.cpp, we never
738 * add new state references during compile.
739 */
740 int index = _mesa_add_state_reference(this->prog->Parameters,
741 (gl_state_index *)slots[i].tokens);
742 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
743
744 assert(this->uniforms < uniform_array_size);
745 this->uniform_vector_size[this->uniforms] = 0;
746 /* Add each of the unique swizzled channels of the element.
747 * This will end up matching the size of the glsl_type of this field.
748 */
749 int last_swiz = -1;
750 for (unsigned int j = 0; j < 4; j++) {
751 int swiz = GET_SWZ(slots[i].swizzle, j);
752 last_swiz = swiz;
753
754 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
755 assert(this->uniforms < uniform_array_size);
756 if (swiz <= last_swiz)
757 this->uniform_vector_size[this->uniforms]++;
758 }
759 this->uniforms++;
760 }
761 }
762
763 dst_reg *
764 vec4_visitor::variable_storage(ir_variable *var)
765 {
766 return (dst_reg *)hash_table_find(this->variable_ht, var);
767 }
768
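/**
 * Emit instructions that evaluate a boolean rvalue and set the flag register,
 * returning the predicate the caller should use in *predicate.
 */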
769 void
770 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
771 {
772 ir_expression *expr = ir->as_expression();
773
774 *predicate = BRW_PREDICATE_NORMAL;
775
776 if (expr) {
777 src_reg op[2];
778 vec4_instruction *inst;
779
780 assert(expr->get_num_operands() <= 2);
781 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
782 expr->operands[i]->accept(this);
783 op[i] = this->result;
784
785 resolve_ud_negate(&op[i]);
786 }
787
788 switch (expr->operation) {
789 case ir_unop_logic_not:
790 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
791 inst->conditional_mod = BRW_CONDITIONAL_Z;
792 break;
793
794 case ir_binop_logic_xor:
795 inst = emit(XOR(dst_null_d(), op[0], op[1]));
796 inst->conditional_mod = BRW_CONDITIONAL_NZ;
797 break;
798
799 case ir_binop_logic_or:
800 inst = emit(OR(dst_null_d(), op[0], op[1]));
801 inst->conditional_mod = BRW_CONDITIONAL_NZ;
802 break;
803
804 case ir_binop_logic_and:
805 inst = emit(AND(dst_null_d(), op[0], op[1]));
806 inst->conditional_mod = BRW_CONDITIONAL_NZ;
807 break;
808
809 case ir_unop_f2b:
810 if (brw->gen >= 6) {
811 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
812 } else {
813 inst = emit(MOV(dst_null_f(), op[0]));
814 inst->conditional_mod = BRW_CONDITIONAL_NZ;
815 }
816 break;
817
818 case ir_unop_i2b:
819 if (brw->gen >= 6) {
820 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
821 } else {
822 inst = emit(MOV(dst_null_d(), op[0]));
823 inst->conditional_mod = BRW_CONDITIONAL_NZ;
824 }
825 break;
826
827 case ir_binop_all_equal:
828 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
829 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
830 break;
831
832 case ir_binop_any_nequal:
833 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
834 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
835 break;
836
837 case ir_unop_any:
838 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
839 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
840 break;
841
842 case ir_binop_greater:
843 case ir_binop_gequal:
844 case ir_binop_less:
845 case ir_binop_lequal:
846 case ir_binop_equal:
847 case ir_binop_nequal:
848 emit(CMP(dst_null_d(), op[0], op[1],
849 brw_conditional_for_comparison(expr->operation)));
850 break;
851
852 default:
853 assert(!"not reached");
854 break;
855 }
856 return;
857 }
858
859 ir->accept(this);
860
861 resolve_ud_negate(&this->result);
862
863 if (brw->gen >= 6) {
864 vec4_instruction *inst = emit(AND(dst_null_d(),
865 this->result, src_reg(1)));
866 inst->conditional_mod = BRW_CONDITIONAL_NZ;
867 } else {
868 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
869 inst->conditional_mod = BRW_CONDITIONAL_NZ;
870 }
871 }
872
873 /**
874 * Emit a gen6 IF statement with the comparison folded into the IF
875 * instruction.
876 */
877 void
878 vec4_visitor::emit_if_gen6(ir_if *ir)
879 {
880 ir_expression *expr = ir->condition->as_expression();
881
882 if (expr) {
883 src_reg op[2];
884 dst_reg temp;
885
886 assert(expr->get_num_operands() <= 2);
887 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
888 expr->operands[i]->accept(this);
889 op[i] = this->result;
890 }
891
892 switch (expr->operation) {
893 case ir_unop_logic_not:
894 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
895 return;
896
897 case ir_binop_logic_xor:
898 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
899 return;
900
901 case ir_binop_logic_or:
902 temp = dst_reg(this, glsl_type::bool_type);
903 emit(OR(temp, op[0], op[1]));
904 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
905 return;
906
907 case ir_binop_logic_and:
908 temp = dst_reg(this, glsl_type::bool_type);
909 emit(AND(temp, op[0], op[1]));
910 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
911 return;
912
913 case ir_unop_f2b:
914 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
915 return;
916
917 case ir_unop_i2b:
918 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
919 return;
920
921 case ir_binop_greater:
922 case ir_binop_gequal:
923 case ir_binop_less:
924 case ir_binop_lequal:
925 case ir_binop_equal:
926 case ir_binop_nequal:
927 emit(IF(op[0], op[1],
928 brw_conditional_for_comparison(expr->operation)));
929 return;
930
931 case ir_binop_all_equal:
932 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
933 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
934 return;
935
936 case ir_binop_any_nequal:
937 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
938 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
939 return;
940
941 case ir_unop_any:
942 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
943 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
944 return;
945
946 default:
947 assert(!"not reached");
948 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
949 return;
950 }
951 return;
952 }
953
954 ir->condition->accept(this);
955
956 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
957 }
958
959 void
960 vec4_visitor::visit(ir_variable *ir)
961 {
962 dst_reg *reg = NULL;
963
964 if (variable_storage(ir))
965 return;
966
967 switch (ir->data.mode) {
968 case ir_var_shader_in:
969 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
970 break;
971
972 case ir_var_shader_out:
973 reg = new(mem_ctx) dst_reg(this, ir->type);
974
975 for (int i = 0; i < type_size(ir->type); i++) {
976 output_reg[ir->data.location + i] = *reg;
977 output_reg[ir->data.location + i].reg_offset = i;
978 output_reg[ir->data.location + i].type =
979 brw_type_for_base_type(ir->type->get_scalar_type());
980 output_reg_annotation[ir->data.location + i] = ir->name;
981 }
982 break;
983
984 case ir_var_auto:
985 case ir_var_temporary:
986 reg = new(mem_ctx) dst_reg(this, ir->type);
987 break;
988
989 case ir_var_uniform:
990 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
991
992 /* Thanks to the lower_ubo_reference pass, we will see only
993 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
994 * variables, so no need for them to be in variable_ht.
995 *
996 * Atomic counters take no uniform storage, no need to do
997 * anything here.
998 */
999 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
1000 return;
1001
1002 /* Track how big the whole uniform variable is, in case we need to put a
1003 * copy of its data into pull constants for array access.
1004 */
1005 assert(this->uniforms < uniform_array_size);
1006 this->uniform_size[this->uniforms] = type_size(ir->type);
1007
1008 if (!strncmp(ir->name, "gl_", 3)) {
1009 setup_builtin_uniform_values(ir);
1010 } else {
1011 setup_uniform_values(ir);
1012 }
1013 break;
1014
1015 case ir_var_system_value:
1016 reg = make_reg_for_system_value(ir);
1017 break;
1018
1019 default:
1020 assert(!"not reached");
1021 }
1022
1023 reg->type = brw_type_for_base_type(ir->type);
1024 hash_table_insert(this->variable_ht, reg, ir);
1025 }
1026
1027 void
1028 vec4_visitor::visit(ir_loop *ir)
1029 {
1030 /* We don't want debugging output to print the whole body of the
1031 * loop as the annotation.
1032 */
1033 this->base_ir = NULL;
1034
1035 emit(BRW_OPCODE_DO);
1036
1037 visit_instructions(&ir->body_instructions);
1038
1039 emit(BRW_OPCODE_WHILE);
1040 }
1041
1042 void
1043 vec4_visitor::visit(ir_loop_jump *ir)
1044 {
1045 switch (ir->mode) {
1046 case ir_loop_jump::jump_break:
1047 emit(BRW_OPCODE_BREAK);
1048 break;
1049 case ir_loop_jump::jump_continue:
1050 emit(BRW_OPCODE_CONTINUE);
1051 break;
1052 }
1053 }
1054
1055
1056 void
1057 vec4_visitor::visit(ir_function_signature *ir)
1058 {
1059 assert(0);
1060 (void)ir;
1061 }
1062
1063 void
1064 vec4_visitor::visit(ir_function *ir)
1065 {
1066 /* Ignore function bodies other than main() -- we shouldn't see calls to
1067 * them since they should all be inlined.
1068 */
1069 if (strcmp(ir->name, "main") == 0) {
1070 const ir_function_signature *sig;
1071 exec_list empty;
1072
1073 sig = ir->matching_signature(NULL, &empty);
1074
1075 assert(sig);
1076
1077 visit_instructions(&sig->body);
1078 }
1079 }
1080
1081 bool
1082 vec4_visitor::try_emit_sat(ir_expression *ir)
1083 {
1084 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1085 if (!sat_src)
1086 return false;
1087
1088 sat_src->accept(this);
1089 src_reg src = this->result;
1090
1091 this->result = src_reg(this, ir->type);
1092 vec4_instruction *inst;
1093 inst = emit(MOV(dst_reg(this->result), src));
1094 inst->saturate = true;
1095
1096 return true;
1097 }
1098
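/**
 * Try to emit a floating-point add of a multiply as a single MAD.  Returns
 * false if the hardware can't do it (gen < 6) or the expression doesn't match.
 */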
1099 bool
1100 vec4_visitor::try_emit_mad(ir_expression *ir)
1101 {
1102 /* 3-src instructions were introduced in gen6. */
1103 if (brw->gen < 6)
1104 return false;
1105
1106 /* MAD can only handle floating-point data. */
1107 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1108 return false;
1109
1110 ir_rvalue *nonmul = ir->operands[1];
1111 ir_expression *mul = ir->operands[0]->as_expression();
1112
1113 if (!mul || mul->operation != ir_binop_mul) {
1114 nonmul = ir->operands[0];
1115 mul = ir->operands[1]->as_expression();
1116
1117 if (!mul || mul->operation != ir_binop_mul)
1118 return false;
1119 }
1120
1121 nonmul->accept(this);
1122 src_reg src0 = fix_3src_operand(this->result);
1123
1124 mul->operands[0]->accept(this);
1125 src_reg src1 = fix_3src_operand(this->result);
1126
1127 mul->operands[1]->accept(this);
1128 src_reg src2 = fix_3src_operand(this->result);
1129
1130 this->result = src_reg(this, ir->type);
1131 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1132
1133 return true;
1134 }
1135
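/**
 * Try to emit b2f of a comparison directly as a CMP followed by a predicated
 * SEL of 1.0f, avoiding a separate 0/1 boolean temporary.
 */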
1136 bool
1137 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1138 {
1139 ir_expression *const cmp = ir->operands[0]->as_expression();
1140
1141 if (cmp == NULL)
1142 return false;
1143
1144 switch (cmp->operation) {
1145 case ir_binop_less:
1146 case ir_binop_greater:
1147 case ir_binop_lequal:
1148 case ir_binop_gequal:
1149 case ir_binop_equal:
1150 case ir_binop_nequal:
1151 break;
1152
1153 default:
1154 return false;
1155 }
1156
1157 cmp->operands[0]->accept(this);
1158 const src_reg cmp_src0 = this->result;
1159
1160 cmp->operands[1]->accept(this);
1161 const src_reg cmp_src1 = this->result;
1162
1163 this->result = src_reg(this, ir->type);
1164
1165 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1166 brw_conditional_for_comparison(cmp->operation)));
1167
1168 /* If the comparison is false, this->result will just happen to be zero.
1169 */
1170 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1171 this->result, src_reg(1.0f));
1172 inst->predicate = BRW_PREDICATE_NORMAL;
1173 inst->predicate_inverse = true;
1174
1175 return true;
1176 }
1177
1178 void
1179 vec4_visitor::emit_bool_comparison(unsigned int op,
1180 dst_reg dst, src_reg src0, src_reg src1)
1181 {
1182 /* original gen4 does destination conversion before comparison. */
1183 if (brw->gen < 5)
1184 dst.type = src0.type;
1185
1186 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1187
1188 dst.type = BRW_REGISTER_TYPE_D;
1189 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1190 }
1191
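/**
 * Emit a min/max operation: a single SEL with a conditional modifier on gen6
 * and later, or a CMP followed by a predicated SEL on older hardware.
 */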
1192 void
1193 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1194 src_reg src0, src_reg src1)
1195 {
1196 vec4_instruction *inst;
1197
1198 if (brw->gen >= 6) {
1199 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1200 inst->conditional_mod = conditionalmod;
1201 } else {
1202 emit(CMP(dst, src0, src1, conditionalmod));
1203
1204 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1205 inst->predicate = BRW_PREDICATE_NORMAL;
1206 }
1207 }
1208
1209 void
1210 vec4_visitor::emit_lrp(const dst_reg &dst,
1211 const src_reg &x, const src_reg &y, const src_reg &a)
1212 {
1213 if (brw->gen >= 6) {
1214 /* Note that the instruction's argument order is reversed from GLSL
1215 * and the IR.
1216 */
1217 emit(LRP(dst,
1218 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1219 } else {
1220 /* Earlier generations don't support three source operations, so we
1221 * need to emit x*(1-a) + y*a.
1222 */
1223 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1224 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1225 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1226 y_times_a.writemask = dst.writemask;
1227 one_minus_a.writemask = dst.writemask;
1228 x_times_one_minus_a.writemask = dst.writemask;
1229
1230 emit(MUL(y_times_a, y, a));
1231 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1232 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1233 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1234 }
1235 }
1236
1237 void
1238 vec4_visitor::visit(ir_expression *ir)
1239 {
1240 unsigned int operand;
1241 src_reg op[Elements(ir->operands)];
1242 src_reg result_src;
1243 dst_reg result_dst;
1244 vec4_instruction *inst;
1245
1246 if (try_emit_sat(ir))
1247 return;
1248
1249 if (ir->operation == ir_binop_add) {
1250 if (try_emit_mad(ir))
1251 return;
1252 }
1253
1254 if (ir->operation == ir_unop_b2f) {
1255 if (try_emit_b2f_of_compare(ir))
1256 return;
1257 }
1258
1259 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1260 this->result.file = BAD_FILE;
1261 ir->operands[operand]->accept(this);
1262 if (this->result.file == BAD_FILE) {
1263 fprintf(stderr, "Failed to get tree for expression operand:\n");
1264 ir->operands[operand]->fprint(stderr);
1265 exit(1);
1266 }
1267 op[operand] = this->result;
1268
1269 /* Matrix expression operands should have been broken down to vector
1270 * operations already.
1271 */
1272 assert(!ir->operands[operand]->type->is_matrix());
1273 }
1274
1275 int vector_elements = ir->operands[0]->type->vector_elements;
1276 if (ir->operands[1]) {
1277 vector_elements = MAX2(vector_elements,
1278 ir->operands[1]->type->vector_elements);
1279 }
1280
1281 this->result.file = BAD_FILE;
1282
1283 /* Storage for our result. Ideally for an assignment we'd be using
1284 * the actual storage for the result here, instead.
1285 */
1286 result_src = src_reg(this, ir->type);
1287 /* convenience for the emit functions below. */
1288 result_dst = dst_reg(result_src);
1289 /* If nothing special happens, this is the result. */
1290 this->result = result_src;
1291 /* Limit writes to the channels that will be used by result_src later.
1292 * This does limit this temp's use as a temporary for multi-instruction
1293 * sequences.
1294 */
1295 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1296
1297 switch (ir->operation) {
1298 case ir_unop_logic_not:
1299 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1300 * the ones complement of the whole register, not just bit 0.
1301 */
1302 emit(XOR(result_dst, op[0], src_reg(1)));
1303 break;
1304 case ir_unop_neg:
1305 op[0].negate = !op[0].negate;
1306 emit(MOV(result_dst, op[0]));
1307 break;
1308 case ir_unop_abs:
1309 op[0].abs = true;
1310 op[0].negate = false;
1311 emit(MOV(result_dst, op[0]));
1312 break;
1313
1314 case ir_unop_sign:
1315 if (ir->type->is_float()) {
1316 /* AND(val, 0x80000000) gives the sign bit.
1317 *
1318 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1319 * zero.
1320 */
1321 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1322
1323 op[0].type = BRW_REGISTER_TYPE_UD;
1324 result_dst.type = BRW_REGISTER_TYPE_UD;
1325 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1326
1327 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1328 inst->predicate = BRW_PREDICATE_NORMAL;
1329
1330 this->result.type = BRW_REGISTER_TYPE_F;
1331 } else {
1332 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1333 * -> non-negative val generates 0x00000000.
1334 * Predicated OR sets 1 if val is positive.
1335 */
1336 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1337
1338 emit(ASR(result_dst, op[0], src_reg(31)));
1339
1340 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1341 inst->predicate = BRW_PREDICATE_NORMAL;
1342 }
1343 break;
1344
1345 case ir_unop_rcp:
1346 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1347 break;
1348
1349 case ir_unop_exp2:
1350 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1351 break;
1352 case ir_unop_log2:
1353 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1354 break;
1355 case ir_unop_exp:
1356 case ir_unop_log:
1357 assert(!"not reached: should be handled by ir_explog_to_explog2");
1358 break;
1359 case ir_unop_sin:
1360 case ir_unop_sin_reduced:
1361 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1362 break;
1363 case ir_unop_cos:
1364 case ir_unop_cos_reduced:
1365 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1366 break;
1367
1368 case ir_unop_dFdx:
1369 case ir_unop_dFdy:
1370 assert(!"derivatives not valid in vertex shader");
1371 break;
1372
1373 case ir_unop_bitfield_reverse:
1374 emit(BFREV(result_dst, op[0]));
1375 break;
1376 case ir_unop_bit_count:
1377 emit(CBIT(result_dst, op[0]));
1378 break;
1379 case ir_unop_find_msb: {
1380 src_reg temp = src_reg(this, glsl_type::uint_type);
1381
1382 inst = emit(FBH(dst_reg(temp), op[0]));
1383 inst->dst.writemask = WRITEMASK_XYZW;
1384
1385 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1386 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1387 * subtract the result from 31 to convert the MSB count into an LSB count.
1388 */
1389
1390 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1391 temp.swizzle = BRW_SWIZZLE_NOOP;
1392 emit(MOV(result_dst, temp));
1393
1394 src_reg src_tmp = src_reg(result_dst);
1395 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1396
1397 src_tmp.negate = true;
1398 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1399 inst->predicate = BRW_PREDICATE_NORMAL;
1400 break;
1401 }
1402 case ir_unop_find_lsb:
1403 emit(FBL(result_dst, op[0]));
1404 break;
1405
1406 case ir_unop_noise:
1407 assert(!"not reached: should be handled by lower_noise");
1408 break;
1409
1410 case ir_binop_add:
1411 emit(ADD(result_dst, op[0], op[1]));
1412 break;
1413 case ir_binop_sub:
1414 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1415 break;
1416
1417 case ir_binop_mul:
1418 if (brw->gen < 8 && ir->type->is_integer()) {
1419 /* For integer multiplication, the MUL uses the low 16 bits of one of
1420 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1421 * accumulates in the contribution of the upper 16 bits of that
1422 * operand. If we can determine that one of the args is in the low
1423 * 16 bits, though, we can just emit a single MUL.
1424 */
1425 if (ir->operands[0]->is_uint16_constant()) {
1426 if (brw->gen < 7)
1427 emit(MUL(result_dst, op[0], op[1]));
1428 else
1429 emit(MUL(result_dst, op[1], op[0]));
1430 } else if (ir->operands[1]->is_uint16_constant()) {
1431 if (brw->gen < 7)
1432 emit(MUL(result_dst, op[1], op[0]));
1433 else
1434 emit(MUL(result_dst, op[0], op[1]));
1435 } else {
1436 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1437
1438 emit(MUL(acc, op[0], op[1]));
1439 emit(MACH(dst_null_d(), op[0], op[1]));
1440 emit(MOV(result_dst, src_reg(acc)));
1441 }
1442 } else {
1443 emit(MUL(result_dst, op[0], op[1]));
1444 }
1445 break;
1446 case ir_binop_imul_high: {
1447 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1448
1449 emit(MUL(acc, op[0], op[1]));
1450 emit(MACH(result_dst, op[0], op[1]));
1451 break;
1452 }
1453 case ir_binop_div:
1454 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1455 assert(ir->type->is_integer());
1456 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1457 break;
1458 case ir_binop_carry: {
1459 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1460
1461 emit(ADDC(dst_null_ud(), op[0], op[1]));
1462 emit(MOV(result_dst, src_reg(acc)));
1463 break;
1464 }
1465 case ir_binop_borrow: {
1466 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1467
1468 emit(SUBB(dst_null_ud(), op[0], op[1]));
1469 emit(MOV(result_dst, src_reg(acc)));
1470 break;
1471 }
1472 case ir_binop_mod:
1473 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1474 assert(ir->type->is_integer());
1475 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1476 break;
1477
1478 case ir_binop_less:
1479 case ir_binop_greater:
1480 case ir_binop_lequal:
1481 case ir_binop_gequal:
1482 case ir_binop_equal:
1483 case ir_binop_nequal: {
1484 emit(CMP(result_dst, op[0], op[1],
1485 brw_conditional_for_comparison(ir->operation)));
1486 emit(AND(result_dst, result_src, src_reg(0x1)));
1487 break;
1488 }
1489
1490 case ir_binop_all_equal:
1491 /* "==" operator producing a scalar boolean. */
1492 if (ir->operands[0]->type->is_vector() ||
1493 ir->operands[1]->type->is_vector()) {
1494 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1495 emit(MOV(result_dst, src_reg(0)));
1496 inst = emit(MOV(result_dst, src_reg(1)));
1497 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1498 } else {
1499 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1500 emit(AND(result_dst, result_src, src_reg(0x1)));
1501 }
1502 break;
1503 case ir_binop_any_nequal:
1504 /* "!=" operator producing a scalar boolean. */
1505 if (ir->operands[0]->type->is_vector() ||
1506 ir->operands[1]->type->is_vector()) {
1507 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1508
1509 emit(MOV(result_dst, src_reg(0)));
1510 inst = emit(MOV(result_dst, src_reg(1)));
1511 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1512 } else {
1513 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1514 emit(AND(result_dst, result_src, src_reg(0x1)));
1515 }
1516 break;
1517
1518 case ir_unop_any:
1519 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1520 emit(MOV(result_dst, src_reg(0)));
1521
1522 inst = emit(MOV(result_dst, src_reg(1)));
1523 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1524 break;
1525
1526 case ir_binop_logic_xor:
1527 emit(XOR(result_dst, op[0], op[1]));
1528 break;
1529
1530 case ir_binop_logic_or:
1531 emit(OR(result_dst, op[0], op[1]));
1532 break;
1533
1534 case ir_binop_logic_and:
1535 emit(AND(result_dst, op[0], op[1]));
1536 break;
1537
1538 case ir_binop_dot:
1539 assert(ir->operands[0]->type->is_vector());
1540 assert(ir->operands[0]->type == ir->operands[1]->type);
1541 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1542 break;
1543
1544 case ir_unop_sqrt:
1545 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1546 break;
1547 case ir_unop_rsq:
1548 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1549 break;
1550
1551 case ir_unop_bitcast_i2f:
1552 case ir_unop_bitcast_u2f:
1553 this->result = op[0];
1554 this->result.type = BRW_REGISTER_TYPE_F;
1555 break;
1556
1557 case ir_unop_bitcast_f2i:
1558 this->result = op[0];
1559 this->result.type = BRW_REGISTER_TYPE_D;
1560 break;
1561
1562 case ir_unop_bitcast_f2u:
1563 this->result = op[0];
1564 this->result.type = BRW_REGISTER_TYPE_UD;
1565 break;
1566
1567 case ir_unop_i2f:
1568 case ir_unop_i2u:
1569 case ir_unop_u2i:
1570 case ir_unop_u2f:
1571 case ir_unop_b2f:
1572 case ir_unop_b2i:
1573 case ir_unop_f2i:
1574 case ir_unop_f2u:
1575 emit(MOV(result_dst, op[0]));
1576 break;
1577 case ir_unop_f2b:
1578 case ir_unop_i2b: {
1579 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1580 emit(AND(result_dst, result_src, src_reg(1)));
1581 break;
1582 }
1583
1584 case ir_unop_trunc:
1585 emit(RNDZ(result_dst, op[0]));
1586 break;
1587 case ir_unop_ceil:
1588 op[0].negate = !op[0].negate;
1589 inst = emit(RNDD(result_dst, op[0]));
1590 this->result.negate = true;
1591 break;
1592 case ir_unop_floor:
1593 inst = emit(RNDD(result_dst, op[0]));
1594 break;
1595 case ir_unop_fract:
1596 inst = emit(FRC(result_dst, op[0]));
1597 break;
1598 case ir_unop_round_even:
1599 emit(RNDE(result_dst, op[0]));
1600 break;
1601
1602 case ir_binop_min:
1603 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1604 break;
1605 case ir_binop_max:
1606 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1607 break;
1608
1609 case ir_binop_pow:
1610 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1611 break;
1612
1613 case ir_unop_bit_not:
1614 inst = emit(NOT(result_dst, op[0]));
1615 break;
1616 case ir_binop_bit_and:
1617 inst = emit(AND(result_dst, op[0], op[1]));
1618 break;
1619 case ir_binop_bit_xor:
1620 inst = emit(XOR(result_dst, op[0], op[1]));
1621 break;
1622 case ir_binop_bit_or:
1623 inst = emit(OR(result_dst, op[0], op[1]));
1624 break;
1625
1626 case ir_binop_lshift:
1627 inst = emit(SHL(result_dst, op[0], op[1]));
1628 break;
1629
1630 case ir_binop_rshift:
1631 if (ir->type->base_type == GLSL_TYPE_INT)
1632 inst = emit(ASR(result_dst, op[0], op[1]));
1633 else
1634 inst = emit(SHR(result_dst, op[0], op[1]));
1635 break;
1636
1637 case ir_binop_bfm:
1638 emit(BFI1(result_dst, op[0], op[1]));
1639 break;
1640
1641 case ir_binop_ubo_load: {
1642 ir_constant *uniform_block = ir->operands[0]->as_constant();
1643 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1644 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1645 src_reg offset;
1646
1647 /* Now, load the vector from that offset. */
1648 assert(ir->type->is_vector() || ir->type->is_scalar());
1649
1650 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1651 packed_consts.type = result.type;
1652 src_reg surf_index =
1653 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1654 if (const_offset_ir) {
1655 if (brw->gen >= 8) {
1656 /* Store the offset in a GRF so we can send-from-GRF. */
1657 offset = src_reg(this, glsl_type::int_type);
1658 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1659 } else {
1660 /* Immediates are fine on older generations since they'll be moved
1661 * to a (potentially fake) MRF at the generator level.
1662 */
1663 offset = src_reg(const_offset / 16);
1664 }
1665 } else {
1666 offset = src_reg(this, glsl_type::uint_type);
1667 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1668 }
1669
1670 if (brw->gen >= 7) {
1671 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1672 grf_offset.type = offset.type;
1673
1674 emit(MOV(grf_offset, offset));
1675
1676 emit(new(mem_ctx) vec4_instruction(this,
1677 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1678 dst_reg(packed_consts),
1679 surf_index,
1680 src_reg(grf_offset)));
1681 } else {
1682 vec4_instruction *pull =
1683 emit(new(mem_ctx) vec4_instruction(this,
1684 VS_OPCODE_PULL_CONSTANT_LOAD,
1685 dst_reg(packed_consts),
1686 surf_index,
1687 offset));
1688 pull->base_mrf = 14;
1689 pull->mlen = 1;
1690 }
1691
1692 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1693 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1694 const_offset % 16 / 4,
1695 const_offset % 16 / 4,
1696 const_offset % 16 / 4);
1697
1698 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1699 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1700 emit(CMP(result_dst, packed_consts, src_reg(0u),
1701 BRW_CONDITIONAL_NZ));
1702 emit(AND(result_dst, result, src_reg(0x1)));
1703 } else {
1704 emit(MOV(result_dst, packed_consts));
1705 }
1706 break;
1707 }
1708
1709 case ir_binop_vector_extract:
1710 assert(!"should have been lowered by vec_index_to_cond_assign");
1711 break;
1712
1713 case ir_triop_fma:
1714 op[0] = fix_3src_operand(op[0]);
1715 op[1] = fix_3src_operand(op[1]);
1716 op[2] = fix_3src_operand(op[2]);
1717 /* Note that the instruction's argument order is reversed from GLSL
1718 * and the IR.
1719 */
1720 emit(MAD(result_dst, op[2], op[1], op[0]));
1721 break;
1722
1723 case ir_triop_lrp:
1724 emit_lrp(result_dst, op[0], op[1], op[2]);
1725 break;
1726
1727 case ir_triop_csel:
1728 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1729 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1730 inst->predicate = BRW_PREDICATE_NORMAL;
1731 break;
1732
1733 case ir_triop_bfi:
1734 op[0] = fix_3src_operand(op[0]);
1735 op[1] = fix_3src_operand(op[1]);
1736 op[2] = fix_3src_operand(op[2]);
1737 emit(BFI2(result_dst, op[0], op[1], op[2]));
1738 break;
1739
1740 case ir_triop_bitfield_extract:
1741 op[0] = fix_3src_operand(op[0]);
1742 op[1] = fix_3src_operand(op[1]);
1743 op[2] = fix_3src_operand(op[2]);
1744 /* Note that the instruction's argument order is reversed from GLSL
1745 * and the IR.
1746 */
1747 emit(BFE(result_dst, op[2], op[1], op[0]));
1748 break;
1749
1750 case ir_triop_vector_insert:
1751 assert(!"should have been lowered by lower_vector_insert");
1752 break;
1753
1754 case ir_quadop_bitfield_insert:
1755 assert(!"not reached: should be handled by "
1756 "bitfield_insert_to_bfm_bfi\n");
1757 break;
1758
1759 case ir_quadop_vector:
1760 assert(!"not reached: should be handled by lower_quadop_vector");
1761 break;
1762
1763 case ir_unop_pack_half_2x16:
1764 emit_pack_half_2x16(result_dst, op[0]);
1765 break;
1766 case ir_unop_unpack_half_2x16:
1767 emit_unpack_half_2x16(result_dst, op[0]);
1768 break;
1769 case ir_unop_pack_snorm_2x16:
1770 case ir_unop_pack_snorm_4x8:
1771 case ir_unop_pack_unorm_2x16:
1772 case ir_unop_pack_unorm_4x8:
1773 case ir_unop_unpack_snorm_2x16:
1774 case ir_unop_unpack_snorm_4x8:
1775 case ir_unop_unpack_unorm_2x16:
1776 case ir_unop_unpack_unorm_4x8:
1777 assert(!"not reached: should be handled by lower_packing_builtins");
1778 break;
1779 case ir_unop_unpack_half_2x16_split_x:
1780 case ir_unop_unpack_half_2x16_split_y:
1781 case ir_binop_pack_half_2x16_split:
1782 assert(!"not reached: should not occur in vertex shader");
1783 break;
1784 case ir_binop_ldexp:
1785 assert(!"not reached: should be handled by ldexp_to_arith()");
1786 break;
1787 }
1788 }
1789
1790
1791 void
1792 vec4_visitor::visit(ir_swizzle *ir)
1793 {
1794 src_reg src;
1795 int i = 0;
1796 int swizzle[4];
1797
1798 /* Note that this is only swizzles in expressions, not those on the left
1799 * hand side of an assignment, which do write masking. See ir_assignment
1800 * for that.
1801 */
1802
1803 ir->val->accept(this);
1804 src = this->result;
1805 assert(src.file != BAD_FILE);
1806
1807 for (i = 0; i < ir->type->vector_elements; i++) {
1808 switch (i) {
1809 case 0:
1810 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1811 break;
1812 case 1:
1813 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1814 break;
1815 case 2:
1816 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1817 break;
1818 case 3:
1819 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1820 break;
1821 }
1822 }
1823 for (; i < 4; i++) {
1824 /* Replicate the last channel out. */
1825 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1826 }
1827
1828 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1829
1830 this->result = src;
1831 }
1832
1833 void
1834 vec4_visitor::visit(ir_dereference_variable *ir)
1835 {
1836 const struct glsl_type *type = ir->type;
1837 dst_reg *reg = variable_storage(ir->var);
1838
1839 if (!reg) {
1840 fail("Failed to find variable storage for %s\n", ir->var->name);
1841 this->result = src_reg(brw_null_reg());
1842 return;
1843 }
1844
1845 this->result = src_reg(*reg);
1846
1847 /* System values get their swizzle from the dst_reg writemask */
1848 if (ir->var->data.mode == ir_var_system_value)
1849 return;
1850
1851 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1852 this->result.swizzle = swizzle_for_size(type->vector_elements);
1853 }
1854
1855
1856 int
1857 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1858 {
1859 /* Under normal circumstances array elements are stored consecutively, so
1860 * the stride is equal to the size of the array element.
1861 */
1862 return type_size(ir->type);
1863 }
1864
1865
1866 void
1867 vec4_visitor::visit(ir_dereference_array *ir)
1868 {
1869 ir_constant *constant_index;
1870 src_reg src;
1871 int array_stride = compute_array_stride(ir);
1872
1873 constant_index = ir->array_index->constant_expression_value();
1874
1875 ir->array->accept(this);
1876 src = this->result;
1877
1878 if (constant_index) {
1879 src.reg_offset += constant_index->value.i[0] * array_stride;
1880 } else {
1881 /* Variable index array dereference. It eats the "vec4" of the
1882 * base of the array and an index that offsets the Mesa register
1883 * index.
1884 */
1885 ir->array_index->accept(this);
1886
1887 src_reg index_reg;
1888
1889 if (array_stride == 1) {
1890 index_reg = this->result;
1891 } else {
1892 index_reg = src_reg(this, glsl_type::int_type);
1893
1894 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1895 }
1896
1897 if (src.reladdr) {
1898 src_reg temp = src_reg(this, glsl_type::int_type);
1899
1900 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1901
1902 index_reg = temp;
1903 }
1904
1905 src.reladdr = ralloc(mem_ctx, src_reg);
1906 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1907 }
1908
1909 /* If the type is smaller than a vec4, replicate the last channel out. */
1910 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1911 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1912 else
1913 src.swizzle = BRW_SWIZZLE_NOOP;
1914 src.type = brw_type_for_base_type(ir->type);
1915
1916 this->result = src;
1917 }
1918
1919 void
1920 vec4_visitor::visit(ir_dereference_record *ir)
1921 {
1922 unsigned int i;
1923 const glsl_type *struct_type = ir->record->type;
1924 int offset = 0;
1925
1926 ir->record->accept(this);
1927
1928 for (i = 0; i < struct_type->length; i++) {
1929 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1930 break;
1931 offset += type_size(struct_type->fields.structure[i].type);
1932 }
1933
1934 /* If the type is smaller than a vec4, replicate the last channel out. */
1935 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1936 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1937 else
1938 this->result.swizzle = BRW_SWIZZLE_NOOP;
1939 this->result.type = brw_type_for_base_type(ir->type);
1940
1941 this->result.reg_offset += offset;
1942 }
1943
1944 /**
1945 * We want to be careful in assignment setup to hit the actual storage
1946 * instead of potentially using a temporary like we might with the
1947 * ir_dereference handler.
1948 */
1949 static dst_reg
1950 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1951 {
1952 /* The LHS must be a dereference. If the LHS is a variable-indexed array
1953 * access of a vector, it must be separated into a series of conditional
1954 * moves before reaching this point (see ir_vec_index_to_cond_assign).
1955 */
1956 assert(ir->as_dereference());
1957 ir_dereference_array *deref_array = ir->as_dereference_array();
1958 if (deref_array) {
1959 assert(!deref_array->array->type->is_vector());
1960 }
1961
1962 /* Use the rvalue deref handler for the most part. We'll ignore
1963 * swizzles in it and write swizzles using writemask, though.
1964 */
1965 ir->accept(v);
1966 return dst_reg(v->result);
1967 }
1968
1969 void
1970 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1971 const struct glsl_type *type, uint32_t predicate)
1972 {
1973 if (type->base_type == GLSL_TYPE_STRUCT) {
1974 for (unsigned int i = 0; i < type->length; i++) {
1975 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1976 }
1977 return;
1978 }
1979
1980 if (type->is_array()) {
1981 for (unsigned int i = 0; i < type->length; i++) {
1982 emit_block_move(dst, src, type->fields.array, predicate);
1983 }
1984 return;
1985 }
1986
1987 if (type->is_matrix()) {
1988 const struct glsl_type *vec_type;
1989
1990 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1991 type->vector_elements, 1);
1992
1993 for (int i = 0; i < type->matrix_columns; i++) {
1994 emit_block_move(dst, src, vec_type, predicate);
1995 }
1996 return;
1997 }
1998
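/* Base case: a scalar or vector is moved as a single (possibly partial)
 * vec4 below.
 */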
1999 assert(type->is_scalar() || type->is_vector());
2000
2001 dst->type = brw_type_for_base_type(type);
2002 src->type = dst->type;
2003
2004 dst->writemask = (1 << type->vector_elements) - 1;
2005
2006 src->swizzle = swizzle_for_size(type->vector_elements);
2007
2008 vec4_instruction *inst = emit(MOV(*dst, *src));
2009 inst->predicate = predicate;
2010
2011 dst->reg_offset++;
2012 src->reg_offset++;
2013 }
2014
2015
2016 /* If the RHS processing resulted in an instruction generating a
2017 * temporary value, and it would be easy to rewrite the instruction to
2018 * generate its result right into the LHS instead, do so. This ends
2019 * up reliably removing instructions where it can be tricky to do so
2020 * later without real UD chain information.
2021 */
2022 bool
2023 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2024 dst_reg dst,
2025 src_reg src,
2026 vec4_instruction *pre_rhs_inst,
2027 vec4_instruction *last_rhs_inst)
2028 {
2029 /* This could be supported, but it would take more smarts. */
2030 if (ir->condition)
2031 return false;
2032
2033 if (pre_rhs_inst == last_rhs_inst)
2034 return false; /* No instructions generated to work with. */
2035
2036 /* Make sure the last instruction generated our source reg. */
2037 if (src.file != GRF ||
2038 src.file != last_rhs_inst->dst.file ||
2039 src.reg != last_rhs_inst->dst.reg ||
2040 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2041 src.reladdr ||
2042 src.abs ||
2043 src.negate ||
2044 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2045 return false;
2046
2047 /* Check that the last instruction fully initialized the channels
2048 * we want to use, in the order we want to use them. We could
2049 * potentially reswizzle the operands of many instructions so that
2050 * we could handle out of order channels, but don't yet.
2051 */
2052
2053 for (unsigned i = 0; i < 4; i++) {
2054 if (dst.writemask & (1 << i)) {
2055 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2056 return false;
2057
2058 if (BRW_GET_SWZ(src.swizzle, i) != i)
2059 return false;
2060 }
2061 }
2062
2063 /* Success! Rewrite the instruction. */
2064 last_rhs_inst->dst.file = dst.file;
2065 last_rhs_inst->dst.reg = dst.reg;
2066 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2067 last_rhs_inst->dst.reladdr = dst.reladdr;
2068 last_rhs_inst->dst.writemask &= dst.writemask;
2069
2070 return true;
2071 }
2072
2073 void
2074 vec4_visitor::visit(ir_assignment *ir)
2075 {
2076 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2077 uint32_t predicate = BRW_PREDICATE_NONE;
2078
2079 if (!ir->lhs->type->is_scalar() &&
2080 !ir->lhs->type->is_vector()) {
2081 ir->rhs->accept(this);
2082 src_reg src = this->result;
2083
2084 if (ir->condition) {
2085 emit_bool_to_cond_code(ir->condition, &predicate);
2086 }
2087
2088 /* emit_block_move doesn't account for swizzles in the source register.
2089 * This should be ok, since the source register is a structure or an
2090 * array, and those can't be swizzled. But double-check to be sure.
2091 */
2092 assert(src.swizzle ==
2093 (ir->rhs->type->is_matrix()
2094 ? swizzle_for_size(ir->rhs->type->vector_elements)
2095 : BRW_SWIZZLE_NOOP));
2096
2097 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2098 return;
2099 }
2100
2101 /* Now we're down to just a scalar/vector with writemasks. */
2102 int i;
2103
2104 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2105 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2106
2107 ir->rhs->accept(this);
2108
2109 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2110
2111 src_reg src = this->result;
2112
2113 int swizzles[4];
2114 int first_enabled_chan = 0;
2115 int src_chan = 0;
2116
2117 assert(ir->lhs->type->is_vector() ||
2118 ir->lhs->type->is_scalar());
2119 dst.writemask = ir->write_mask;
2120
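/* Find the swizzle channel that feeds the first enabled writemask channel;
 * unwritten channels below reuse it so we never reference an RHS channel
 * that was never populated.
 */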
2121 for (int i = 0; i < 4; i++) {
2122 if (dst.writemask & (1 << i)) {
2123 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2124 break;
2125 }
2126 }
2127
2128 /* Swizzle a small RHS vector into the channels being written.
2129 *
2130 * glsl ir treats write_mask as dictating how many channels are
2131 * present on the RHS, while in our instructions we need to make
2132 * those channels appear in the slots of the vec4 they're written to.
2133 */
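/* For example, `lhs.zw = rhs` with a vec2 RHS (swizzle .xyyy) yields the
 * swizzle .yyxy, so the z and w slots read the RHS's first and second
 * components.
 */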
2134 for (int i = 0; i < 4; i++) {
2135 if (dst.writemask & (1 << i))
2136 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2137 else
2138 swizzles[i] = first_enabled_chan;
2139 }
2140 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2141 swizzles[2], swizzles[3]);
2142
2143 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2144 return;
2145 }
2146
2147 if (ir->condition) {
2148 emit_bool_to_cond_code(ir->condition, &predicate);
2149 }
2150
2151 for (i = 0; i < type_size(ir->lhs->type); i++) {
2152 vec4_instruction *inst = emit(MOV(dst, src));
2153 inst->predicate = predicate;
2154
2155 dst.reg_offset++;
2156 src.reg_offset++;
2157 }
2158 }
2159
2160 void
2161 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2162 {
2163 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2164 foreach_in_list(ir_constant, field_value, &ir->components) {
2165 emit_constant_values(dst, field_value);
2166 }
2167 return;
2168 }
2169
2170 if (ir->type->is_array()) {
2171 for (unsigned int i = 0; i < ir->type->length; i++) {
2172 emit_constant_values(dst, ir->array_elements[i]);
2173 }
2174 return;
2175 }
2176
2177 if (ir->type->is_matrix()) {
2178 for (int i = 0; i < ir->type->matrix_columns; i++) {
2179 float *vec = &ir->value.f[i * ir->type->vector_elements];
2180
2181 for (int j = 0; j < ir->type->vector_elements; j++) {
2182 dst->writemask = 1 << j;
2183 dst->type = BRW_REGISTER_TYPE_F;
2184
2185 emit(MOV(*dst, src_reg(vec[j])));
2186 }
2187 dst->reg_offset++;
2188 }
2189 return;
2190 }
2191
2192 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2193
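/* remaining_writemask tracks which channels still need a MOV; channels
 * coalesced into an earlier write are cleared from it below.
 */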
2194 for (int i = 0; i < ir->type->vector_elements; i++) {
2195 if (!(remaining_writemask & (1 << i)))
2196 continue;
2197
2198 dst->writemask = 1 << i;
2199 dst->type = brw_type_for_base_type(ir->type);
2200
2201 /* Find other components that match the one we're about to
2202 * write. Emits fewer instructions for things like vec4(0.5,
2203 * 1.5, 1.5, 1.5).
2204 */
2205 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2206 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2207 if (ir->value.b[i] == ir->value.b[j])
2208 dst->writemask |= (1 << j);
2209 } else {
2210 /* u, i, and f storage all line up, so no need for a
2211 * switch case for comparing each type.
2212 */
2213 if (ir->value.u[i] == ir->value.u[j])
2214 dst->writemask |= (1 << j);
2215 }
2216 }
2217
2218 switch (ir->type->base_type) {
2219 case GLSL_TYPE_FLOAT:
2220 emit(MOV(*dst, src_reg(ir->value.f[i])));
2221 break;
2222 case GLSL_TYPE_INT:
2223 emit(MOV(*dst, src_reg(ir->value.i[i])));
2224 break;
2225 case GLSL_TYPE_UINT:
2226 emit(MOV(*dst, src_reg(ir->value.u[i])));
2227 break;
2228 case GLSL_TYPE_BOOL:
2229 emit(MOV(*dst, src_reg(ir->value.b[i])));
2230 break;
2231 default:
2232 assert(!"Non-float/uint/int/bool constant");
2233 break;
2234 }
2235
2236 remaining_writemask &= ~dst->writemask;
2237 }
2238 dst->reg_offset++;
2239 }
2240
2241 void
2242 vec4_visitor::visit(ir_constant *ir)
2243 {
2244 dst_reg dst = dst_reg(this, ir->type);
2245 this->result = src_reg(dst);
2246
2247 emit_constant_values(&dst, ir);
2248 }
2249
2250 void
2251 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2252 {
2253 ir_dereference *deref = static_cast<ir_dereference *>(
2254 ir->actual_parameters.get_head());
2255 ir_variable *location = deref->variable_referenced();
2256 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2257 location->data.atomic.buffer_index);
2258
2259 /* Calculate the surface offset */
2260 src_reg offset(this, glsl_type::uint_type);
2261 ir_dereference_array *deref_array = deref->as_dereference_array();
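/* For an array of counters the offset is array_index * ATOMIC_COUNTER_SIZE
 * plus the counter's declared offset within the buffer; a plain counter
 * just uses its declared offset.
 */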
2262 if (deref_array) {
2263 deref_array->array_index->accept(this);
2264
2265 src_reg tmp(this, glsl_type::uint_type);
2266 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2267 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2268 } else {
2269 offset = location->data.atomic.offset;
2270 }
2271
2272 /* Emit the appropriate machine instruction */
2273 const char *callee = ir->callee->function_name();
2274 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2275
2276 if (!strcmp("__intrinsic_atomic_read", callee)) {
2277 emit_untyped_surface_read(surf_index, dst, offset);
2278
2279 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2280 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2281 src_reg(), src_reg());
2282
2283 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2284 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2285 src_reg(), src_reg());
2286 }
2287 }
2288
2289 void
2290 vec4_visitor::visit(ir_call *ir)
2291 {
2292 const char *callee = ir->callee->function_name();
2293
2294 if (!strcmp("__intrinsic_atomic_read", callee) ||
2295 !strcmp("__intrinsic_atomic_increment", callee) ||
2296 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2297 visit_atomic_counter_intrinsic(ir);
2298 } else {
2299 assert(!"Unsupported intrinsic.");
2300 }
2301 }
2302
2303 src_reg
2304 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2305 {
2306 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2307 inst->base_mrf = 2;
2308 inst->mlen = 1;
2309 inst->sampler = sampler;
2310 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2311 inst->dst.writemask = WRITEMASK_XYZW;
2312
2313 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2314 int param_base = inst->base_mrf;
2315 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2316 int zero_mask = 0xf & ~coord_mask;
2317
2318 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2319 coordinate));
2320
2321 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2322 src_reg(0)));
2323
2324 emit(inst);
2325 return src_reg(inst->dst);
2326 }
2327
2328 void
2329 vec4_visitor::visit(ir_texture *ir)
2330 {
2331 int sampler =
2332 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2333
2334 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2335 * emitting anything other than setting up the constant result.
2336 */
2337 if (ir->op == ir_tg4) {
2338 ir_constant *chan = ir->lod_info.component->as_constant();
2339 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2340 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2341 dst_reg result(this, ir->type);
2342 this->result = src_reg(result);
2343 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2344 return;
2345 }
2346 }
2347
2348 /* Should be lowered by do_lower_texture_projection */
2349 assert(!ir->projector);
2350
2351 /* Should be lowered */
2352 assert(!ir->offset || !ir->offset->type->is_array());
2353
2354 /* Generate code to compute all the subexpression trees. This has to be
2355 * done before loading any values into MRFs for the sampler message since
2356 * generating these values may involve SEND messages that need the MRFs.
2357 */
2358 src_reg coordinate;
2359 if (ir->coordinate) {
2360 ir->coordinate->accept(this);
2361 coordinate = this->result;
2362 }
2363
2364 src_reg shadow_comparitor;
2365 if (ir->shadow_comparitor) {
2366 ir->shadow_comparitor->accept(this);
2367 shadow_comparitor = this->result;
2368 }
2369
2370 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2371 src_reg offset_value;
2372 if (has_nonconstant_offset) {
2373 ir->offset->accept(this);
2374 offset_value = src_reg(this->result);
2375 }
2376
2377 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2378 src_reg lod, dPdx, dPdy, sample_index, mcs;
2379 switch (ir->op) {
2380 case ir_tex:
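/* Vertex shaders have no implicit derivatives, so an ordinary texture()
 * samples at LOD 0; the opcode switch below maps ir_tex to the TXL message.
 */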
2381 lod = src_reg(0.0f);
2382 lod_type = glsl_type::float_type;
2383 break;
2384 case ir_txf:
2385 case ir_txl:
2386 case ir_txs:
2387 ir->lod_info.lod->accept(this);
2388 lod = this->result;
2389 lod_type = ir->lod_info.lod->type;
2390 break;
2391 case ir_query_levels:
2392 lod = src_reg(0);
2393 lod_type = glsl_type::int_type;
2394 break;
2395 case ir_txf_ms:
2396 ir->lod_info.sample_index->accept(this);
2397 sample_index = this->result;
2398 sample_index_type = ir->lod_info.sample_index->type;
2399
2400 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2401 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2402 else
2403 mcs = src_reg(0u);
2404 break;
2405 case ir_txd:
2406 ir->lod_info.grad.dPdx->accept(this);
2407 dPdx = this->result;
2408
2409 ir->lod_info.grad.dPdy->accept(this);
2410 dPdy = this->result;
2411
2412 lod_type = ir->lod_info.grad.dPdx->type;
2413 break;
2414 case ir_txb:
2415 case ir_lod:
2416 case ir_tg4:
2417 break;
2418 }
2419
2420 vec4_instruction *inst = NULL;
2421 switch (ir->op) {
2422 case ir_tex:
2423 case ir_txl:
2424 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2425 break;
2426 case ir_txd:
2427 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2428 break;
2429 case ir_txf:
2430 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2431 break;
2432 case ir_txf_ms:
2433 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2434 break;
2435 case ir_txs:
2436 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2437 break;
2438 case ir_tg4:
2439 if (has_nonconstant_offset)
2440 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2441 else
2442 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2443 break;
2444 case ir_query_levels:
2445 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2446 break;
2447 case ir_txb:
2448 assert(!"TXB is not valid for vertex shaders.");
2449 break;
2450 case ir_lod:
2451 assert(!"LOD is not valid for vertex shaders.");
2452 break;
2453 default:
2454 assert(!"Unrecognized tex op");
2455 }
2456
2457 if (ir->offset != NULL && ir->op != ir_txf)
2458 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2459
2460 /* Stuff the channel select bits in the top of the texture offset */
2461 if (ir->op == ir_tg4)
2462 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2463
2464 /* The message header is necessary for:
2465 * - Gen4 (always)
2466 * - Texel offsets
2467 * - Gather channel selection
2468 * - Sampler indices too large to fit in a 4-bit value.
2469 */
2470 inst->header_present =
2471 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2472 sampler >= 16;
2473 inst->base_mrf = 2;
2474 inst->mlen = inst->header_present + 1; /* always at least one */
2475 inst->sampler = sampler;
2476 inst->dst = dst_reg(this, ir->type);
2477 inst->dst.writemask = WRITEMASK_XYZW;
2478 inst->shadow_compare = ir->shadow_comparitor != NULL;
2479
2480 /* MRF for the first parameter */
2481 int param_base = inst->base_mrf + inst->header_present;
2482
2483 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2484 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2485 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2486 } else {
2487 /* Load the coordinate */
2488 /* FINISHME: gl_clamp_mask and saturate */
2489 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2490 int zero_mask = 0xf & ~coord_mask;
2491
2492 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2493 coordinate));
2494
2495 if (zero_mask != 0) {
2496 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2497 src_reg(0)));
2498 }
2499 /* Load the shadow comparitor */
2500 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2501 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2502 WRITEMASK_X),
2503 shadow_comparitor));
2504 inst->mlen++;
2505 }
2506
2507 /* Load the LOD info */
2508 if (ir->op == ir_tex || ir->op == ir_txl) {
2509 int mrf, writemask;
2510 if (brw->gen >= 5) {
2511 mrf = param_base + 1;
2512 if (ir->shadow_comparitor) {
2513 writemask = WRITEMASK_Y;
2514 /* mlen already incremented */
2515 } else {
2516 writemask = WRITEMASK_X;
2517 inst->mlen++;
2518 }
2519 } else /* brw->gen == 4 */ {
2520 mrf = param_base;
2521 writemask = WRITEMASK_W;
2522 }
2523 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2524 } else if (ir->op == ir_txf) {
2525 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2526 } else if (ir->op == ir_txf_ms) {
2527 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2528 sample_index));
2529 if (brw->gen >= 7)
2530 /* MCS data is in the first channel of `mcs`; replicate .x across the whole
2531 * vec4 so the MOV below can mask it into the .y channel of the second
2532 * param vec4. Only this swizzle is gen7-specific: the MOV itself is not
2533 * under the if above, and on older gens `mcs` is simply 0u. */
2534 mcs.swizzle = BRW_SWIZZLE_XXXX;
2535 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2536 mcs));
2537 inst->mlen++;
2538 } else if (ir->op == ir_txd) {
2539 const glsl_type *type = lod_type;
2540
2541 if (brw->gen >= 5) {
2542 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2543 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2544 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2545 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2546 inst->mlen++;
2547
2548 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2549 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2550 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2551 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2552 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2553 inst->mlen++;
2554
2555 if (ir->shadow_comparitor) {
2556 emit(MOV(dst_reg(MRF, param_base + 2,
2557 ir->shadow_comparitor->type, WRITEMASK_Z),
2558 shadow_comparitor));
2559 }
2560 }
2561 } else /* brw->gen == 4 */ {
2562 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2563 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2564 inst->mlen += 2;
2565 }
2566 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2567 if (ir->shadow_comparitor) {
2568 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2569 shadow_comparitor));
2570 }
2571
2572 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2573 offset_value));
2574 inst->mlen++;
2575 }
2576 }
2577
2578 emit(inst);
2579
2580 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2581 * spec requires layers.
2582 */
2583 if (ir->op == ir_txs) {
2584 glsl_type const *type = ir->sampler->type;
2585 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2586 type->sampler_array) {
2587 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2588 writemask(inst->dst, WRITEMASK_Z),
2589 src_reg(inst->dst), src_reg(6));
2590 }
2591 }
2592
2593 if (brw->gen == 6 && ir->op == ir_tg4) {
2594 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2595 }
2596
2597 swizzle_result(ir, src_reg(inst->dst), sampler);
2598 }
2599
2600 /**
2601 * Apply workarounds for Gen6 gather with UINT/SINT
2602 */
2603 void
2604 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2605 {
2606 if (!wa)
2607 return;
2608
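/* The workaround bits encode the channel width (8 or 16 bits) and whether
 * the fetched value needs to be sign-extended afterwards.
 */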
2609 int width = (wa & WA_8BIT) ? 8 : 16;
2610 dst_reg dst_f = dst;
2611 dst_f.type = BRW_REGISTER_TYPE_F;
2612
2613 /* Convert from UNORM to UINT */
2614 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2615 emit(MOV(dst, src_reg(dst_f)));
2616
2617 if (wa & WA_SIGN) {
2618 /* Reinterpret the UINT value as a signed INT value by
2619 * shifting the sign bit into place, then shifting back
2620 * preserving sign.
2621 */
2622 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2623 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2624 }
2625 }
2626
2627 /**
2628 * Set up the gather channel based on the swizzle, for gather4.
2629 */
2630 uint32_t
2631 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2632 {
2633 ir_constant *chan = ir->lod_info.component->as_constant();
2634 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2635 switch (swiz) {
2636 case SWIZZLE_X: return 0;
2637 case SWIZZLE_Y:
2638 /* gather4 sampler is broken for green channel on RG32F --
2639 * we must ask for blue instead.
2640 */
2641 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2642 return 2;
2643 return 1;
2644 case SWIZZLE_Z: return 2;
2645 case SWIZZLE_W: return 3;
2646 default:
2647 assert(!"Not reached"); /* zero, one swizzles handled already */
2648 return 0;
2649 }
2650 }
2651
2652 void
2653 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2654 {
2655 int s = key->tex.swizzles[sampler];
2656
2657 this->result = src_reg(this, ir->type);
2658 dst_reg swizzled_result(this->result);
2659
2660 if (ir->op == ir_query_levels) {
2661 /* # levels is in .w */
2662 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2663 emit(MOV(swizzled_result, orig_val));
2664 return;
2665 }
2666
2667 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2668 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2669 emit(MOV(swizzled_result, orig_val));
2670 return;
2671 }
2672
2673
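/* Split the API swizzle into channels copied from the texture result and
 * channels forced to constant 0.0 or 1.0, then emit at most one MOV for
 * each group below.
 */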
2674 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2675 int swizzle[4] = {0};
2676
2677 for (int i = 0; i < 4; i++) {
2678 switch (GET_SWZ(s, i)) {
2679 case SWIZZLE_ZERO:
2680 zero_mask |= (1 << i);
2681 break;
2682 case SWIZZLE_ONE:
2683 one_mask |= (1 << i);
2684 break;
2685 default:
2686 copy_mask |= (1 << i);
2687 swizzle[i] = GET_SWZ(s, i);
2688 break;
2689 }
2690 }
2691
2692 if (copy_mask) {
2693 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2694 swizzled_result.writemask = copy_mask;
2695 emit(MOV(swizzled_result, orig_val));
2696 }
2697
2698 if (zero_mask) {
2699 swizzled_result.writemask = zero_mask;
2700 emit(MOV(swizzled_result, src_reg(0.0f)));
2701 }
2702
2703 if (one_mask) {
2704 swizzled_result.writemask = one_mask;
2705 emit(MOV(swizzled_result, src_reg(1.0f)));
2706 }
2707 }
2708
2709 void
2710 vec4_visitor::visit(ir_return *)
2711 {
2712 assert(!"not reached");
2713 }
2714
2715 void
2716 vec4_visitor::visit(ir_discard *)
2717 {
2718 assert(!"not reached");
2719 }
2720
2721 void
2722 vec4_visitor::visit(ir_if *ir)
2723 {
2724 /* Don't point the annotation at the if statement, because then it plus
2725 * the then and else blocks get printed.
2726 */
2727 this->base_ir = ir->condition;
2728
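/* Gen6's IF instruction can evaluate a comparison directly, so
 * emit_if_gen6() folds the condition into the IF instead of materializing
 * it into a predicate first.
 */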
2729 if (brw->gen == 6) {
2730 emit_if_gen6(ir);
2731 } else {
2732 uint32_t predicate;
2733 emit_bool_to_cond_code(ir->condition, &predicate);
2734 emit(IF(predicate));
2735 }
2736
2737 visit_instructions(&ir->then_instructions);
2738
2739 if (!ir->else_instructions.is_empty()) {
2740 this->base_ir = ir->condition;
2741 emit(BRW_OPCODE_ELSE);
2742
2743 visit_instructions(&ir->else_instructions);
2744 }
2745
2746 this->base_ir = ir->condition;
2747 emit(BRW_OPCODE_ENDIF);
2748 }
2749
2750 void
2751 vec4_visitor::visit(ir_emit_vertex *)
2752 {
2753 assert(!"not reached");
2754 }
2755
2756 void
2757 vec4_visitor::visit(ir_end_primitive *)
2758 {
2759 assert(!"not reached");
2760 }
2761
2762 void
2763 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2764 dst_reg dst, src_reg offset,
2765 src_reg src0, src_reg src1)
2766 {
2767 unsigned mlen = 0;
2768
2769 /* Set the atomic operation offset. */
2770 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2771 mlen++;
2772
2773 /* Set the atomic operation arguments. */
2774 if (src0.file != BAD_FILE) {
2775 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2776 mlen++;
2777 }
2778
2779 if (src1.file != BAD_FILE) {
2780 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2781 mlen++;
2782 }
2783
2784 /* Emit the instruction. Note that this maps to the normal SIMD8
2785 * untyped atomic message on Ivy Bridge, but that's OK because
2786 * unused channels will be masked out.
2787 */
2788 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2789 src_reg(atomic_op), src_reg(surf_index));
2790 inst->base_mrf = 0;
2791 inst->mlen = mlen;
2792 }
2793
2794 void
2795 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2796 src_reg offset)
2797 {
2798 /* Set the surface read offset. */
2799 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2800
2801 /* Emit the instruction. Note that this maps to the normal SIMD8
2802 * untyped surface read message, but that's OK because unused
2803 * channels will be masked out.
2804 */
2805 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2806 dst, src_reg(surf_index));
2807 inst->base_mrf = 0;
2808 inst->mlen = 1;
2809 }
2810
2811 void
2812 vec4_visitor::emit_ndc_computation()
2813 {
2814 /* Get the position */
2815 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2816
2817 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2818 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2819 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2820
2821 current_annotation = "NDC";
2822 dst_reg ndc_w = ndc;
2823 ndc_w.writemask = WRITEMASK_W;
2824 src_reg pos_w = pos;
2825 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2826 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2827
2828 dst_reg ndc_xyz = ndc;
2829 ndc_xyz.writemask = WRITEMASK_XYZ;
2830
2831 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2832 }
2833
2834 void
2835 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2836 {
2837 if (brw->gen < 6 &&
2838 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2839 key->userclip_active || brw->has_negative_rhw_bug)) {
2840 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2841 dst_reg header1_w = header1;
2842 header1_w.writemask = WRITEMASK_W;
2843
2844 emit(MOV(header1, 0u));
2845
2846 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2847 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2848
2849 current_annotation = "Point size";
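/* Scale the float point size into a fixed-point value (the MUL result is
 * converted to an integer when written to the UD destination) and mask it
 * into bits 8..18 of the header word, where the point-width field lives.
 */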
2850 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2851 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2852 }
2853
2854 if (key->userclip_active) {
2855 current_annotation = "Clipping flags";
2856 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2857 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2858
2859 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2860 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2861 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2862
2863 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2864 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2865 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2866 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2867 }
2868
2869 /* i965 clipping workaround:
2870 * 1) Test for -ve rhw
2871 * 2) If set,
2872 * set ndc = (0,0,0,0)
2873 * set ucp[6] = 1
2874 *
2875 * Later, clipping will detect ucp[6] and ensure the primitive is
2876 * clipped against all fixed planes.
2877 */
2878 if (brw->has_negative_rhw_bug) {
2879 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2880 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2881 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2882 vec4_instruction *inst;
2883 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2884 inst->predicate = BRW_PREDICATE_NORMAL;
2885 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2886 inst->predicate = BRW_PREDICATE_NORMAL;
2887 }
2888
2889 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2890 } else if (brw->gen < 6) {
2891 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2892 } else {
2893 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2894 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2895 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2896 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2897 }
2898 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2899 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2900 src_reg(output_reg[VARYING_SLOT_LAYER])));
2901 }
2902 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2903 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2904 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2905 }
2906 }
2907 }
2908
2909 void
2910 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2911 {
2912 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2913 *
2914 * "If a linked set of shaders forming the vertex stage contains no
2915 * static write to gl_ClipVertex or gl_ClipDistance, but the
2916 * application has requested clipping against user clip planes through
2917 * the API, then the coordinate written to gl_Position is used for
2918 * comparison against the user clip planes."
2919 *
2920 * This function is only called if the shader didn't write to
2921 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2922 * if the user wrote to it; otherwise we use gl_Position.
2923 */
2924 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2925 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2926 clip_vertex = VARYING_SLOT_POS;
2927 }
2928
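/* Each enabled channel gets DP4(clip_vertex, userplane[i + offset]),
 * producing the clip distance against user plane i + offset.
 */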
2929 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2930 ++i) {
2931 reg.writemask = 1 << i;
2932 emit(DP4(reg,
2933 src_reg(output_reg[clip_vertex]),
2934 src_reg(this->userplane[i + offset])));
2935 }
2936 }
2937
2938 void
2939 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2940 {
2941 assert (varying < VARYING_SLOT_MAX);
2942 reg.type = output_reg[varying].type;
2943 current_annotation = output_reg_annotation[varying];
2944 /* Copy the register, saturating if necessary */
2945 vec4_instruction *inst = emit(MOV(reg,
2946 src_reg(output_reg[varying])));
2947 if ((varying == VARYING_SLOT_COL0 ||
2948 varying == VARYING_SLOT_COL1 ||
2949 varying == VARYING_SLOT_BFC0 ||
2950 varying == VARYING_SLOT_BFC1) &&
2951 key->clamp_vertex_color) {
2952 inst->saturate = true;
2953 }
2954 }
2955
2956 void
2957 vec4_visitor::emit_urb_slot(int mrf, int varying)
2958 {
2959 struct brw_reg hw_reg = brw_message_reg(mrf);
2960 dst_reg reg = dst_reg(MRF, mrf);
2961 reg.type = BRW_REGISTER_TYPE_F;
2962
2963 switch (varying) {
2964 case VARYING_SLOT_PSIZ:
2965 /* PSIZ is always in slot 0, and is coupled with other flags. */
2966 current_annotation = "indices, point width, clip flags";
2967 emit_psiz_and_flags(hw_reg);
2968 break;
2969 case BRW_VARYING_SLOT_NDC:
2970 current_annotation = "NDC";
2971 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2972 break;
2973 case VARYING_SLOT_POS:
2974 current_annotation = "gl_Position";
2975 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2976 break;
2977 case VARYING_SLOT_EDGE:
2978 /* This is present when doing unfilled polygons. We're supposed to copy
2979 * the edge flag from the user-provided vertex array
2980 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2981 * of that attribute (starts as 1.0f). This is then used in clipping to
2982 * determine which edges should be drawn as wireframe.
2983 */
2984 current_annotation = "edge flag";
2985 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2986 glsl_type::float_type, WRITEMASK_XYZW))));
2987 break;
2988 case BRW_VARYING_SLOT_PAD:
2989 /* No need to write to this slot */
2990 break;
2991 default:
2992 emit_generic_urb_slot(reg, varying);
2993 break;
2994 }
2995 }
2996
2997 static int
2998 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2999 {
3000 if (brw->gen >= 6) {
3001 /* URB data written (does not include the message header reg) must
3002 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3003 * section 5.4.3.2.2: URB_INTERLEAVED.
3004 *
3005 * URB entries are allocated on a multiple of 1024 bits, so an
3006 * extra 128 bits written here to make the end align to 256 is
3007 * no problem.
3008 */
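/* mlen counts the header register too, so forcing mlen odd keeps the data
 * portion even.
 */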
3009 if ((mlen % 2) != 1)
3010 mlen++;
3011 }
3012
3013 return mlen;
3014 }
3015
3016
3017 /**
3018 * Generates the VUE payload plus the necessary URB write instructions to
3019 * output it.
3020 *
3021 * The VUE layout is documented in Volume 2a.
3022 */
3023 void
3024 vec4_visitor::emit_vertex()
3025 {
3026 /* MRF 0 is reserved for the debugger, so start with message header
3027 * in MRF 1.
3028 */
3029 int base_mrf = 1;
3030 int mrf = base_mrf;
3031 /* In the process of generating our URB write message contents, we
3032 * may need to unspill a register or load from an array. Those
3033 * reads would use MRFs 14-15.
3034 */
3035 int max_usable_mrf = 13;
3036
3037 /* The following assertion verifies that max_usable_mrf causes an
3038 * even number of URB write data registers, which will meet gen6's
3039 * requirements for length alignment.
3040 */
3041 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3042
3043 /* First mrf is the g0-based message header containing URB handles and
3044 * such.
3045 */
3046 emit_urb_write_header(mrf++);
3047
3048 if (brw->gen < 6) {
3049 emit_ndc_computation();
3050 }
3051
3052 /* Lower legacy ff and ClipVertex clipping to clip distances */
3053 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3054 current_annotation = "user clip distances";
3055
3056 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3057 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3058
3059 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3060 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3061 }
3062
3063 /* We may need to split this up into several URB writes, so do them in a
3064 * loop.
3065 */
3066 int slot = 0;
3067 bool complete = false;
3068 do {
3069 /* URB offset is in URB row increments, and each of our MRFs is half of
3070 * one of those, since we're doing interleaved writes.
3071 */
3072 int offset = slot / 2;
3073
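/* The message header stays in base_mrf; payload slots for this write start
 * in the following MRF.
 */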
3074 mrf = base_mrf + 1;
3075 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3076 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3077
3078 /* If this was max_usable_mrf, we can't fit anything more into this
3079 * URB WRITE.
3080 */
3081 if (mrf > max_usable_mrf) {
3082 slot++;
3083 break;
3084 }
3085 }
3086
3087 complete = slot >= prog_data->vue_map.num_slots;
3088 current_annotation = "URB write";
3089 vec4_instruction *inst = emit_urb_write_opcode(complete);
3090 inst->base_mrf = base_mrf;
3091 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3092 inst->offset += offset;
3093 } while (!complete);
3094 }
3095
3096
3097 src_reg
3098 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3099 src_reg *reladdr, int reg_offset)
3100 {
3101 /* Because we store the values to scratch interleaved like our
3102 * vertex data, we need to scale the vec4 index by 2.
3103 */
3104 int message_header_scale = 2;
3105
3106 /* Pre-gen6, the message header uses byte offsets instead of vec4
3107 * (16-byte) offset units.
3108 */
3109 if (brw->gen < 6)
3110 message_header_scale *= 16;
3111
3112 if (reladdr) {
3113 src_reg index = src_reg(this, glsl_type::int_type);
3114
3115 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3116 emit_before(inst, MUL(dst_reg(index),
3117 index, src_reg(message_header_scale)));
3118
3119 return index;
3120 } else {
3121 return src_reg(reg_offset * message_header_scale);
3122 }
3123 }
3124
3125 src_reg
3126 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3127 src_reg *reladdr, int reg_offset)
3128 {
3129 if (reladdr) {
3130 src_reg index = src_reg(this, glsl_type::int_type);
3131
3132 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3133
3134 /* Pre-gen6, the message header uses byte offsets instead of vec4
3135 * (16-byte) offset units.
3136 */
3137 if (brw->gen < 6) {
3138 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3139 }
3140
3141 return index;
3142 } else if (brw->gen >= 8) {
3143 /* Store the offset in a GRF so we can send-from-GRF. */
3144 src_reg offset = src_reg(this, glsl_type::int_type);
3145 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3146 return offset;
3147 } else {
3148 int message_header_scale = brw->gen < 6 ? 16 : 1;
3149 return src_reg(reg_offset * message_header_scale);
3150 }
3151 }
3152
3153 /**
3154 * Emits an instruction before @inst to load the value named by @orig_src
3155 * from scratch space at @base_offset to @temp.
3156 *
3157 * @base_offset is measured in 32-byte units (the size of a register).
3158 */
3159 void
3160 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3161 dst_reg temp, src_reg orig_src,
3162 int base_offset)
3163 {
3164 int reg_offset = base_offset + orig_src.reg_offset;
3165 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3166
3167 emit_before(inst, SCRATCH_READ(temp, index));
3168 }
3169
3170 /**
3171 * Emits an instruction after @inst to store the value to be written
3172 * to @orig_dst to scratch space at @base_offset, from @temp.
3173 *
3174 * @base_offset is measured in 32-byte units (the size of a register).
3175 */
3176 void
3177 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3178 {
3179 int reg_offset = base_offset + inst->dst.reg_offset;
3180 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3181
3182 /* Create a temporary register to store *inst's result in.
3183 *
3184 * We have to be careful in MOVing from our temporary result register in
3185 * the scratch write. If we swizzle from channels of the temporary that
3186 * weren't initialized, it will confuse live interval analysis, which will
3187 * make spilling fail to make progress.
3188 */
3189 src_reg temp = src_reg(this, glsl_type::vec4_type);
3190 temp.type = inst->dst.type;
3191 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3192 int swizzles[4];
3193 for (int i = 0; i < 4; i++)
3194 if (inst->dst.writemask & (1 << i))
3195 swizzles[i] = i;
3196 else
3197 swizzles[i] = first_writemask_chan;
3198 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3199 swizzles[2], swizzles[3]);
3200
3201 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3202 inst->dst.writemask));
3203 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3204 write->predicate = inst->predicate;
3205 write->ir = inst->ir;
3206 write->annotation = inst->annotation;
3207 inst->insert_after(write);
3208
3209 inst->dst.file = temp.file;
3210 inst->dst.reg = temp.reg;
3211 inst->dst.reg_offset = temp.reg_offset;
3212 inst->dst.reladdr = NULL;
3213 }
3214
3215 /**
3216 * We can't generally support array access in GRF space, because a
3217 * single instruction's destination can only span 2 contiguous
3218 * registers. So, we send all GRF arrays that get variable index
3219 * access to scratch space.
3220 */
3221 void
3222 vec4_visitor::move_grf_array_access_to_scratch()
3223 {
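/* scratch_loc[i] is the scratch offset (in registers) assigned to virtual
 * GRF i, or -1 if it stays in the GRF file.
 */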
3224 int scratch_loc[this->virtual_grf_count];
3225
3226 for (int i = 0; i < this->virtual_grf_count; i++) {
3227 scratch_loc[i] = -1;
3228 }
3229
3230 /* First, calculate the set of virtual GRFs that need to be punted
3231 * to scratch due to having any array access on them, and where in
3232 * scratch.
3233 */
3234 foreach_in_list(vec4_instruction, inst, &instructions) {
3235 if (inst->dst.file == GRF && inst->dst.reladdr &&
3236 scratch_loc[inst->dst.reg] == -1) {
3237 scratch_loc[inst->dst.reg] = c->last_scratch;
3238 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3239 }
3240
3241 for (int i = 0 ; i < 3; i++) {
3242 src_reg *src = &inst->src[i];
3243
3244 if (src->file == GRF && src->reladdr &&
3245 scratch_loc[src->reg] == -1) {
3246 scratch_loc[src->reg] = c->last_scratch;
3247 c->last_scratch += this->virtual_grf_sizes[src->reg];
3248 }
3249 }
3250 }
3251
3252 /* Now, for anything that will be accessed through scratch, rewrite
3253 * it to load/store. Note that this is a _safe list walk, because
3254 * we may generate a new scratch_write instruction after the one
3255 * we're processing.
3256 */
3257 foreach_list_safe(node, &this->instructions) {
3258 vec4_instruction *inst = (vec4_instruction *)node;
3259
3260 /* Set up the annotation tracking for new generated instructions. */
3261 base_ir = inst->ir;
3262 current_annotation = inst->annotation;
3263
3264 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3265 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3266 }
3267
3268 for (int i = 0 ; i < 3; i++) {
3269 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3270 continue;
3271
3272 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3273
3274 emit_scratch_read(inst, temp, inst->src[i],
3275 scratch_loc[inst->src[i].reg]);
3276
3277 inst->src[i].file = temp.file;
3278 inst->src[i].reg = temp.reg;
3279 inst->src[i].reg_offset = temp.reg_offset;
3280 inst->src[i].reladdr = NULL;
3281 }
3282 }
3283 }
3284
3285 /**
3286 * Emits an instruction before @inst to load the value named by @orig_src
3287 * from the pull constant buffer (surface) at @base_offset to @temp.
3288 */
3289 void
3290 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3291 dst_reg temp, src_reg orig_src,
3292 int base_offset)
3293 {
3294 int reg_offset = base_offset + orig_src.reg_offset;
3295 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3296 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3297 vec4_instruction *load;
3298
3299 if (brw->gen >= 7) {
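/* The Gen7+ pull constant load is a send-from-GRF message, so the offset
 * has to be staged in a GRF rather than going out in an MRF payload.
 */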
3300 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3301 grf_offset.type = offset.type;
3302 emit_before(inst, MOV(grf_offset, offset));
3303
3304 load = new(mem_ctx) vec4_instruction(this,
3305 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3306 temp, index, src_reg(grf_offset));
3307 } else {
3308 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3309 temp, index, offset);
3310 load->base_mrf = 14;
3311 load->mlen = 1;
3312 }
3313 emit_before(inst, load);
3314 }
3315
3316 /**
3317 * Implements array access of uniforms by inserting a
3318 * PULL_CONSTANT_LOAD instruction.
3319 *
3320 * Unlike temporary GRF array access (where we don't support it due to
3321 * the difficulty of doing relative addressing on instruction
3322 * destinations), we could potentially do array access of uniforms
3323 * that were loaded in GRF space as push constants. In real-world
3324 * usage we've seen, though, the arrays being used are always larger
3325 * than we could load as push constants, so just always move all
3326 * uniform array access out to a pull constant buffer.
3327 */
3328 void
3329 vec4_visitor::move_uniform_array_access_to_pull_constants()
3330 {
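/* pull_constant_loc[i] is the pull-constant slot assigned to uniform i, or
 * -1 if that uniform hasn't been copied to the pull buffer yet.
 */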
3331 int pull_constant_loc[this->uniforms];
3332
3333 for (int i = 0; i < this->uniforms; i++) {
3334 pull_constant_loc[i] = -1;
3335 }
3336
3337 /* Walk through and find array access of uniforms. Put a copy of that
3338 * uniform in the pull constant buffer.
3339 *
3340 * Note that we don't move constant-indexed accesses to arrays. No
3341 * testing has been done of the performance impact of this choice.
3342 */
3343 foreach_list_safe(node, &this->instructions) {
3344 vec4_instruction *inst = (vec4_instruction *)node;
3345
3346 for (int i = 0 ; i < 3; i++) {
3347 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3348 continue;
3349
3350 int uniform = inst->src[i].reg;
3351
3352 /* If this array isn't already present in the pull constant buffer,
3353 * add it.
3354 */
3355 if (pull_constant_loc[uniform] == -1) {
3356 const float **values = &stage_prog_data->param[uniform * 4];
3357
3358 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3359
3360 assert(uniform < uniform_array_size);
3361 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3362 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3363 = values[j];
3364 }
3365 }
3366
3367 /* Set up the annotation tracking for new generated instructions. */
3368 base_ir = inst->ir;
3369 current_annotation = inst->annotation;
3370
3371 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3372
3373 emit_pull_constant_load(inst, temp, inst->src[i],
3374 pull_constant_loc[uniform]);
3375
3376 inst->src[i].file = temp.file;
3377 inst->src[i].reg = temp.reg;
3378 inst->src[i].reg_offset = temp.reg_offset;
3379 inst->src[i].reladdr = NULL;
3380 }
3381 }
3382
3383 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3384 * no need to track them as larger-than-vec4 objects. This will be
3385 * relied on in cutting out unused uniform vectors from push
3386 * constants.
3387 */
3388 split_uniform_registers();
3389 }
3390
3391 void
3392 vec4_visitor::resolve_ud_negate(src_reg *reg)
3393 {
3394 if (reg->type != BRW_REGISTER_TYPE_UD ||
3395 !reg->negate)
3396 return;
3397
3398 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3399 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3400 *reg = temp;
3401 }
3402
3403 vec4_visitor::vec4_visitor(struct brw_context *brw,
3404 struct brw_vec4_compile *c,
3405 struct gl_program *prog,
3406 const struct brw_vec4_prog_key *key,
3407 struct brw_vec4_prog_data *prog_data,
3408 struct gl_shader_program *shader_prog,
3409 gl_shader_stage stage,
3410 void *mem_ctx,
3411 bool debug_flag,
3412 bool no_spills,
3413 shader_time_shader_type st_base,
3414 shader_time_shader_type st_written,
3415 shader_time_shader_type st_reset)
3416 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3417 c(c),
3418 key(key),
3419 prog_data(prog_data),
3420 sanity_param_count(0),
3421 fail_msg(NULL),
3422 first_non_payload_grf(0),
3423 need_all_constants_in_pull_buffer(false),
3424 debug_flag(debug_flag),
3425 no_spills(no_spills),
3426 st_base(st_base),
3427 st_written(st_written),
3428 st_reset(st_reset)
3429 {
3430 this->mem_ctx = mem_ctx;
3431 this->failed = false;
3432
3433 this->base_ir = NULL;
3434 this->current_annotation = NULL;
3435 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3436
3437 this->variable_ht = hash_table_ctor(0,
3438 hash_table_pointer_hash,
3439 hash_table_pointer_compare);
3440
3441 this->virtual_grf_start = NULL;
3442 this->virtual_grf_end = NULL;
3443 this->virtual_grf_sizes = NULL;
3444 this->virtual_grf_count = 0;
3445 this->virtual_grf_reg_map = NULL;
3446 this->virtual_grf_reg_count = 0;
3447 this->virtual_grf_array_size = 0;
3448 this->live_intervals_valid = false;
3449
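/* Gen7+ has no real MRF registers; MRF numbers are emulated with high GRFs
 * (starting at GEN7_MRF_HACK_START), so those GRFs aren't available to the
 * allocator.
 */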
3450 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3451
3452 this->uniforms = 0;
3453
3454 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3455 * at least one. See setup_uniforms() in brw_vec4.cpp.
3456 */
3457 this->uniform_array_size = 1;
3458 if (prog_data) {
3459 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3460 }
3461
3462 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3463 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3464 }
3465
3466 vec4_visitor::~vec4_visitor()
3467 {
3468 hash_table_dtor(this->variable_ht);
3469 }
3470
3471
3472 void
3473 vec4_visitor::fail(const char *format, ...)
3474 {
3475 va_list va;
3476 char *msg;
3477
3478 if (failed)
3479 return;
3480
3481 failed = true;
3482
3483 va_start(va, format);
3484 msg = ralloc_vasprintf(mem_ctx, format, va);
3485 va_end(va);
3486 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3487
3488 this->fail_msg = msg;
3489
3490 if (debug_flag) {
3491 fprintf(stderr, "%s", msg);
3492 }
3493 }
3494
3495 } /* namespace brw */