i965/vec4: Collect all emits of texture ops into one place
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->sampler = 0;
49 this->texture_offset = 0;
50 this->target = 0;
51 this->shadow_compare = false;
52 this->ir = v->base_ir;
53 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
54 this->header_present = false;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = v->current_annotation;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 this->instructions.push_tail(inst);
65
66 return inst;
67 }
68
69 vec4_instruction *
70 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
71 {
72 new_inst->ir = inst->ir;
73 new_inst->annotation = inst->annotation;
74
75 inst->insert_before(new_inst);
76
77 return inst;
78 }
79
80 vec4_instruction *
81 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
82 src_reg src0, src_reg src1, src_reg src2)
83 {
84 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
85 src0, src1, src2));
86 }
87
88
89 vec4_instruction *
90 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
91 {
92 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
93 }
94
95 vec4_instruction *
96 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
97 {
98 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
99 }
100
101 vec4_instruction *
102 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
103 {
104 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode)
109 {
110 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
111 }
112
113 #define ALU1(op) \
114 vec4_instruction * \
115 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
116 { \
117 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
118 src0); \
119 }
120
121 #define ALU2(op) \
122 vec4_instruction * \
123 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
124 const src_reg &src1) \
125 { \
126 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
127 src0, src1); \
128 }
129
130 #define ALU2_ACC(op) \
131 vec4_instruction * \
132 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
133 const src_reg &src1) \
134 { \
135 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
136 BRW_OPCODE_##op, dst, src0, src1); \
137 inst->writes_accumulator = true; \
138 return inst; \
139 }
140
141 #define ALU3(op) \
142 vec4_instruction * \
143 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
144 const src_reg &src1, const src_reg &src2) \
145 { \
146 assert(brw->gen >= 6); \
147 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
148 src0, src1, src2); \
149 }
150
151 ALU1(NOT)
152 ALU1(MOV)
153 ALU1(FRC)
154 ALU1(RNDD)
155 ALU1(RNDE)
156 ALU1(RNDZ)
157 ALU1(F32TO16)
158 ALU1(F16TO32)
159 ALU2(ADD)
160 ALU2(MUL)
161 ALU2_ACC(MACH)
162 ALU2(AND)
163 ALU2(OR)
164 ALU2(XOR)
165 ALU2(DP3)
166 ALU2(DP4)
167 ALU2(DPH)
168 ALU2(SHL)
169 ALU2(SHR)
170 ALU2(ASR)
171 ALU3(LRP)
172 ALU1(BFREV)
173 ALU3(BFE)
174 ALU2(BFI1)
175 ALU3(BFI2)
176 ALU1(FBH)
177 ALU1(FBL)
178 ALU1(CBIT)
179 ALU3(MAD)
180 ALU2_ACC(ADDC)
181 ALU2_ACC(SUBB)
182 ALU2(MAC)
183
184 /** Gen4 predicated IF. */
185 vec4_instruction *
186 vec4_visitor::IF(enum brw_predicate predicate)
187 {
188 vec4_instruction *inst;
189
190 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
191 inst->predicate = predicate;
192
193 return inst;
194 }
195
196 /** Gen6 IF with embedded comparison. */
197 vec4_instruction *
198 vec4_visitor::IF(src_reg src0, src_reg src1,
199 enum brw_conditional_mod condition)
200 {
201 assert(brw->gen == 6);
202
203 vec4_instruction *inst;
204
205 resolve_ud_negate(&src0);
206 resolve_ud_negate(&src1);
207
208 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
209 src0, src1);
210 inst->conditional_mod = condition;
211
212 return inst;
213 }
214
215 /**
216 * CMP: Sets the low bit of the destination channels with the result
217 * of the comparison, while the upper bits are undefined, and updates
218 * the flag register with the packed 16 bits of the result.
219 */
220 vec4_instruction *
221 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
222 enum brw_conditional_mod condition)
223 {
224 vec4_instruction *inst;
225
226 /* original gen4 does type conversion to the destination type
227 * before comparison, producing garbage results for floating
228 * point comparisons.
229 */
230 if (brw->gen == 4) {
231 dst.type = src0.type;
232 if (dst.file == HW_REG)
233 dst.fixed_hw_reg.type = dst.type;
234 }
235
236 resolve_ud_negate(&src0);
237 resolve_ud_negate(&src1);
238
239 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
240 inst->conditional_mod = condition;
241
242 return inst;
243 }
244
245 vec4_instruction *
246 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
247 {
248 vec4_instruction *inst;
249
250 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
251 dst, index);
252 inst->base_mrf = 14;
253 inst->mlen = 2;
254
255 return inst;
256 }
257
258 vec4_instruction *
259 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
260 const src_reg &index)
261 {
262 vec4_instruction *inst;
263
264 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
265 dst, src, index);
266 inst->base_mrf = 13;
267 inst->mlen = 3;
268
269 return inst;
270 }
271
272 void
273 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
274 {
275 static enum opcode dot_opcodes[] = {
276 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
277 };
278
279 emit(dot_opcodes[elements - 2], dst, src0, src1);
280 }
281
282 src_reg
283 vec4_visitor::fix_3src_operand(src_reg src)
284 {
285 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
286 * able to use vertical stride of zero to replicate the vec4 uniform, like
287 *
288 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
289 *
290 * But you can't, since vertical stride is always four in three-source
291 * instructions. Instead, insert a MOV instruction to do the replication so
292 * that the three-source instruction can consume it.
293 */
294
295 /* The MOV is only needed if the source is a uniform or immediate. */
296 if (src.file != UNIFORM && src.file != IMM)
297 return src;
298
299 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
300 return src;
301
302 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
303 expanded.type = src.type;
304 emit(MOV(expanded, src));
305 return src_reg(expanded);
306 }
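/* Illustrative sketch (not from the driver): the effect of the fix-up above on
 * a three-source instruction whose second source is a vec4 uniform, written as
 * vec4 IR.  Register names are hypothetical.
 *
 *   before:  mad dst, src0, u1.xyzw, src2     (uniform can't be replicated in SIMD4x2)
 *   after:   mov tmp, u1.xyzw
 *            mad dst, src0, tmp, src2
 */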
307
308 src_reg
309 vec4_visitor::fix_math_operand(src_reg src)
310 {
311 /* The gen6 math instruction ignores the source modifiers --
312 * swizzle, abs, negate, and at least some parts of the register
313 * region description.
314 *
315 * Rather than trying to enumerate all these cases, *always* expand the
316 * operand to a temp GRF for gen6.
317 *
318 * For gen7, keep the operand as-is, except if immediate, which gen7 still
319 * can't use.
320 */
321
322 if (brw->gen == 7 && src.file != IMM)
323 return src;
324
325 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
326 expanded.type = src.type;
327 emit(MOV(expanded, src));
328 return src_reg(expanded);
329 }
330
331 void
332 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
333 {
334 src = fix_math_operand(src);
335
336 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
337 /* The gen6 math instruction must be align1, so we can't do
338 * writemasks.
339 */
340 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
341
342 emit(opcode, temp_dst, src);
343
344 emit(MOV(dst, src_reg(temp_dst)));
345 } else {
346 emit(opcode, dst, src);
347 }
348 }
349
350 void
351 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
352 {
353 vec4_instruction *inst = emit(opcode, dst, src);
354 inst->base_mrf = 1;
355 inst->mlen = 1;
356 }
357
358 void
359 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
360 {
361 switch (opcode) {
362 case SHADER_OPCODE_RCP:
363 case SHADER_OPCODE_RSQ:
364 case SHADER_OPCODE_SQRT:
365 case SHADER_OPCODE_EXP2:
366 case SHADER_OPCODE_LOG2:
367 case SHADER_OPCODE_SIN:
368 case SHADER_OPCODE_COS:
369 break;
370 default:
371 unreachable("not reached: bad math opcode");
372 }
373
374 if (brw->gen >= 8) {
375 emit(opcode, dst, src);
376 } else if (brw->gen >= 6) {
377 emit_math1_gen6(opcode, dst, src);
378 } else {
379 emit_math1_gen4(opcode, dst, src);
380 }
381 }
382
383 void
384 vec4_visitor::emit_math2_gen6(enum opcode opcode,
385 dst_reg dst, src_reg src0, src_reg src1)
386 {
387 src0 = fix_math_operand(src0);
388 src1 = fix_math_operand(src1);
389
390 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
391 /* The gen6 math instruction must be align1, so we can't do
392 * writemasks.
393 */
394 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
395 temp_dst.type = dst.type;
396
397 emit(opcode, temp_dst, src0, src1);
398
399 emit(MOV(dst, src_reg(temp_dst)));
400 } else {
401 emit(opcode, dst, src0, src1);
402 }
403 }
404
405 void
406 vec4_visitor::emit_math2_gen4(enum opcode opcode,
407 dst_reg dst, src_reg src0, src_reg src1)
408 {
409 vec4_instruction *inst = emit(opcode, dst, src0, src1);
410 inst->base_mrf = 1;
411 inst->mlen = 2;
412 }
413
414 void
415 vec4_visitor::emit_math(enum opcode opcode,
416 dst_reg dst, src_reg src0, src_reg src1)
417 {
418 switch (opcode) {
419 case SHADER_OPCODE_POW:
420 case SHADER_OPCODE_INT_QUOTIENT:
421 case SHADER_OPCODE_INT_REMAINDER:
422 break;
423 default:
424 unreachable("not reached: unsupported binary math opcode");
425 }
426
427 if (brw->gen >= 8) {
428 emit(opcode, dst, src0, src1);
429 } else if (brw->gen >= 6) {
430 emit_math2_gen6(opcode, dst, src0, src1);
431 } else {
432 emit_math2_gen4(opcode, dst, src0, src1);
433 }
434 }
435
436 void
437 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
438 {
439 if (brw->gen < 7) {
440 unreachable("ir_unop_pack_half_2x16 should be lowered");
441 }
442
443 assert(dst.type == BRW_REGISTER_TYPE_UD);
444 assert(src0.type == BRW_REGISTER_TYPE_F);
445
446 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
447 *
448 * Because this instruction does not have a 16-bit floating-point type,
449 * the destination data type must be Word (W).
450 *
451 * The destination must be DWord-aligned and specify a horizontal stride
452 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
453 * each destination channel and the upper word is not modified.
454 *
455 * The above restriction implies that the f32to16 instruction must use
456 * align1 mode, because only in align1 mode is it possible to specify
457 * horizontal stride. We choose here to defy the hardware docs and emit
458 * align16 instructions.
459 *
460 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
461 * instructions. I was partially successful in that the code passed all
462 * tests. However, the code was dubiously correct and fragile, and the
463 * tests were not harsh enough to probe that frailty. Not trusting the
464 * code, I chose instead to remain in align16 mode in defiance of the hw
465 * docs).
466 *
467 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
468 * simulator, emitting a f32to16 in align16 mode with UD as destination
469 * data type is safe. The behavior differs from that specified in the PRM
470 * in that the upper word of each destination channel is cleared to 0.
471 */
472
473 dst_reg tmp_dst(this, glsl_type::uvec2_type);
474 src_reg tmp_src(tmp_dst);
475
476 #if 0
477 /* Verify the undocumented behavior on which the following instructions
478 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
479 * then the result of the bit-or instruction below will be incorrect.
480 *
481 * You should inspect the disasm output in order to verify that the MOV is
482 * not optimized away.
483 */
484 emit(MOV(tmp_dst, src_reg(0x12345678u)));
485 #endif
486
487 /* Give tmp the form below, where "." means untouched.
488 *
489 * w z y x w z y x
490 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
491 *
492 * That the upper word of each write-channel be 0 is required for the
493 * following bit-shift and bit-or instructions to work. Note that this
494 * relies on the undocumented hardware behavior mentioned above.
495 */
496 tmp_dst.writemask = WRITEMASK_XY;
497 emit(F32TO16(tmp_dst, src0));
498
499 /* Give the write-channels of dst the form:
500 * 0xhhhh0000
501 */
502 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
503 emit(SHL(dst, tmp_src, src_reg(16u)));
504
505 /* Finally, give the write-channels of dst the form of packHalf2x16's
506 * output:
507 * 0xhhhhllll
508 */
509 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
510 emit(OR(dst, src_reg(dst), tmp_src));
511 }
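/* Illustrative sketch (not part of the driver): the shift-and-or step above,
 * done on the CPU with already-converted half-float bit patterns for the
 * hypothetical input vec2(1.0, -2.0).
 */
#if 0
   /* After F32TO16 with writemask .xy, tmp conceptually holds:
    *   tmp.x = 0x00003c00   (1.0 as a half, upper word cleared)
    *   tmp.y = 0x0000c000   (-2.0 as a half, upper word cleared)
    */
   uint32_t tmp_x = 0x00003c00;
   uint32_t tmp_y = 0x0000c000;
   uint32_t packed = (tmp_y << 16) | tmp_x;   /* 0xc0003c00 == packHalf2x16(vec2(1.0, -2.0)) */
#endif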
512
513 void
514 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
515 {
516 if (brw->gen < 7) {
517 unreachable("ir_unop_unpack_half_2x16 should be lowered");
518 }
519
520 assert(dst.type == BRW_REGISTER_TYPE_F);
521 assert(src0.type == BRW_REGISTER_TYPE_UD);
522
523 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
524 *
525 * Because this instruction does not have a 16-bit floating-point type,
526 * the source data type must be Word (W). The destination type must be
527 * F (Float).
528 *
529 * To use W as the source data type, we must adjust horizontal strides,
530 * which is only possible in align1 mode. All my [chadv] attempts at
531 * emitting align1 instructions for unpackHalf2x16 failed to pass the
532 * Piglit tests, so I gave up.
533 *
534 * I've verified that, on gen7 hardware and the simulator, it is safe to
535 * emit f16to32 in align16 mode with UD as source data type.
536 */
537
538 dst_reg tmp_dst(this, glsl_type::uvec2_type);
539 src_reg tmp_src(tmp_dst);
540
541 tmp_dst.writemask = WRITEMASK_X;
542 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
543
544 tmp_dst.writemask = WRITEMASK_Y;
545 emit(SHR(tmp_dst, src0, src_reg(16u)));
546
547 dst.writemask = WRITEMASK_XY;
548 emit(F16TO32(dst, tmp_src));
549 }
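/* Illustrative sketch (not part of the driver): the mask-and-shift step above,
 * done on the CPU for the hypothetical packed input 0xc0003c00.
 */
#if 0
   uint32_t packed = 0xc0003c00;      /* packHalf2x16(vec2(1.0, -2.0)) */
   uint32_t lo = packed & 0xffffu;    /* 0x3c00 -> F16TO32 -> dst.x = 1.0  */
   uint32_t hi = packed >> 16;        /* 0xc000 -> F16TO32 -> dst.y = -2.0 */
#endif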
550
551 void
552 vec4_visitor::visit_instructions(const exec_list *list)
553 {
554 foreach_in_list(ir_instruction, ir, list) {
555 base_ir = ir;
556 ir->accept(this);
557 }
558 }
559
560
561 static int
562 type_size(const struct glsl_type *type)
563 {
564 unsigned int i;
565 int size;
566
567 switch (type->base_type) {
568 case GLSL_TYPE_UINT:
569 case GLSL_TYPE_INT:
570 case GLSL_TYPE_FLOAT:
571 case GLSL_TYPE_BOOL:
572 if (type->is_matrix()) {
573 return type->matrix_columns;
574 } else {
575 /* Regardless of the size of the vector, it gets a vec4. This is bad
576 * packing for things like floats, but otherwise arrays become a
577 * mess. Hopefully a later pass over the code can pack scalars
578 * down if appropriate.
579 */
580 return 1;
581 }
582 case GLSL_TYPE_ARRAY:
583 assert(type->length > 0);
584 return type_size(type->fields.array) * type->length;
585 case GLSL_TYPE_STRUCT:
586 size = 0;
587 for (i = 0; i < type->length; i++) {
588 size += type_size(type->fields.structure[i].type);
589 }
590 return size;
591 case GLSL_TYPE_SAMPLER:
592 /* Samplers take up one slot in UNIFORMS[], but they're baked in
593 * at link time.
594 */
595 return 1;
596 case GLSL_TYPE_ATOMIC_UINT:
597 return 0;
598 case GLSL_TYPE_IMAGE:
599 case GLSL_TYPE_VOID:
600 case GLSL_TYPE_ERROR:
601 case GLSL_TYPE_INTERFACE:
602 unreachable("not reached");
603 }
604
605 return 0;
606 }
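/* Worked examples of the slot counts above (illustrative, in vec4 slots):
 *   float, vec2, vec3, vec4           -> 1   (every scalar/vector pads out to a vec4)
 *   mat3                              -> 3   (one slot per column)
 *   vec4[8]                           -> 8   (element size 1 * array length 8)
 *   struct { vec3 a; float b[2]; }    -> 1 + 2 = 3
 */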
607
608 int
609 vec4_visitor::virtual_grf_alloc(int size)
610 {
611 if (virtual_grf_array_size <= virtual_grf_count) {
612 if (virtual_grf_array_size == 0)
613 virtual_grf_array_size = 16;
614 else
615 virtual_grf_array_size *= 2;
616 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
617 virtual_grf_array_size);
618 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
619 virtual_grf_array_size);
620 }
621 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
622 virtual_grf_reg_count += size;
623 virtual_grf_sizes[virtual_grf_count] = size;
624 return virtual_grf_count++;
625 }
626
627 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
628 {
629 init();
630
631 this->file = GRF;
632 this->reg = v->virtual_grf_alloc(type_size(type));
633
634 if (type->is_array() || type->is_record()) {
635 this->swizzle = BRW_SWIZZLE_NOOP;
636 } else {
637 this->swizzle = swizzle_for_size(type->vector_elements);
638 }
639
640 this->type = brw_type_for_base_type(type);
641 }
642
643 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
644 {
645 init();
646
647 this->file = GRF;
648 this->reg = v->virtual_grf_alloc(type_size(type));
649
650 if (type->is_array() || type->is_record()) {
651 this->writemask = WRITEMASK_XYZW;
652 } else {
653 this->writemask = (1 << type->vector_elements) - 1;
654 }
655
656 this->type = brw_type_for_base_type(type);
657 }
658
659 /* Our support for uniforms is piggy-backed on the struct
660 * gl_fragment_program, because that's where the values actually
661 * get stored, rather than in some global gl_shader_program uniform
662 * store.
663 */
664 void
665 vec4_visitor::setup_uniform_values(ir_variable *ir)
666 {
667 int namelen = strlen(ir->name);
668
669 /* The data for our (non-builtin) uniforms is stored in a series of
670 * gl_uniform_driver_storage structs for each subcomponent that
671 * glGetUniformLocation() could name. We know it's been set up in the same
672 * order we'd walk the type, so walk the list of storage and find anything
673 * with our name, or the prefix of a component that starts with our name.
674 */
675 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
676 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
677
678 if (strncmp(ir->name, storage->name, namelen) != 0 ||
679 (storage->name[namelen] != 0 &&
680 storage->name[namelen] != '.' &&
681 storage->name[namelen] != '[')) {
682 continue;
683 }
684
685 gl_constant_value *components = storage->storage;
686 unsigned vector_count = (MAX2(storage->array_elements, 1) *
687 storage->type->matrix_columns);
688
689 for (unsigned s = 0; s < vector_count; s++) {
690 assert(uniforms < uniform_array_size);
691 uniform_vector_size[uniforms] = storage->type->vector_elements;
692
693 int i;
694 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
695 stage_prog_data->param[uniforms * 4 + i] = &components->f;
696 components++;
697 }
698 for (; i < 4; i++) {
699 static float zero = 0;
700 stage_prog_data->param[uniforms * 4 + i] = &zero;
701 }
702
703 uniforms++;
704 }
705 }
706 }
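/* Illustrative sketch (hypothetical helper, not part of the driver): the
 * name-matching rule used in the loop above.  An ir name of "lights" matches
 * storage entries named "lights", "lights[0].position" or "lights.count",
 * but not "lightscale".
 */
#if 0
static bool
matches_uniform_storage_name(const char *ir_name, const char *storage_name)
{
   size_t namelen = strlen(ir_name);
   return strncmp(ir_name, storage_name, namelen) == 0 &&
          (storage_name[namelen] == '\0' ||
           storage_name[namelen] == '.' ||
           storage_name[namelen] == '[');
}
#endif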
707
708 void
709 vec4_visitor::setup_uniform_clipplane_values()
710 {
711 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
712
713 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
714 assert(this->uniforms < uniform_array_size);
715 this->uniform_vector_size[this->uniforms] = 4;
716 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
717 this->userplane[i].type = BRW_REGISTER_TYPE_F;
718 for (int j = 0; j < 4; ++j) {
719 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
720 }
721 ++this->uniforms;
722 }
723 }
724
725 /* Our support for builtin uniforms is even scarier than non-builtin.
726 * It sits on top of the PROG_STATE_VAR parameters that are
727 * automatically updated from GL context state.
728 */
729 void
730 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
731 {
732 const ir_state_slot *const slots = ir->state_slots;
733 assert(ir->state_slots != NULL);
734
735 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
736 /* This state reference has already been setup by ir_to_mesa,
737 * but we'll get the same index back here. We can reference
738 * ParameterValues directly, since unlike brw_fs.cpp, we never
739 * add new state references during compile.
740 */
741 int index = _mesa_add_state_reference(this->prog->Parameters,
742 (gl_state_index *)slots[i].tokens);
743 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
744
745 assert(this->uniforms < uniform_array_size);
746 this->uniform_vector_size[this->uniforms] = 0;
747 /* Add each of the unique swizzled channels of the element.
748 * This will end up matching the size of the glsl_type of this field.
749 */
750 int last_swiz = -1;
751 for (unsigned int j = 0; j < 4; j++) {
752 int swiz = GET_SWZ(slots[i].swizzle, j);
753 last_swiz = swiz;
754
755 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
756 assert(this->uniforms < uniform_array_size);
757 if (swiz <= last_swiz)
758 this->uniform_vector_size[this->uniforms]++;
759 }
760 this->uniforms++;
761 }
762 }
763
764 dst_reg *
765 vec4_visitor::variable_storage(ir_variable *var)
766 {
767 return (dst_reg *)hash_table_find(this->variable_ht, var);
768 }
769
770 void
771 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
772 enum brw_predicate *predicate)
773 {
774 ir_expression *expr = ir->as_expression();
775
776 *predicate = BRW_PREDICATE_NORMAL;
777
778 if (expr) {
779 src_reg op[2];
780 vec4_instruction *inst;
781
782 assert(expr->get_num_operands() <= 2);
783 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
784 expr->operands[i]->accept(this);
785 op[i] = this->result;
786
787 resolve_ud_negate(&op[i]);
788 }
789
790 switch (expr->operation) {
791 case ir_unop_logic_not:
792 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
793 inst->conditional_mod = BRW_CONDITIONAL_Z;
794 break;
795
796 case ir_binop_logic_xor:
797 inst = emit(XOR(dst_null_d(), op[0], op[1]));
798 inst->conditional_mod = BRW_CONDITIONAL_NZ;
799 break;
800
801 case ir_binop_logic_or:
802 inst = emit(OR(dst_null_d(), op[0], op[1]));
803 inst->conditional_mod = BRW_CONDITIONAL_NZ;
804 break;
805
806 case ir_binop_logic_and:
807 inst = emit(AND(dst_null_d(), op[0], op[1]));
808 inst->conditional_mod = BRW_CONDITIONAL_NZ;
809 break;
810
811 case ir_unop_f2b:
812 if (brw->gen >= 6) {
813 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
814 } else {
815 inst = emit(MOV(dst_null_f(), op[0]));
816 inst->conditional_mod = BRW_CONDITIONAL_NZ;
817 }
818 break;
819
820 case ir_unop_i2b:
821 if (brw->gen >= 6) {
822 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
823 } else {
824 inst = emit(MOV(dst_null_d(), op[0]));
825 inst->conditional_mod = BRW_CONDITIONAL_NZ;
826 }
827 break;
828
829 case ir_binop_all_equal:
830 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
831 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
832 break;
833
834 case ir_binop_any_nequal:
835 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
836 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
837 break;
838
839 case ir_unop_any:
840 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
841 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
842 break;
843
844 case ir_binop_greater:
845 case ir_binop_gequal:
846 case ir_binop_less:
847 case ir_binop_lequal:
848 case ir_binop_equal:
849 case ir_binop_nequal:
850 emit(CMP(dst_null_d(), op[0], op[1],
851 brw_conditional_for_comparison(expr->operation)));
852 break;
853
854 default:
855 unreachable("not reached");
856 }
857 return;
858 }
859
860 ir->accept(this);
861
862 resolve_ud_negate(&this->result);
863
864 if (brw->gen >= 6) {
865 vec4_instruction *inst = emit(AND(dst_null_d(),
866 this->result, src_reg(1)));
867 inst->conditional_mod = BRW_CONDITIONAL_NZ;
868 } else {
869 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
870 inst->conditional_mod = BRW_CONDITIONAL_NZ;
871 }
872 }
873
874 /**
875 * Emit a gen6 IF statement with the comparison folded into the IF
876 * instruction.
877 */
878 void
879 vec4_visitor::emit_if_gen6(ir_if *ir)
880 {
881 ir_expression *expr = ir->condition->as_expression();
882
883 if (expr) {
884 src_reg op[2];
885 dst_reg temp;
886
887 assert(expr->get_num_operands() <= 2);
888 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
889 expr->operands[i]->accept(this);
890 op[i] = this->result;
891 }
892
893 switch (expr->operation) {
894 case ir_unop_logic_not:
895 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
896 return;
897
898 case ir_binop_logic_xor:
899 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
900 return;
901
902 case ir_binop_logic_or:
903 temp = dst_reg(this, glsl_type::bool_type);
904 emit(OR(temp, op[0], op[1]));
905 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
906 return;
907
908 case ir_binop_logic_and:
909 temp = dst_reg(this, glsl_type::bool_type);
910 emit(AND(temp, op[0], op[1]));
911 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
912 return;
913
914 case ir_unop_f2b:
915 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
916 return;
917
918 case ir_unop_i2b:
919 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
920 return;
921
922 case ir_binop_greater:
923 case ir_binop_gequal:
924 case ir_binop_less:
925 case ir_binop_lequal:
926 case ir_binop_equal:
927 case ir_binop_nequal:
928 emit(IF(op[0], op[1],
929 brw_conditional_for_comparison(expr->operation)));
930 return;
931
932 case ir_binop_all_equal:
933 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
934 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
935 return;
936
937 case ir_binop_any_nequal:
938 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
939 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
940 return;
941
942 case ir_unop_any:
943 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
944 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
945 return;
946
947 default:
948 unreachable("not reached");
949 }
950 return;
951 }
952
953 ir->condition->accept(this);
954
955 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
956 }
957
958 void
959 vec4_visitor::visit(ir_variable *ir)
960 {
961 dst_reg *reg = NULL;
962
963 if (variable_storage(ir))
964 return;
965
966 switch (ir->data.mode) {
967 case ir_var_shader_in:
968 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
969 break;
970
971 case ir_var_shader_out:
972 reg = new(mem_ctx) dst_reg(this, ir->type);
973
974 for (int i = 0; i < type_size(ir->type); i++) {
975 output_reg[ir->data.location + i] = *reg;
976 output_reg[ir->data.location + i].reg_offset = i;
977 output_reg[ir->data.location + i].type =
978 brw_type_for_base_type(ir->type->get_scalar_type());
979 output_reg_annotation[ir->data.location + i] = ir->name;
980 }
981 break;
982
983 case ir_var_auto:
984 case ir_var_temporary:
985 reg = new(mem_ctx) dst_reg(this, ir->type);
986 break;
987
988 case ir_var_uniform:
989 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
990
991 /* Thanks to the lower_ubo_reference pass, we will see only
992 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
993 * variables, so no need for them to be in variable_ht.
994 *
995 * Atomic counters take no uniform storage, no need to do
996 * anything here.
997 */
998 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
999 return;
1000
1001 /* Track how big the whole uniform variable is, in case we need to put a
1002 * copy of its data into pull constants for array access.
1003 */
1004 assert(this->uniforms < uniform_array_size);
1005 this->uniform_size[this->uniforms] = type_size(ir->type);
1006
1007 if (!strncmp(ir->name, "gl_", 3)) {
1008 setup_builtin_uniform_values(ir);
1009 } else {
1010 setup_uniform_values(ir);
1011 }
1012 break;
1013
1014 case ir_var_system_value:
1015 reg = make_reg_for_system_value(ir);
1016 break;
1017
1018 default:
1019 unreachable("not reached");
1020 }
1021
1022 reg->type = brw_type_for_base_type(ir->type);
1023 hash_table_insert(this->variable_ht, reg, ir);
1024 }
1025
1026 void
1027 vec4_visitor::visit(ir_loop *ir)
1028 {
1029 /* We don't want debugging output to print the whole body of the
1030 * loop as the annotation.
1031 */
1032 this->base_ir = NULL;
1033
1034 emit(BRW_OPCODE_DO);
1035
1036 visit_instructions(&ir->body_instructions);
1037
1038 emit(BRW_OPCODE_WHILE);
1039 }
1040
1041 void
1042 vec4_visitor::visit(ir_loop_jump *ir)
1043 {
1044 switch (ir->mode) {
1045 case ir_loop_jump::jump_break:
1046 emit(BRW_OPCODE_BREAK);
1047 break;
1048 case ir_loop_jump::jump_continue:
1049 emit(BRW_OPCODE_CONTINUE);
1050 break;
1051 }
1052 }
1053
1054
1055 void
1056 vec4_visitor::visit(ir_function_signature *)
1057 {
1058 unreachable("not reached");
1059 }
1060
1061 void
1062 vec4_visitor::visit(ir_function *ir)
1063 {
1064 /* Ignore function bodies other than main() -- we shouldn't see calls to
1065 * them since they should all be inlined.
1066 */
1067 if (strcmp(ir->name, "main") == 0) {
1068 const ir_function_signature *sig;
1069 exec_list empty;
1070
1071 sig = ir->matching_signature(NULL, &empty, false);
1072
1073 assert(sig);
1074
1075 visit_instructions(&sig->body);
1076 }
1077 }
1078
1079 bool
1080 vec4_visitor::try_emit_sat(ir_expression *ir)
1081 {
1082 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1083 if (!sat_src)
1084 return false;
1085
1086 sat_src->accept(this);
1087 src_reg src = this->result;
1088
1089 this->result = src_reg(this, ir->type);
1090 vec4_instruction *inst;
1091 inst = emit(MOV(dst_reg(this->result), src));
1092 inst->saturate = true;
1093
1094 return true;
1095 }
1096
1097 bool
1098 vec4_visitor::try_emit_mad(ir_expression *ir)
1099 {
1100 /* 3-src instructions were introduced in gen6. */
1101 if (brw->gen < 6)
1102 return false;
1103
1104 /* MAD can only handle floating-point data. */
1105 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1106 return false;
1107
1108 ir_rvalue *nonmul = ir->operands[1];
1109 ir_expression *mul = ir->operands[0]->as_expression();
1110
1111 if (!mul || mul->operation != ir_binop_mul) {
1112 nonmul = ir->operands[0];
1113 mul = ir->operands[1]->as_expression();
1114
1115 if (!mul || mul->operation != ir_binop_mul)
1116 return false;
1117 }
1118
1119 nonmul->accept(this);
1120 src_reg src0 = fix_3src_operand(this->result);
1121
1122 mul->operands[0]->accept(this);
1123 src_reg src1 = fix_3src_operand(this->result);
1124
1125 mul->operands[1]->accept(this);
1126 src_reg src2 = fix_3src_operand(this->result);
1127
1128 this->result = src_reg(this, ir->type);
1129 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1130
1131 return true;
1132 }
1133
1134 bool
1135 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1136 {
1137 ir_expression *const cmp = ir->operands[0]->as_expression();
1138
1139 if (cmp == NULL)
1140 return false;
1141
1142 switch (cmp->operation) {
1143 case ir_binop_less:
1144 case ir_binop_greater:
1145 case ir_binop_lequal:
1146 case ir_binop_gequal:
1147 case ir_binop_equal:
1148 case ir_binop_nequal:
1149 break;
1150
1151 default:
1152 return false;
1153 }
1154
1155 cmp->operands[0]->accept(this);
1156 const src_reg cmp_src0 = this->result;
1157
1158 cmp->operands[1]->accept(this);
1159 const src_reg cmp_src1 = this->result;
1160
1161 this->result = src_reg(this, ir->type);
1162
1163 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1164 brw_conditional_for_comparison(cmp->operation)));
1165
1166 /* If the comparison is false, this->result will just happen to be zero.
1167 */
1168 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1169 this->result, src_reg(1.0f));
1170 inst->predicate = BRW_PREDICATE_NORMAL;
1171 inst->predicate_inverse = true;
1172
1173 return true;
1174 }
1175
1176 void
1177 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1178 src_reg src0, src_reg src1)
1179 {
1180 vec4_instruction *inst;
1181
1182 if (brw->gen >= 6) {
1183 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1184 inst->conditional_mod = conditionalmod;
1185 } else {
1186 emit(CMP(dst, src0, src1, conditionalmod));
1187
1188 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1189 inst->predicate = BRW_PREDICATE_NORMAL;
1190 }
1191 }
1192
1193 void
1194 vec4_visitor::emit_lrp(const dst_reg &dst,
1195 const src_reg &x, const src_reg &y, const src_reg &a)
1196 {
1197 if (brw->gen >= 6) {
1198 /* Note that the instruction's argument order is reversed from GLSL
1199 * and the IR.
1200 */
1201 emit(LRP(dst,
1202 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1203 } else {
1204 /* Earlier generations don't support three source operations, so we
1205 * need to emit x*(1-a) + y*a.
1206 */
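      /* Worked example (illustrative): lrp(x=2.0, y=6.0, a=0.25)
       *   y*a = 1.5,  1-a = 0.75,  x*(1-a) = 1.5,  result = 1.5 + 1.5 = 3.0,
       * matching the usual form x + a*(y-x) = 2.0 + 0.25*4.0 = 3.0.
       */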
1207 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1208 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1209 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1210 y_times_a.writemask = dst.writemask;
1211 one_minus_a.writemask = dst.writemask;
1212 x_times_one_minus_a.writemask = dst.writemask;
1213
1214 emit(MUL(y_times_a, y, a));
1215 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1216 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1217 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1218 }
1219 }
1220
1221 void
1222 vec4_visitor::visit(ir_expression *ir)
1223 {
1224 unsigned int operand;
1225 src_reg op[Elements(ir->operands)];
1226 src_reg result_src;
1227 dst_reg result_dst;
1228 vec4_instruction *inst;
1229
1230 if (try_emit_sat(ir))
1231 return;
1232
1233 if (ir->operation == ir_binop_add) {
1234 if (try_emit_mad(ir))
1235 return;
1236 }
1237
1238 if (ir->operation == ir_unop_b2f) {
1239 if (try_emit_b2f_of_compare(ir))
1240 return;
1241 }
1242
1243 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1244 this->result.file = BAD_FILE;
1245 ir->operands[operand]->accept(this);
1246 if (this->result.file == BAD_FILE) {
1247 fprintf(stderr, "Failed to get tree for expression operand:\n");
1248 ir->operands[operand]->fprint(stderr);
1249 exit(1);
1250 }
1251 op[operand] = this->result;
1252
1253 /* Matrix expression operands should have been broken down to vector
1254 * operations already.
1255 */
1256 assert(!ir->operands[operand]->type->is_matrix());
1257 }
1258
1259 int vector_elements = ir->operands[0]->type->vector_elements;
1260 if (ir->operands[1]) {
1261 vector_elements = MAX2(vector_elements,
1262 ir->operands[1]->type->vector_elements);
1263 }
1264
1265 this->result.file = BAD_FILE;
1266
1267 /* Storage for our result. Ideally for an assignment we'd be using
1268 * the actual storage for the result here, instead.
1269 */
1270 result_src = src_reg(this, ir->type);
1271 /* convenience for the emit functions below. */
1272 result_dst = dst_reg(result_src);
1273 /* If nothing special happens, this is the result. */
1274 this->result = result_src;
1275 /* Limit writes to the channels that will be used by result_src later.
1276 * This does limit this temp's use as a temporary for multi-instruction
1277 * sequences.
1278 */
1279 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1280
1281 switch (ir->operation) {
1282 case ir_unop_logic_not:
1283 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1284 * the ones' complement of the whole register, not just bit 0.
1285 */
1286 emit(XOR(result_dst, op[0], src_reg(1)));
1287 break;
1288 case ir_unop_neg:
1289 op[0].negate = !op[0].negate;
1290 emit(MOV(result_dst, op[0]));
1291 break;
1292 case ir_unop_abs:
1293 op[0].abs = true;
1294 op[0].negate = false;
1295 emit(MOV(result_dst, op[0]));
1296 break;
1297
1298 case ir_unop_sign:
1299 if (ir->type->is_float()) {
1300 /* AND(val, 0x80000000) gives the sign bit.
1301 *
1302 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1303 * zero.
1304 */
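      /* Worked example (illustrative): sign(-3.5f)
       *   bits(-3.5f) = 0xc0600000;  AND 0x80000000 -> 0x80000000 (sign bit);
       *   the value is nonzero, so the predicated OR adds 0x3f800000, giving
       *   0xbf800000 == -1.0f.  For 0.0f the OR is skipped and the result
       *   stays zero.
       */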
1305 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1306
1307 op[0].type = BRW_REGISTER_TYPE_UD;
1308 result_dst.type = BRW_REGISTER_TYPE_UD;
1309 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1310
1311 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1312 inst->predicate = BRW_PREDICATE_NORMAL;
1313
1314 this->result.type = BRW_REGISTER_TYPE_F;
1315 } else {
1316 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1317 * -> non-negative val generates 0x00000000.
1318 * Predicated OR sets 1 if val is positive.
1319 */
1320 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1321
1322 emit(ASR(result_dst, op[0], src_reg(31)));
1323
1324 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1325 inst->predicate = BRW_PREDICATE_NORMAL;
1326 }
1327 break;
1328
1329 case ir_unop_rcp:
1330 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1331 break;
1332
1333 case ir_unop_exp2:
1334 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1335 break;
1336 case ir_unop_log2:
1337 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1338 break;
1339 case ir_unop_exp:
1340 case ir_unop_log:
1341 unreachable("not reached: should be handled by ir_explog_to_explog2");
1342 case ir_unop_sin:
1343 case ir_unop_sin_reduced:
1344 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1345 break;
1346 case ir_unop_cos:
1347 case ir_unop_cos_reduced:
1348 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1349 break;
1350
1351 case ir_unop_dFdx:
1352 case ir_unop_dFdy:
1353 unreachable("derivatives not valid in vertex shader");
1354
1355 case ir_unop_bitfield_reverse:
1356 emit(BFREV(result_dst, op[0]));
1357 break;
1358 case ir_unop_bit_count:
1359 emit(CBIT(result_dst, op[0]));
1360 break;
1361 case ir_unop_find_msb: {
1362 src_reg temp = src_reg(this, glsl_type::uint_type);
1363
1364 inst = emit(FBH(dst_reg(temp), op[0]));
1365 inst->dst.writemask = WRITEMASK_XYZW;
1366
1367 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1368 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1369 * subtract the result from 31 to convert the MSB count into an LSB count.
1370 */
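      /* Worked example (illustrative): findMSB(0x00000100u)
       *   FBH returns 23 (the position counted from the MSB side, i.e. 23
       *   leading zero bits), and 31 - 23 = 8, the LSB-side index GLSL expects.
       */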
1371
1372 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1373 temp.swizzle = BRW_SWIZZLE_NOOP;
1374 emit(MOV(result_dst, temp));
1375
1376 src_reg src_tmp = src_reg(result_dst);
1377 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1378
1379 src_tmp.negate = true;
1380 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1381 inst->predicate = BRW_PREDICATE_NORMAL;
1382 break;
1383 }
1384 case ir_unop_find_lsb:
1385 emit(FBL(result_dst, op[0]));
1386 break;
1387
1388 case ir_unop_noise:
1389 unreachable("not reached: should be handled by lower_noise");
1390
1391 case ir_binop_add:
1392 emit(ADD(result_dst, op[0], op[1]));
1393 break;
1394 case ir_binop_sub:
1395 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1396
1397 case ir_binop_mul:
1398 if (brw->gen < 8 && ir->type->is_integer()) {
1399 /* For integer multiplication, the MUL uses the low 16 bits of one of
1400 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1401 * accumulates the contribution of the upper 16 bits of that
1402 * operand. If we can determine that one of the args is in the low
1403 * 16 bits, though, we can just emit a single MUL.
1404 */
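      /* In other words (illustrative): writing b = b_hi * 0x10000 + b_lo,
       *   a * b = a * b_lo + ((a * b_hi) << 16)
       * The MUL supplies the first term and the MACH folds in the second via
       * the accumulator; when b is known to fit in 16 bits, b_hi == 0 and the
       * MUL alone is enough.
       */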
1405 if (ir->operands[0]->is_uint16_constant()) {
1406 if (brw->gen < 7)
1407 emit(MUL(result_dst, op[0], op[1]));
1408 else
1409 emit(MUL(result_dst, op[1], op[0]));
1410 } else if (ir->operands[1]->is_uint16_constant()) {
1411 if (brw->gen < 7)
1412 emit(MUL(result_dst, op[1], op[0]));
1413 else
1414 emit(MUL(result_dst, op[0], op[1]));
1415 } else {
1416 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1417
1418 emit(MUL(acc, op[0], op[1]));
1419 emit(MACH(dst_null_d(), op[0], op[1]));
1420 emit(MOV(result_dst, src_reg(acc)));
1421 }
1422 } else {
1423 emit(MUL(result_dst, op[0], op[1]));
1424 }
1425 break;
1426 case ir_binop_imul_high: {
1427 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1428
1429 emit(MUL(acc, op[0], op[1]));
1430 emit(MACH(result_dst, op[0], op[1]));
1431 break;
1432 }
1433 case ir_binop_div:
1434 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1435 assert(ir->type->is_integer());
1436 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1437 break;
1438 case ir_binop_carry: {
1439 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1440
1441 emit(ADDC(dst_null_ud(), op[0], op[1]));
1442 emit(MOV(result_dst, src_reg(acc)));
1443 break;
1444 }
1445 case ir_binop_borrow: {
1446 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1447
1448 emit(SUBB(dst_null_ud(), op[0], op[1]));
1449 emit(MOV(result_dst, src_reg(acc)));
1450 break;
1451 }
1452 case ir_binop_mod:
1453 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1454 assert(ir->type->is_integer());
1455 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1456 break;
1457
1458 case ir_binop_less:
1459 case ir_binop_greater:
1460 case ir_binop_lequal:
1461 case ir_binop_gequal:
1462 case ir_binop_equal:
1463 case ir_binop_nequal: {
1464 emit(CMP(result_dst, op[0], op[1],
1465 brw_conditional_for_comparison(ir->operation)));
1466 emit(AND(result_dst, result_src, src_reg(0x1)));
1467 break;
1468 }
1469
1470 case ir_binop_all_equal:
1471 /* "==" operator producing a scalar boolean. */
1472 if (ir->operands[0]->type->is_vector() ||
1473 ir->operands[1]->type->is_vector()) {
1474 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1475 emit(MOV(result_dst, src_reg(0)));
1476 inst = emit(MOV(result_dst, src_reg(1)));
1477 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1478 } else {
1479 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1480 emit(AND(result_dst, result_src, src_reg(0x1)));
1481 }
1482 break;
1483 case ir_binop_any_nequal:
1484 /* "!=" operator producing a scalar boolean. */
1485 if (ir->operands[0]->type->is_vector() ||
1486 ir->operands[1]->type->is_vector()) {
1487 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1488
1489 emit(MOV(result_dst, src_reg(0)));
1490 inst = emit(MOV(result_dst, src_reg(1)));
1491 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1492 } else {
1493 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1494 emit(AND(result_dst, result_src, src_reg(0x1)));
1495 }
1496 break;
1497
1498 case ir_unop_any:
1499 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1500 emit(MOV(result_dst, src_reg(0)));
1501
1502 inst = emit(MOV(result_dst, src_reg(1)));
1503 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1504 break;
1505
1506 case ir_binop_logic_xor:
1507 emit(XOR(result_dst, op[0], op[1]));
1508 break;
1509
1510 case ir_binop_logic_or:
1511 emit(OR(result_dst, op[0], op[1]));
1512 break;
1513
1514 case ir_binop_logic_and:
1515 emit(AND(result_dst, op[0], op[1]));
1516 break;
1517
1518 case ir_binop_dot:
1519 assert(ir->operands[0]->type->is_vector());
1520 assert(ir->operands[0]->type == ir->operands[1]->type);
1521 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1522 break;
1523
1524 case ir_unop_sqrt:
1525 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1526 break;
1527 case ir_unop_rsq:
1528 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1529 break;
1530
1531 case ir_unop_bitcast_i2f:
1532 case ir_unop_bitcast_u2f:
1533 this->result = op[0];
1534 this->result.type = BRW_REGISTER_TYPE_F;
1535 break;
1536
1537 case ir_unop_bitcast_f2i:
1538 this->result = op[0];
1539 this->result.type = BRW_REGISTER_TYPE_D;
1540 break;
1541
1542 case ir_unop_bitcast_f2u:
1543 this->result = op[0];
1544 this->result.type = BRW_REGISTER_TYPE_UD;
1545 break;
1546
1547 case ir_unop_i2f:
1548 case ir_unop_i2u:
1549 case ir_unop_u2i:
1550 case ir_unop_u2f:
1551 case ir_unop_b2f:
1552 case ir_unop_b2i:
1553 case ir_unop_f2i:
1554 case ir_unop_f2u:
1555 emit(MOV(result_dst, op[0]));
1556 break;
1557 case ir_unop_f2b:
1558 case ir_unop_i2b: {
1559 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1560 emit(AND(result_dst, result_src, src_reg(1)));
1561 break;
1562 }
1563
1564 case ir_unop_trunc:
1565 emit(RNDZ(result_dst, op[0]));
1566 break;
1567 case ir_unop_ceil:
1568 op[0].negate = !op[0].negate;
1569 inst = emit(RNDD(result_dst, op[0]));
1570 this->result.negate = true;
1571 break;
1572 case ir_unop_floor:
1573 inst = emit(RNDD(result_dst, op[0]));
1574 break;
1575 case ir_unop_fract:
1576 inst = emit(FRC(result_dst, op[0]));
1577 break;
1578 case ir_unop_round_even:
1579 emit(RNDE(result_dst, op[0]));
1580 break;
1581
1582 case ir_binop_min:
1583 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1584 break;
1585 case ir_binop_max:
1586 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1587 break;
1588
1589 case ir_binop_pow:
1590 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1591 break;
1592
1593 case ir_unop_bit_not:
1594 inst = emit(NOT(result_dst, op[0]));
1595 break;
1596 case ir_binop_bit_and:
1597 inst = emit(AND(result_dst, op[0], op[1]));
1598 break;
1599 case ir_binop_bit_xor:
1600 inst = emit(XOR(result_dst, op[0], op[1]));
1601 break;
1602 case ir_binop_bit_or:
1603 inst = emit(OR(result_dst, op[0], op[1]));
1604 break;
1605
1606 case ir_binop_lshift:
1607 inst = emit(SHL(result_dst, op[0], op[1]));
1608 break;
1609
1610 case ir_binop_rshift:
1611 if (ir->type->base_type == GLSL_TYPE_INT)
1612 inst = emit(ASR(result_dst, op[0], op[1]));
1613 else
1614 inst = emit(SHR(result_dst, op[0], op[1]));
1615 break;
1616
1617 case ir_binop_bfm:
1618 emit(BFI1(result_dst, op[0], op[1]));
1619 break;
1620
1621 case ir_binop_ubo_load: {
1622 ir_constant *uniform_block = ir->operands[0]->as_constant();
1623 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1624 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1625 src_reg offset;
1626
1627 /* Now, load the vector from that offset. */
1628 assert(ir->type->is_vector() || ir->type->is_scalar());
1629
1630 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1631 packed_consts.type = result.type;
1632 src_reg surf_index =
1633 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1634 if (const_offset_ir) {
1635 if (brw->gen >= 8) {
1636 /* Store the offset in a GRF so we can send-from-GRF. */
1637 offset = src_reg(this, glsl_type::int_type);
1638 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1639 } else {
1640 /* Immediates are fine on older generations since they'll be moved
1641 * to a (potentially fake) MRF at the generator level.
1642 */
1643 offset = src_reg(const_offset / 16);
1644 }
1645 } else {
1646 offset = src_reg(this, glsl_type::uint_type);
1647 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1648 }
1649
1650 if (brw->gen >= 7) {
1651 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1652 grf_offset.type = offset.type;
1653
1654 emit(MOV(grf_offset, offset));
1655
1656 emit(new(mem_ctx) vec4_instruction(this,
1657 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1658 dst_reg(packed_consts),
1659 surf_index,
1660 src_reg(grf_offset)));
1661 } else {
1662 vec4_instruction *pull =
1663 emit(new(mem_ctx) vec4_instruction(this,
1664 VS_OPCODE_PULL_CONSTANT_LOAD,
1665 dst_reg(packed_consts),
1666 surf_index,
1667 offset));
1668 pull->base_mrf = 14;
1669 pull->mlen = 1;
1670 }
1671
1672 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1673 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1674 const_offset % 16 / 4,
1675 const_offset % 16 / 4,
1676 const_offset % 16 / 4);
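      /* Worked example (illustrative): a scalar float UBO load at
       * const_offset == 20 reads the second 16-byte row (20 / 16 == 1) and
       * then broadcasts component 1 of it, since (20 % 16) / 4 == 1 turns the
       * .xxxx swizzle into .yyyy.
       */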
1677
1678 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1679 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1680 emit(CMP(result_dst, packed_consts, src_reg(0u),
1681 BRW_CONDITIONAL_NZ));
1682 emit(AND(result_dst, result, src_reg(0x1)));
1683 } else {
1684 emit(MOV(result_dst, packed_consts));
1685 }
1686 break;
1687 }
1688
1689 case ir_binop_vector_extract:
1690 unreachable("should have been lowered by vec_index_to_cond_assign");
1691
1692 case ir_triop_fma:
1693 op[0] = fix_3src_operand(op[0]);
1694 op[1] = fix_3src_operand(op[1]);
1695 op[2] = fix_3src_operand(op[2]);
1696 /* Note that the instruction's argument order is reversed from GLSL
1697 * and the IR.
1698 */
1699 emit(MAD(result_dst, op[2], op[1], op[0]));
1700 break;
1701
1702 case ir_triop_lrp:
1703 emit_lrp(result_dst, op[0], op[1], op[2]);
1704 break;
1705
1706 case ir_triop_csel:
1707 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1708 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1709 inst->predicate = BRW_PREDICATE_NORMAL;
1710 break;
1711
1712 case ir_triop_bfi:
1713 op[0] = fix_3src_operand(op[0]);
1714 op[1] = fix_3src_operand(op[1]);
1715 op[2] = fix_3src_operand(op[2]);
1716 emit(BFI2(result_dst, op[0], op[1], op[2]));
1717 break;
1718
1719 case ir_triop_bitfield_extract:
1720 op[0] = fix_3src_operand(op[0]);
1721 op[1] = fix_3src_operand(op[1]);
1722 op[2] = fix_3src_operand(op[2]);
1723 /* Note that the instruction's argument order is reversed from GLSL
1724 * and the IR.
1725 */
1726 emit(BFE(result_dst, op[2], op[1], op[0]));
1727 break;
1728
1729 case ir_triop_vector_insert:
1730 unreachable("should have been lowered by lower_vector_insert");
1731
1732 case ir_quadop_bitfield_insert:
1733 unreachable("not reached: should be handled by "
1734 "bitfield_insert_to_bfm_bfi\n");
1735
1736 case ir_quadop_vector:
1737 unreachable("not reached: should be handled by lower_quadop_vector");
1738
1739 case ir_unop_pack_half_2x16:
1740 emit_pack_half_2x16(result_dst, op[0]);
1741 break;
1742 case ir_unop_unpack_half_2x16:
1743 emit_unpack_half_2x16(result_dst, op[0]);
1744 break;
1745 case ir_unop_pack_snorm_2x16:
1746 case ir_unop_pack_snorm_4x8:
1747 case ir_unop_pack_unorm_2x16:
1748 case ir_unop_pack_unorm_4x8:
1749 case ir_unop_unpack_snorm_2x16:
1750 case ir_unop_unpack_snorm_4x8:
1751 case ir_unop_unpack_unorm_2x16:
1752 case ir_unop_unpack_unorm_4x8:
1753 unreachable("not reached: should be handled by lower_packing_builtins");
1754 case ir_unop_unpack_half_2x16_split_x:
1755 case ir_unop_unpack_half_2x16_split_y:
1756 case ir_binop_pack_half_2x16_split:
1757 case ir_unop_interpolate_at_centroid:
1758 case ir_binop_interpolate_at_sample:
1759 case ir_binop_interpolate_at_offset:
1760 unreachable("not reached: should not occur in vertex shader");
1761 case ir_binop_ldexp:
1762 unreachable("not reached: should be handled by ldexp_to_arith()");
1763 }
1764 }
1765
1766
1767 void
1768 vec4_visitor::visit(ir_swizzle *ir)
1769 {
1770 src_reg src;
1771 int i = 0;
1772 int swizzle[4];
1773
1774 /* Note that this is only swizzles in expressions, not those on the left
1775 * hand side of an assignment, which do write masking. See ir_assignment
1776 * for that.
1777 */
1778
1779 ir->val->accept(this);
1780 src = this->result;
1781 assert(src.file != BAD_FILE);
1782
1783 for (i = 0; i < ir->type->vector_elements; i++) {
1784 switch (i) {
1785 case 0:
1786 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1787 break;
1788 case 1:
1789 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1790 break;
1791 case 2:
1792 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1793 break;
1794 case 3:
1795 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1796 break;
1797 }
1798 }
1799 for (; i < 4; i++) {
1800 /* Replicate the last channel out. */
1801 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1802 }
1803
1804 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1805
1806 this->result = src;
1807 }
1808
1809 void
1810 vec4_visitor::visit(ir_dereference_variable *ir)
1811 {
1812 const struct glsl_type *type = ir->type;
1813 dst_reg *reg = variable_storage(ir->var);
1814
1815 if (!reg) {
1816 fail("Failed to find variable storage for %s\n", ir->var->name);
1817 this->result = src_reg(brw_null_reg());
1818 return;
1819 }
1820
1821 this->result = src_reg(*reg);
1822
1823 /* System values get their swizzle from the dst_reg writemask */
1824 if (ir->var->data.mode == ir_var_system_value)
1825 return;
1826
1827 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1828 this->result.swizzle = swizzle_for_size(type->vector_elements);
1829 }
1830
1831
1832 int
1833 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1834 {
1835 /* Under normal circumstances array elements are stored consecutively, so
1836 * the stride is equal to the size of the array element.
1837 */
1838 return type_size(ir->type);
1839 }
1840
1841
1842 void
1843 vec4_visitor::visit(ir_dereference_array *ir)
1844 {
1845 ir_constant *constant_index;
1846 src_reg src;
1847 int array_stride = compute_array_stride(ir);
1848
1849 constant_index = ir->array_index->constant_expression_value();
1850
1851 ir->array->accept(this);
1852 src = this->result;
1853
1854 if (constant_index) {
1855 src.reg_offset += constant_index->value.i[0] * array_stride;
1856 } else {
1857 /* Variable index array dereference. It eats the "vec4" of the
1858 * base of the array and an index that offsets the Mesa register
1859 * index.
1860 */
1861 ir->array_index->accept(this);
1862
1863 src_reg index_reg;
1864
1865 if (array_stride == 1) {
1866 index_reg = this->result;
1867 } else {
1868 index_reg = src_reg(this, glsl_type::int_type);
1869
1870 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1871 }
1872
1873 if (src.reladdr) {
1874 src_reg temp = src_reg(this, glsl_type::int_type);
1875
1876 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1877
1878 index_reg = temp;
1879 }
1880
1881 src.reladdr = ralloc(mem_ctx, src_reg);
1882 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1883 }
1884
1885 /* If the type is smaller than a vec4, replicate the last channel out. */
1886 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1887 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1888 else
1889 src.swizzle = BRW_SWIZZLE_NOOP;
1890 src.type = brw_type_for_base_type(ir->type);
1891
1892 this->result = src;
1893 }
1894
1895 void
1896 vec4_visitor::visit(ir_dereference_record *ir)
1897 {
1898 unsigned int i;
1899 const glsl_type *struct_type = ir->record->type;
1900 int offset = 0;
1901
1902 ir->record->accept(this);
1903
1904 for (i = 0; i < struct_type->length; i++) {
1905 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1906 break;
1907 offset += type_size(struct_type->fields.structure[i].type);
1908 }
1909
1910 /* If the type is smaller than a vec4, replicate the last channel out. */
1911 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1912 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1913 else
1914 this->result.swizzle = BRW_SWIZZLE_NOOP;
1915 this->result.type = brw_type_for_base_type(ir->type);
1916
1917 this->result.reg_offset += offset;
1918 }
1919
1920 /**
1921 * We want to be careful in assignment setup to hit the actual storage
1922 * instead of potentially using a temporary like we might with the
1923 * ir_dereference handler.
1924 */
1925 static dst_reg
1926 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1927 {
1928 /* The LHS must be a dereference. If the LHS is a variable indexed array
1929 * access of a vector, it must be separated into a series of conditional
1930 * moves before reaching this point (see ir_vec_index_to_cond_assign).
1931 */
1932 assert(ir->as_dereference());
1933 ir_dereference_array *deref_array = ir->as_dereference_array();
1934 if (deref_array) {
1935 assert(!deref_array->array->type->is_vector());
1936 }
1937
1938 /* Use the rvalue deref handler for the most part. We'll ignore
1939 * swizzles in it and write swizzles using writemask, though.
1940 */
1941 ir->accept(v);
1942 return dst_reg(v->result);
1943 }
1944
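/* Copy a value of any type from *src to *dst one vec4 at a time, recursing
* through struct fields, array elements and matrix columns.  Both registers'
* reg_offsets are advanced as the copy proceeds, so on return they point one
* past the region that was moved.
*/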
1945 void
1946 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1947 const struct glsl_type *type,
1948 enum brw_predicate predicate)
1949 {
1950 if (type->base_type == GLSL_TYPE_STRUCT) {
1951 for (unsigned int i = 0; i < type->length; i++) {
1952 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1953 }
1954 return;
1955 }
1956
1957 if (type->is_array()) {
1958 for (unsigned int i = 0; i < type->length; i++) {
1959 emit_block_move(dst, src, type->fields.array, predicate);
1960 }
1961 return;
1962 }
1963
1964 if (type->is_matrix()) {
1965 const struct glsl_type *vec_type;
1966
1967 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1968 type->vector_elements, 1);
1969
1970 for (int i = 0; i < type->matrix_columns; i++) {
1971 emit_block_move(dst, src, vec_type, predicate);
1972 }
1973 return;
1974 }
1975
1976 assert(type->is_scalar() || type->is_vector());
1977
1978 dst->type = brw_type_for_base_type(type);
1979 src->type = dst->type;
1980
1981 dst->writemask = (1 << type->vector_elements) - 1;
1982
1983 src->swizzle = swizzle_for_size(type->vector_elements);
1984
1985 vec4_instruction *inst = emit(MOV(*dst, *src));
1986 inst->predicate = predicate;
1987
1988 dst->reg_offset++;
1989 src->reg_offset++;
1990 }
1991
1992
1993 /* If the RHS processing resulted in an instruction generating a
1994 * temporary value, and it would be easy to rewrite the instruction to
1995 * generate its result right into the LHS instead, do so. This ends
1996 * up reliably removing instructions where it can be tricky to do so
1997 * later without real UD chain information.
1998 */
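/* For example (illustrative): instead of emitting "ADD tmp, a, b" for the RHS
* and then "MOV dst, tmp" for the assignment, the ADD's destination is
* rewritten to dst (masked by the LHS writemask) and the copy is never
* emitted, provided the checks below succeed.
*/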
1999 bool
2000 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2001 dst_reg dst,
2002 src_reg src,
2003 vec4_instruction *pre_rhs_inst,
2004 vec4_instruction *last_rhs_inst)
2005 {
2006 /* This could be supported, but it would take more smarts. */
2007 if (ir->condition)
2008 return false;
2009
2010 if (pre_rhs_inst == last_rhs_inst)
2011 return false; /* No instructions generated to work with. */
2012
2013 /* Make sure the last instruction generated our source reg. */
2014 if (src.file != GRF ||
2015 src.file != last_rhs_inst->dst.file ||
2016 src.reg != last_rhs_inst->dst.reg ||
2017 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2018 src.reladdr ||
2019 src.abs ||
2020 src.negate ||
2021 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2022 return false;
2023
2024 /* Check that the last instruction fully initialized the channels
2025 * we want to use, in the order we want to use them. We could
2026 * potentially reswizzle the operands of many instructions so that
2027 * we could handle out of order channels, but don't yet.
2028 */
2029
2030 for (unsigned i = 0; i < 4; i++) {
2031 if (dst.writemask & (1 << i)) {
2032 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2033 return false;
2034
2035 if (BRW_GET_SWZ(src.swizzle, i) != i)
2036 return false;
2037 }
2038 }
2039
2040 /* Success! Rewrite the instruction. */
2041 last_rhs_inst->dst.file = dst.file;
2042 last_rhs_inst->dst.reg = dst.reg;
2043 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2044 last_rhs_inst->dst.reladdr = dst.reladdr;
2045 last_rhs_inst->dst.writemask &= dst.writemask;
2046
2047 return true;
2048 }
2049
2050 void
2051 vec4_visitor::visit(ir_assignment *ir)
2052 {
2053 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2054 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2055
2056 if (!ir->lhs->type->is_scalar() &&
2057 !ir->lhs->type->is_vector()) {
2058 ir->rhs->accept(this);
2059 src_reg src = this->result;
2060
2061 if (ir->condition) {
2062 emit_bool_to_cond_code(ir->condition, &predicate);
2063 }
2064
2065 /* emit_block_move doesn't account for swizzles in the source register.
2066 * This should be ok, since the source register is a structure or an
2067 * array, and those can't be swizzled. But double-check to be sure.
2068 */
2069 assert(src.swizzle ==
2070 (ir->rhs->type->is_matrix()
2071 ? swizzle_for_size(ir->rhs->type->vector_elements)
2072 : BRW_SWIZZLE_NOOP));
2073
2074 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2075 return;
2076 }
2077
2078 /* Now we're down to just a scalar/vector with writemasks. */
2079 int i;
2080
2081 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2082 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2083
2084 ir->rhs->accept(this);
2085
2086 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2087
2088 src_reg src = this->result;
2089
2090 int swizzles[4];
2091 int first_enabled_chan = 0;
2092 int src_chan = 0;
2093
2094 assert(ir->lhs->type->is_vector() ||
2095 ir->lhs->type->is_scalar());
2096 dst.writemask = ir->write_mask;
2097
2098 for (int i = 0; i < 4; i++) {
2099 if (dst.writemask & (1 << i)) {
2100 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2101 break;
2102 }
2103 }
2104
2105 /* Swizzle a small RHS vector into the channels being written.
2106 *
2107 * glsl ir treats write_mask as dictating how many channels are
2108 * present on the RHS while in our instructions we need to make
2109 * those channels appear in the slots of the vec4 they're written to.
2110 */
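/* Worked example (illustrative): for "v.yz = someVec2", write_mask covers the
* y and z channels and the vec2 RHS arrives swizzled as .xyyy, so the loop
* below builds the swizzle .yxyy -- the RHS's two channels land in the y and
* z slots, and the unwritten slots merely replicate a channel that is really
* part of the RHS.
*/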
2111 for (int i = 0; i < 4; i++) {
2112 if (dst.writemask & (1 << i))
2113 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2114 else
2115 swizzles[i] = first_enabled_chan;
2116 }
2117 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2118 swizzles[2], swizzles[3]);
2119
2120 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2121 return;
2122 }
2123
2124 if (ir->condition) {
2125 emit_bool_to_cond_code(ir->condition, &predicate);
2126 }
2127
2128 for (i = 0; i < type_size(ir->lhs->type); i++) {
2129 vec4_instruction *inst = emit(MOV(dst, src));
2130 inst->predicate = predicate;
2131
2132 dst.reg_offset++;
2133 src.reg_offset++;
2134 }
2135 }
2136
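/* Emit MOVs of immediates to materialize the constant @ir into *dst,
* recursing through struct fields, array elements and matrix columns.
* Vector channels that share the same value are folded into a single
* writemasked MOV.
*/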
2137 void
2138 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2139 {
2140 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2141 foreach_in_list(ir_constant, field_value, &ir->components) {
2142 emit_constant_values(dst, field_value);
2143 }
2144 return;
2145 }
2146
2147 if (ir->type->is_array()) {
2148 for (unsigned int i = 0; i < ir->type->length; i++) {
2149 emit_constant_values(dst, ir->array_elements[i]);
2150 }
2151 return;
2152 }
2153
2154 if (ir->type->is_matrix()) {
2155 for (int i = 0; i < ir->type->matrix_columns; i++) {
2156 float *vec = &ir->value.f[i * ir->type->vector_elements];
2157
2158 for (int j = 0; j < ir->type->vector_elements; j++) {
2159 dst->writemask = 1 << j;
2160 dst->type = BRW_REGISTER_TYPE_F;
2161
2162 emit(MOV(*dst, src_reg(vec[j])));
2163 }
2164 dst->reg_offset++;
2165 }
2166 return;
2167 }
2168
2169 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2170
2171 for (int i = 0; i < ir->type->vector_elements; i++) {
2172 if (!(remaining_writemask & (1 << i)))
2173 continue;
2174
2175 dst->writemask = 1 << i;
2176 dst->type = brw_type_for_base_type(ir->type);
2177
2178 /* Find other components that match the one we're about to
2179 * write. Emits fewer instructions for things like vec4(0.5,
2180 * 1.5, 1.5, 1.5).
2181 */
2182 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2183 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2184 if (ir->value.b[i] == ir->value.b[j])
2185 dst->writemask |= (1 << j);
2186 } else {
2187 /* u, i, and f storage all line up, so no need for a
2188 * switch case for comparing each type.
2189 */
2190 if (ir->value.u[i] == ir->value.u[j])
2191 dst->writemask |= (1 << j);
2192 }
2193 }
2194
2195 switch (ir->type->base_type) {
2196 case GLSL_TYPE_FLOAT:
2197 emit(MOV(*dst, src_reg(ir->value.f[i])));
2198 break;
2199 case GLSL_TYPE_INT:
2200 emit(MOV(*dst, src_reg(ir->value.i[i])));
2201 break;
2202 case GLSL_TYPE_UINT:
2203 emit(MOV(*dst, src_reg(ir->value.u[i])));
2204 break;
2205 case GLSL_TYPE_BOOL:
2206 emit(MOV(*dst, src_reg(ir->value.b[i])));
2207 break;
2208 default:
2209 unreachable("Non-float/uint/int/bool constant");
2210 }
2211
2212 remaining_writemask &= ~dst->writemask;
2213 }
2214 dst->reg_offset++;
2215 }
2216
2217 void
2218 vec4_visitor::visit(ir_constant *ir)
2219 {
2220 dst_reg dst = dst_reg(this, ir->type);
2221 this->result = src_reg(dst);
2222
2223 emit_constant_values(&dst, ir);
2224 }
2225
2226 void
2227 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2228 {
2229 ir_dereference *deref = static_cast<ir_dereference *>(
2230 ir->actual_parameters.get_head());
2231 ir_variable *location = deref->variable_referenced();
2232 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2233 location->data.atomic.buffer_index);
2234
2235 /* Calculate the surface offset */
2236 src_reg offset(this, glsl_type::uint_type);
2237 ir_dereference_array *deref_array = deref->as_dereference_array();
2238 if (deref_array) {
2239 deref_array->array_index->accept(this);
2240
2241 src_reg tmp(this, glsl_type::uint_type);
2242 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2243 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2244 } else {
2245 offset = location->data.atomic.offset;
2246 }
2247
2248 /* Emit the appropriate machine instruction */
2249 const char *callee = ir->callee->function_name();
2250 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2251
2252 if (!strcmp("__intrinsic_atomic_read", callee)) {
2253 emit_untyped_surface_read(surf_index, dst, offset);
2254
2255 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2256 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2257 src_reg(), src_reg());
2258
2259 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2260 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2261 src_reg(), src_reg());
2262 }
2263 }
2264
2265 void
2266 vec4_visitor::visit(ir_call *ir)
2267 {
2268 const char *callee = ir->callee->function_name();
2269
2270 if (!strcmp("__intrinsic_atomic_read", callee) ||
2271 !strcmp("__intrinsic_atomic_increment", callee) ||
2272 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2273 visit_atomic_counter_intrinsic(ir);
2274 } else {
2275 unreachable("Unsupported intrinsic.");
2276 }
2277 }
2278
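/* Fetch the MCS (multisample control surface) word for a texel, used as an
* extra argument of the gen7+ compressed multisample texel fetch: emits a
* TXF_MCS message carrying the coordinate (with LOD forced to zero, per the
* comment below) and returns its uvec4 destination.
*/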
2279 src_reg
2280 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2281 {
2282 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2283 inst->base_mrf = 2;
2284 inst->mlen = 1;
2285 inst->sampler = sampler;
2286 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2287 inst->dst.writemask = WRITEMASK_XYZW;
2288
2289 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2290 int param_base = inst->base_mrf;
2291 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2292 int zero_mask = 0xf & ~coord_mask;
2293
2294 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2295 coordinate));
2296
2297 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2298 src_reg(0)));
2299
2300 emit(inst);
2301 return src_reg(inst->dst);
2302 }
2303
2304 void
2305 vec4_visitor::visit(ir_texture *ir)
2306 {
2307 int sampler =
2308 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2309
2310 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2311 * emitting anything other than setting up the constant result.
2312 */
2313 if (ir->op == ir_tg4) {
2314 ir_constant *chan = ir->lod_info.component->as_constant();
2315 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2316 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2317 dst_reg result(this, ir->type);
2318 this->result = src_reg(result);
2319 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2320 return;
2321 }
2322 }
2323
2324 /* Should be lowered by do_lower_texture_projection */
2325 assert(!ir->projector);
2326
2327 /* Should be lowered */
2328 assert(!ir->offset || !ir->offset->type->is_array());
2329
2330 /* Generate code to compute all the subexpression trees. This has to be
2331 * done before loading any values into MRFs for the sampler message since
2332 * generating these values may involve SEND messages that need the MRFs.
2333 */
2334 src_reg coordinate;
2335 if (ir->coordinate) {
2336 ir->coordinate->accept(this);
2337 coordinate = this->result;
2338 }
2339
2340 src_reg shadow_comparitor;
2341 if (ir->shadow_comparitor) {
2342 ir->shadow_comparitor->accept(this);
2343 shadow_comparitor = this->result;
2344 }
2345
2346 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2347 src_reg offset_value;
2348 if (has_nonconstant_offset) {
2349 ir->offset->accept(this);
2350 offset_value = src_reg(this->result);
2351 }
2352
2353 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2354 src_reg lod, dPdx, dPdy, sample_index, mcs;
2355 switch (ir->op) {
2356 case ir_tex:
2357 lod = src_reg(0.0f);
2358 lod_type = glsl_type::float_type;
2359 break;
2360 case ir_txf:
2361 case ir_txl:
2362 case ir_txs:
2363 ir->lod_info.lod->accept(this);
2364 lod = this->result;
2365 lod_type = ir->lod_info.lod->type;
2366 break;
2367 case ir_query_levels:
2368 lod = src_reg(0);
2369 lod_type = glsl_type::int_type;
2370 break;
2371 case ir_txf_ms:
2372 ir->lod_info.sample_index->accept(this);
2373 sample_index = this->result;
2374 sample_index_type = ir->lod_info.sample_index->type;
2375
2376 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2377 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2378 else
2379 mcs = src_reg(0u);
2380 break;
2381 case ir_txd:
2382 ir->lod_info.grad.dPdx->accept(this);
2383 dPdx = this->result;
2384
2385 ir->lod_info.grad.dPdy->accept(this);
2386 dPdy = this->result;
2387
2388 lod_type = ir->lod_info.grad.dPdx->type;
2389 break;
2390 case ir_txb:
2391 case ir_lod:
2392 case ir_tg4:
2393 break;
2394 }
2395
2396 enum opcode opcode;
2397 switch (ir->op) {
2398 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2399 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2400 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2401 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2402 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2403 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2404 case ir_tg4: opcode = has_nonconstant_offset
2405 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2406 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2407 case ir_txb:
2408 unreachable("TXB is not valid for vertex shaders.");
2409 case ir_lod:
2410 unreachable("LOD is not valid for vertex shaders.");
2411 default:
2412 unreachable("Unrecognized tex op");
2413 }
2414
2415 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2416
2417 if (ir->offset != NULL && ir->op != ir_txf)
2418 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2419
2420 /* Stuff the channel select bits in the top of the texture offset */
2421 if (ir->op == ir_tg4)
2422 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2423
2424 /* The message header is necessary for:
2425 * - Gen4 (always)
2426 * - Texel offsets
2427 * - Gather channel selection
2428 * - Sampler indices too large to fit in a 4-bit value.
2429 */
2430 inst->header_present =
2431 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2432 sampler >= 16;
2433 inst->base_mrf = 2;
2434 inst->mlen = inst->header_present + 1; /* always at least one */
2435 inst->sampler = sampler;
2436 inst->dst = dst_reg(this, ir->type);
2437 inst->dst.writemask = WRITEMASK_XYZW;
2438 inst->shadow_compare = ir->shadow_comparitor != NULL;
2439
2440 /* MRF for the first parameter */
2441 int param_base = inst->base_mrf + inst->header_present;
2442
2443 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2444 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2445 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2446 } else {
2447 /* Load the coordinate */
2448 /* FINISHME: gl_clamp_mask and saturate */
2449 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2450 int zero_mask = 0xf & ~coord_mask;
2451
2452 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2453 coordinate));
2454
2455 if (zero_mask != 0) {
2456 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2457 src_reg(0)));
2458 }
2459 /* Load the shadow comparitor */
2460 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2461 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2462 WRITEMASK_X),
2463 shadow_comparitor));
2464 inst->mlen++;
2465 }
2466
2467 /* Load the LOD info */
2468 if (ir->op == ir_tex || ir->op == ir_txl) {
2469 int mrf, writemask;
2470 if (brw->gen >= 5) {
2471 mrf = param_base + 1;
2472 if (ir->shadow_comparitor) {
2473 writemask = WRITEMASK_Y;
2474 /* mlen already incremented */
2475 } else {
2476 writemask = WRITEMASK_X;
2477 inst->mlen++;
2478 }
2479 } else /* brw->gen == 4 */ {
2480 mrf = param_base;
2481 writemask = WRITEMASK_W;
2482 }
2483 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2484 } else if (ir->op == ir_txf) {
2485 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2486 } else if (ir->op == ir_txf_ms) {
2487 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2488 sample_index));
2489 if (brw->gen >= 7) {
2490 /* MCS data is in the first channel of `mcs`, but we need to get it into
2491 * the .y channel of the second vec4 of params, so replicate .x across
2492 * the whole vec4 and then mask off everything except .y
2493 */
2494 mcs.swizzle = BRW_SWIZZLE_XXXX;
2495 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2496 mcs));
}
2497 inst->mlen++;
2498 } else if (ir->op == ir_txd) {
2499 const glsl_type *type = lod_type;
2500
2501 if (brw->gen >= 5) {
2502 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2503 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2504 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2505 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2506 inst->mlen++;
2507
2508 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2509 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2510 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2511 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2512 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2513 inst->mlen++;
2514
2515 if (ir->shadow_comparitor) {
2516 emit(MOV(dst_reg(MRF, param_base + 2,
2517 ir->shadow_comparitor->type, WRITEMASK_Z),
2518 shadow_comparitor));
2519 }
2520 }
2521 } else /* brw->gen == 4 */ {
2522 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2523 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2524 inst->mlen += 2;
2525 }
2526 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2527 if (ir->shadow_comparitor) {
2528 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2529 shadow_comparitor));
2530 }
2531
2532 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2533 offset_value));
2534 inst->mlen++;
2535 }
2536 }
2537
2538 emit(inst);
2539
2540 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2541 * spec requires layers.
2542 */
2543 if (ir->op == ir_txs) {
2544 glsl_type const *type = ir->sampler->type;
2545 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2546 type->sampler_array) {
2547 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2548 writemask(inst->dst, WRITEMASK_Z),
2549 src_reg(inst->dst), src_reg(6));
2550 }
2551 }
2552
2553 if (brw->gen == 6 && ir->op == ir_tg4) {
2554 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2555 }
2556
2557 swizzle_result(ir, src_reg(inst->dst), sampler);
2558 }
2559
2560 /**
2561 * Apply workarounds for Gen6 gather with UINT/SINT
2562 */
2563 void
2564 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2565 {
2566 if (!wa)
2567 return;
2568
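/* Worked example (illustrative): with WA_8BIT | WA_SIGN the sampler returned
* the texel as UNORM8, so a stored value of -1 (0xff) comes back as 1.0.
* Multiplying by 255 recovers 255, and the SHL/ASR pair below shifts by 24
* bits to sign-extend that back to -1.
*/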
2569 int width = (wa & WA_8BIT) ? 8 : 16;
2570 dst_reg dst_f = dst;
2571 dst_f.type = BRW_REGISTER_TYPE_F;
2572
2573 /* Convert from UNORM to UINT */
2574 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2575 emit(MOV(dst, src_reg(dst_f)));
2576
2577 if (wa & WA_SIGN) {
2578 /* Reinterpret the UINT value as a signed INT value by
2579 * shifting the sign bit into place, then shifting back
2580 * preserving sign.
2581 */
2582 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2583 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2584 }
2585 }
2586
2587 /**
2588 * Set up the gather channel based on the swizzle, for gather4.
2589 */
2590 uint32_t
2591 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2592 {
2593 ir_constant *chan = ir->lod_info.component->as_constant();
2594 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2595 switch (swiz) {
2596 case SWIZZLE_X: return 0;
2597 case SWIZZLE_Y:
2598 /* gather4 sampler is broken for green channel on RG32F --
2599 * we must ask for blue instead.
2600 */
2601 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2602 return 2;
2603 return 1;
2604 case SWIZZLE_Z: return 2;
2605 case SWIZZLE_W: return 3;
2606 default:
2607 unreachable("Not reached"); /* zero, one swizzles handled already */
2608 }
2609 }
2610
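/* Apply the GL texture swizzle for this sampler to the raw texture result:
* channels that copy from the result become a source swizzle on one MOV,
* while SWIZZLE_ZERO/SWIZZLE_ONE channels are written with immediate 0.0f or
* 1.0f MOVs.  txs, tg4, shadow (float) results and no-op swizzles are passed
* through unchanged.
*/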
2611 void
2612 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2613 {
2614 int s = key->tex.swizzles[sampler];
2615
2616 this->result = src_reg(this, ir->type);
2617 dst_reg swizzled_result(this->result);
2618
2619 if (ir->op == ir_query_levels) {
2620 /* # levels is in .w */
2621 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2622 emit(MOV(swizzled_result, orig_val));
2623 return;
2624 }
2625
2626 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2627 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2628 emit(MOV(swizzled_result, orig_val));
2629 return;
2630 }
2631
2632
2633 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2634 int swizzle[4] = {0};
2635
2636 for (int i = 0; i < 4; i++) {
2637 switch (GET_SWZ(s, i)) {
2638 case SWIZZLE_ZERO:
2639 zero_mask |= (1 << i);
2640 break;
2641 case SWIZZLE_ONE:
2642 one_mask |= (1 << i);
2643 break;
2644 default:
2645 copy_mask |= (1 << i);
2646 swizzle[i] = GET_SWZ(s, i);
2647 break;
2648 }
2649 }
2650
2651 if (copy_mask) {
2652 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2653 swizzled_result.writemask = copy_mask;
2654 emit(MOV(swizzled_result, orig_val));
2655 }
2656
2657 if (zero_mask) {
2658 swizzled_result.writemask = zero_mask;
2659 emit(MOV(swizzled_result, src_reg(0.0f)));
2660 }
2661
2662 if (one_mask) {
2663 swizzled_result.writemask = one_mask;
2664 emit(MOV(swizzled_result, src_reg(1.0f)));
2665 }
2666 }
2667
2668 void
2669 vec4_visitor::visit(ir_return *)
2670 {
2671 unreachable("not reached");
2672 }
2673
2674 void
2675 vec4_visitor::visit(ir_discard *)
2676 {
2677 unreachable("not reached");
2678 }
2679
2680 void
2681 vec4_visitor::visit(ir_if *ir)
2682 {
2683 /* Don't point the annotation at the if statement, because then it plus
2684 * the then and else blocks get printed.
2685 */
2686 this->base_ir = ir->condition;
2687
2688 if (brw->gen == 6) {
2689 emit_if_gen6(ir);
2690 } else {
2691 enum brw_predicate predicate;
2692 emit_bool_to_cond_code(ir->condition, &predicate);
2693 emit(IF(predicate));
2694 }
2695
2696 visit_instructions(&ir->then_instructions);
2697
2698 if (!ir->else_instructions.is_empty()) {
2699 this->base_ir = ir->condition;
2700 emit(BRW_OPCODE_ELSE);
2701
2702 visit_instructions(&ir->else_instructions);
2703 }
2704
2705 this->base_ir = ir->condition;
2706 emit(BRW_OPCODE_ENDIF);
2707 }
2708
2709 void
2710 vec4_visitor::visit(ir_emit_vertex *)
2711 {
2712 unreachable("not reached");
2713 }
2714
2715 void
2716 vec4_visitor::visit(ir_end_primitive *)
2717 {
2718 unreachable("not reached");
2719 }
2720
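/* Build the MRF payload for an untyped atomic: one register for the surface
* offset plus one for each operand that is present, then emit the
* SHADER_OPCODE_UNTYPED_ATOMIC send with the atomic opcode and surface index
* as immediate sources.
*/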
2721 void
2722 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2723 dst_reg dst, src_reg offset,
2724 src_reg src0, src_reg src1)
2725 {
2726 unsigned mlen = 0;
2727
2728 /* Set the atomic operation offset. */
2729 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2730 mlen++;
2731
2732 /* Set the atomic operation arguments. */
2733 if (src0.file != BAD_FILE) {
2734 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2735 mlen++;
2736 }
2737
2738 if (src1.file != BAD_FILE) {
2739 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2740 mlen++;
2741 }
2742
2743 /* Emit the instruction. Note that this maps to the normal SIMD8
2744 * untyped atomic message on Ivy Bridge, but that's OK because
2745 * unused channels will be masked out.
2746 */
2747 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2748 src_reg(atomic_op), src_reg(surf_index));
2749 inst->base_mrf = 0;
2750 inst->mlen = mlen;
2751 }
2752
2753 void
2754 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2755 src_reg offset)
2756 {
2757 /* Set the surface read offset. */
2758 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2759
2760 /* Emit the instruction. Note that this maps to the normal SIMD8
2761 * untyped surface read message, but that's OK because unused
2762 * channels will be masked out.
2763 */
2764 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2765 dst, src_reg(surf_index));
2766 inst->base_mrf = 0;
2767 inst->mlen = 1;
2768 }
2769
2770 void
2771 vec4_visitor::emit_ndc_computation()
2772 {
2773 /* Get the position */
2774 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2775
2776 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2777 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2778 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2779
2780 current_annotation = "NDC";
2781 dst_reg ndc_w = ndc;
2782 ndc_w.writemask = WRITEMASK_W;
2783 src_reg pos_w = pos;
2784 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2785 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2786
2787 dst_reg ndc_xyz = ndc;
2788 ndc_xyz.writemask = WRITEMASK_XYZ;
2789
2790 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2791 }
2792
2793 void
2794 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2795 {
2796 if (brw->gen < 6 &&
2797 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2798 key->userclip_active || brw->has_negative_rhw_bug)) {
2799 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2800 dst_reg header1_w = header1;
2801 header1_w.writemask = WRITEMASK_W;
2802
2803 emit(MOV(header1, 0u));
2804
2805 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2806 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2807
2808 current_annotation = "Point size";
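/* Scale by 2^11 and mask so the point width lands in the 11-bit field at
* bits 8..18 of the header dword, keeping it clear of the clip-flag bits
* that are ORed into the low byte below.
*/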
2809 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2810 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2811 }
2812
2813 if (key->userclip_active) {
2814 current_annotation = "Clipping flags";
2815 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2816 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2817
2818 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2819 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2820 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2821
2822 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2823 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2824 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2825 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2826 }
2827
2828 /* i965 clipping workaround:
2829 * 1) Test for -ve rhw
2830 * 2) If set,
2831 * set ndc = (0,0,0,0)
2832 * set ucp[6] = 1
2833 *
2834 * Later, clipping will detect ucp[6] and ensure the primitive is
2835 * clipped against all fixed planes.
2836 */
2837 if (brw->has_negative_rhw_bug) {
2838 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2839 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2840 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2841 vec4_instruction *inst;
2842 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2843 inst->predicate = BRW_PREDICATE_NORMAL;
2844 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2845 inst->predicate = BRW_PREDICATE_NORMAL;
2846 }
2847
2848 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2849 } else if (brw->gen < 6) {
2850 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2851 } else {
2852 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2853 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2854 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2855 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2856 }
2857 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2858 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2859 src_reg(output_reg[VARYING_SLOT_LAYER])));
2860 }
2861 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2862 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2863 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2864 }
2865 }
2866 }
2867
2868 void
2869 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2870 {
2871 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2872 *
2873 * "If a linked set of shaders forming the vertex stage contains no
2874 * static write to gl_ClipVertex or gl_ClipDistance, but the
2875 * application has requested clipping against user clip planes through
2876 * the API, then the coordinate written to gl_Position is used for
2877 * comparison against the user clip planes."
2878 *
2879 * This function is only called if the shader didn't write to
2880 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2881 * if the user wrote to it; otherwise we use gl_Position.
2882 */
2883 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2884 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2885 clip_vertex = VARYING_SLOT_POS;
2886 }
2887
2888 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2889 ++i) {
2890 reg.writemask = 1 << i;
2891 emit(DP4(reg,
2892 src_reg(output_reg[clip_vertex]),
2893 src_reg(this->userplane[i + offset])));
2894 }
2895 }
2896
2897 void
2898 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2899 {
2900 assert (varying < VARYING_SLOT_MAX);
2901 reg.type = output_reg[varying].type;
2902 current_annotation = output_reg_annotation[varying];
2903 /* Copy the register, saturating if necessary */
2904 vec4_instruction *inst = emit(MOV(reg,
2905 src_reg(output_reg[varying])));
2906 if ((varying == VARYING_SLOT_COL0 ||
2907 varying == VARYING_SLOT_COL1 ||
2908 varying == VARYING_SLOT_BFC0 ||
2909 varying == VARYING_SLOT_BFC1) &&
2910 key->clamp_vertex_color) {
2911 inst->saturate = true;
2912 }
2913 }
2914
2915 void
2916 vec4_visitor::emit_urb_slot(int mrf, int varying)
2917 {
2918 struct brw_reg hw_reg = brw_message_reg(mrf);
2919 dst_reg reg = dst_reg(MRF, mrf);
2920 reg.type = BRW_REGISTER_TYPE_F;
2921
2922 switch (varying) {
2923 case VARYING_SLOT_PSIZ:
2924 /* PSIZ is always in slot 0, and is coupled with other flags. */
2925 current_annotation = "indices, point width, clip flags";
2926 emit_psiz_and_flags(hw_reg);
2927 break;
2928 case BRW_VARYING_SLOT_NDC:
2929 current_annotation = "NDC";
2930 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2931 break;
2932 case VARYING_SLOT_POS:
2933 current_annotation = "gl_Position";
2934 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2935 break;
2936 case VARYING_SLOT_EDGE:
2937 /* This is present when doing unfilled polygons. We're supposed to copy
2938 * the edge flag from the user-provided vertex array
2939 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2940 * of that attribute (starts as 1.0f). This is then used in clipping to
2941 * determine which edges should be drawn as wireframe.
2942 */
2943 current_annotation = "edge flag";
2944 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2945 glsl_type::float_type, WRITEMASK_XYZW))));
2946 break;
2947 case BRW_VARYING_SLOT_PAD:
2948 /* No need to write to this slot */
2949 break;
2950 default:
2951 emit_generic_urb_slot(reg, varying);
2952 break;
2953 }
2954 }
2955
2956 static int
2957 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2958 {
2959 if (brw->gen >= 6) {
2960 /* URB data written (does not include the message header reg) must
2961 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2962 * section 5.4.3.2.2: URB_INTERLEAVED.
2963 *
2964 * URB entries are allocated on a multiple of 1024 bits, so an
2965 * extra 128 bits written here to make the end align to 256 is
2966 * no problem.
2967 */
2968 if ((mlen % 2) != 1)
2969 mlen++;
2970 }
2971
2972 return mlen;
2973 }
2974
2975
2976 /**
2977 * Generates the VUE payload plus the necessary URB write instructions to
2978 * output it.
2979 *
2980 * The VUE layout is documented in Volume 2a.
2981 */
2982 void
2983 vec4_visitor::emit_vertex()
2984 {
2985 /* MRF 0 is reserved for the debugger, so start with message header
2986 * in MRF 1.
2987 */
2988 int base_mrf = 1;
2989 int mrf = base_mrf;
2990 /* In the process of generating our URB write message contents, we
2991 * may need to unspill a register or load from an array. Those
2992 * reads would use MRFs 14-15.
2993 */
2994 int max_usable_mrf = 13;
2995
2996 /* The following assertion verifies that max_usable_mrf causes an
2997 * even-numbered amount of URB write data, which will meet gen6's
2998 * requirements for length alignment.
2999 */
3000 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3001
3002 /* First mrf is the g0-based message header containing URB handles and
3003 * such.
3004 */
3005 emit_urb_write_header(mrf++);
3006
3007 if (brw->gen < 6) {
3008 emit_ndc_computation();
3009 }
3010
3011 /* Lower legacy ff and ClipVertex clipping to clip distances */
3012 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3013 current_annotation = "user clip distances";
3014
3015 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3016 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3017
3018 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3019 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3020 }
3021
3022 /* We may need to split this up into several URB writes, so do them in a
3023 * loop.
3024 */
3025 int slot = 0;
3026 bool complete = false;
3027 do {
3028 /* URB offset is in URB row increments, and each of our MRFs is half of
3029 * one of those, since we're doing interleaved writes.
3030 */
3031 int offset = slot / 2;
3032
3033 mrf = base_mrf + 1;
3034 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3035 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3036
3037 /* If this was max_usable_mrf, we can't fit anything more into this
3038 * URB WRITE.
3039 */
3040 if (mrf > max_usable_mrf) {
3041 slot++;
3042 break;
3043 }
3044 }
3045
3046 complete = slot >= prog_data->vue_map.num_slots;
3047 current_annotation = "URB write";
3048 vec4_instruction *inst = emit_urb_write_opcode(complete);
3049 inst->base_mrf = base_mrf;
3050 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3051 inst->offset += offset;
3052 } while (!complete);
3053 }
3054
3055
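/* Compute the message source describing which scratch slot to access:
* reg_offset (plus the optional dynamic *reladdr) scaled into the units the
* scratch message expects -- rows of two interleaved vec4s, and byte units
* rather than vec4s on pre-gen6 hardware.
*/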
3056 src_reg
3057 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3058 src_reg *reladdr, int reg_offset)
3059 {
3060 /* Because we store the values to scratch interleaved like our
3061 * vertex data, we need to scale the vec4 index by 2.
3062 */
3063 int message_header_scale = 2;
3064
3065 /* Pre-gen6, the message header uses byte offsets instead of vec4
3066 * (16-byte) offset units.
3067 */
3068 if (brw->gen < 6)
3069 message_header_scale *= 16;
3070
3071 if (reladdr) {
3072 src_reg index = src_reg(this, glsl_type::int_type);
3073
3074 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3075 emit_before(inst, MUL(dst_reg(index),
3076 index, src_reg(message_header_scale)));
3077
3078 return index;
3079 } else {
3080 return src_reg(reg_offset * message_header_scale);
3081 }
3082 }
3083
3084 src_reg
3085 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3086 src_reg *reladdr, int reg_offset)
3087 {
3088 if (reladdr) {
3089 src_reg index = src_reg(this, glsl_type::int_type);
3090
3091 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3092
3093 /* Pre-gen6, the message header uses byte offsets instead of vec4
3094 * (16-byte) offset units.
3095 */
3096 if (brw->gen < 6) {
3097 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3098 }
3099
3100 return index;
3101 } else if (brw->gen >= 8) {
3102 /* Store the offset in a GRF so we can send-from-GRF. */
3103 src_reg offset = src_reg(this, glsl_type::int_type);
3104 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3105 return offset;
3106 } else {
3107 int message_header_scale = brw->gen < 6 ? 16 : 1;
3108 return src_reg(reg_offset * message_header_scale);
3109 }
3110 }
3111
3112 /**
3113 * Emits an instruction before @inst to load the value named by @orig_src
3114 * from scratch space at @base_offset to @temp.
3115 *
3116 * @base_offset is measured in 32-byte units (the size of a register).
3117 */
3118 void
3119 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3120 dst_reg temp, src_reg orig_src,
3121 int base_offset)
3122 {
3123 int reg_offset = base_offset + orig_src.reg_offset;
3124 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3125
3126 emit_before(inst, SCRATCH_READ(temp, index));
3127 }
3128
3129 /**
3130 * Emits an instruction after @inst to store the value to be written
3131 * to @orig_dst to scratch space at @base_offset, from @temp.
3132 *
3133 * @base_offset is measured in 32-byte units (the size of a register).
3134 */
3135 void
3136 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3137 {
3138 int reg_offset = base_offset + inst->dst.reg_offset;
3139 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3140
3141 /* Create a temporary register to store *inst's result in.
3142 *
3143 * We have to be careful in MOVing from our temporary result register in
3144 * the scratch write. If we swizzle from channels of the temporary that
3145 * weren't initialized, it will confuse live interval analysis, which will
3146 * make spilling fail to make progress.
3147 */
3148 src_reg temp = src_reg(this, glsl_type::vec4_type);
3149 temp.type = inst->dst.type;
3150 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3151 int swizzles[4];
3152 for (int i = 0; i < 4; i++)
3153 if (inst->dst.writemask & (1 << i))
3154 swizzles[i] = i;
3155 else
3156 swizzles[i] = first_writemask_chan;
3157 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3158 swizzles[2], swizzles[3]);
3159
3160 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3161 inst->dst.writemask));
3162 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3163 write->predicate = inst->predicate;
3164 write->ir = inst->ir;
3165 write->annotation = inst->annotation;
3166 inst->insert_after(write);
3167
3168 inst->dst.file = temp.file;
3169 inst->dst.reg = temp.reg;
3170 inst->dst.reg_offset = temp.reg_offset;
3171 inst->dst.reladdr = NULL;
3172 }
3173
3174 /**
3175 * We can't generally support array access in GRF space, because a
3176 * single instruction's destination can only span 2 contiguous
3177 * registers. So, we send all GRF arrays that get variable index
3178 * access to scratch space.
3179 */
3180 void
3181 vec4_visitor::move_grf_array_access_to_scratch()
3182 {
3183 int scratch_loc[this->virtual_grf_count];
3184
3185 for (int i = 0; i < this->virtual_grf_count; i++) {
3186 scratch_loc[i] = -1;
3187 }
3188
3189 /* First, calculate the set of virtual GRFs that need to be punted
3190 * to scratch due to having any array access on them, and where in
3191 * scratch.
3192 */
3193 foreach_in_list(vec4_instruction, inst, &instructions) {
3194 if (inst->dst.file == GRF && inst->dst.reladdr &&
3195 scratch_loc[inst->dst.reg] == -1) {
3196 scratch_loc[inst->dst.reg] = c->last_scratch;
3197 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3198 }
3199
3200 for (int i = 0 ; i < 3; i++) {
3201 src_reg *src = &inst->src[i];
3202
3203 if (src->file == GRF && src->reladdr &&
3204 scratch_loc[src->reg] == -1) {
3205 scratch_loc[src->reg] = c->last_scratch;
3206 c->last_scratch += this->virtual_grf_sizes[src->reg];
3207 }
3208 }
3209 }
3210
3211 /* Now, for anything that will be accessed through scratch, rewrite
3212 * it to load/store. Note that this is a _safe list walk, because
3213 * we may generate a new scratch_write instruction after the one
3214 * we're processing.
3215 */
3216 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3217 /* Set up the annotation tracking for new generated instructions. */
3218 base_ir = inst->ir;
3219 current_annotation = inst->annotation;
3220
3221 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3222 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3223 }
3224
3225 for (int i = 0 ; i < 3; i++) {
3226 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3227 continue;
3228
3229 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3230
3231 emit_scratch_read(inst, temp, inst->src[i],
3232 scratch_loc[inst->src[i].reg]);
3233
3234 inst->src[i].file = temp.file;
3235 inst->src[i].reg = temp.reg;
3236 inst->src[i].reg_offset = temp.reg_offset;
3237 inst->src[i].reladdr = NULL;
3238 }
3239 }
3240 }
3241
3242 /**
3243 * Emits an instruction before @inst to load the value named by @orig_src
3244 * from the pull constant buffer (surface) at @base_offset to @temp.
3245 */
3246 void
3247 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3248 dst_reg temp, src_reg orig_src,
3249 int base_offset)
3250 {
3251 int reg_offset = base_offset + orig_src.reg_offset;
3252 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3253 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3254 vec4_instruction *load;
3255
3256 if (brw->gen >= 7) {
3257 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3258 grf_offset.type = offset.type;
3259 emit_before(inst, MOV(grf_offset, offset));
3260
3261 load = new(mem_ctx) vec4_instruction(this,
3262 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3263 temp, index, src_reg(grf_offset));
3264 } else {
3265 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3266 temp, index, offset);
3267 load->base_mrf = 14;
3268 load->mlen = 1;
3269 }
3270 emit_before(inst, load);
3271 }
3272
3273 /**
3274 * Implements array access of uniforms by inserting a
3275 * PULL_CONSTANT_LOAD instruction.
3276 *
3277 * Unlike temporary GRF array access (where we don't support it due to
3278 * the difficulty of doing relative addressing on instruction
3279 * destinations), we could potentially do array access of uniforms
3280 * that were loaded in GRF space as push constants. In real-world
3281 * usage we've seen, though, the arrays being used are always larger
3282 * than we could load as push constants, so just always move all
3283 * uniform array access out to a pull constant buffer.
3284 */
3285 void
3286 vec4_visitor::move_uniform_array_access_to_pull_constants()
3287 {
3288 int pull_constant_loc[this->uniforms];
3289
3290 for (int i = 0; i < this->uniforms; i++) {
3291 pull_constant_loc[i] = -1;
3292 }
3293
3294 /* Walk through and find array access of uniforms. Put a copy of that
3295 * uniform in the pull constant buffer.
3296 *
3297 * Note that we don't move constant-indexed accesses to arrays. No
3298 * testing has been done of the performance impact of this choice.
3299 */
3300 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3301 for (int i = 0 ; i < 3; i++) {
3302 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3303 continue;
3304
3305 int uniform = inst->src[i].reg;
3306
3307 /* If this array isn't already present in the pull constant buffer,
3308 * add it.
3309 */
3310 if (pull_constant_loc[uniform] == -1) {
3311 const float **values = &stage_prog_data->param[uniform * 4];
3312
3313 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3314
3315 assert(uniform < uniform_array_size);
3316 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3317 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3318 = values[j];
3319 }
3320 }
3321
3322 /* Set up the annotation tracking for new generated instructions. */
3323 base_ir = inst->ir;
3324 current_annotation = inst->annotation;
3325
3326 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3327
3328 emit_pull_constant_load(inst, temp, inst->src[i],
3329 pull_constant_loc[uniform]);
3330
3331 inst->src[i].file = temp.file;
3332 inst->src[i].reg = temp.reg;
3333 inst->src[i].reg_offset = temp.reg_offset;
3334 inst->src[i].reladdr = NULL;
3335 }
3336 }
3337
3338 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3339 * no need to track them as larger-than-vec4 objects. This will be
3340 * relied on in cutting out unused uniform vectors from push
3341 * constants.
3342 */
3343 split_uniform_registers();
3344 }
3345
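/* If a UD source has its negate flag set, materialize the negation with an
* explicit MOV into a temporary and use that instead, so the consuming
* instruction sees a plain unsigned source without the modifier.
*/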
3346 void
3347 vec4_visitor::resolve_ud_negate(src_reg *reg)
3348 {
3349 if (reg->type != BRW_REGISTER_TYPE_UD ||
3350 !reg->negate)
3351 return;
3352
3353 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3354 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3355 *reg = temp;
3356 }
3357
3358 vec4_visitor::vec4_visitor(struct brw_context *brw,
3359 struct brw_vec4_compile *c,
3360 struct gl_program *prog,
3361 const struct brw_vec4_prog_key *key,
3362 struct brw_vec4_prog_data *prog_data,
3363 struct gl_shader_program *shader_prog,
3364 gl_shader_stage stage,
3365 void *mem_ctx,
3366 bool debug_flag,
3367 bool no_spills,
3368 shader_time_shader_type st_base,
3369 shader_time_shader_type st_written,
3370 shader_time_shader_type st_reset)
3371 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3372 c(c),
3373 key(key),
3374 prog_data(prog_data),
3375 sanity_param_count(0),
3376 fail_msg(NULL),
3377 first_non_payload_grf(0),
3378 need_all_constants_in_pull_buffer(false),
3379 debug_flag(debug_flag),
3380 no_spills(no_spills),
3381 st_base(st_base),
3382 st_written(st_written),
3383 st_reset(st_reset)
3384 {
3385 this->mem_ctx = mem_ctx;
3386 this->failed = false;
3387
3388 this->base_ir = NULL;
3389 this->current_annotation = NULL;
3390 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3391
3392 this->variable_ht = hash_table_ctor(0,
3393 hash_table_pointer_hash,
3394 hash_table_pointer_compare);
3395
3396 this->virtual_grf_start = NULL;
3397 this->virtual_grf_end = NULL;
3398 this->virtual_grf_sizes = NULL;
3399 this->virtual_grf_count = 0;
3400 this->virtual_grf_reg_map = NULL;
3401 this->virtual_grf_reg_count = 0;
3402 this->virtual_grf_array_size = 0;
3403 this->live_intervals_valid = false;
3404
3405 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3406
3407 this->uniforms = 0;
3408
3409 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3410 * at least one. See setup_uniforms() in brw_vec4.cpp.
3411 */
3412 this->uniform_array_size = 1;
3413 if (prog_data) {
3414 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3415 }
3416
3417 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3418 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3419 }
3420
3421 vec4_visitor::~vec4_visitor()
3422 {
3423 hash_table_dtor(this->variable_ht);
3424 }
3425
3426
3427 void
3428 vec4_visitor::fail(const char *format, ...)
3429 {
3430 va_list va;
3431 char *msg;
3432
3433 if (failed)
3434 return;
3435
3436 failed = true;
3437
3438 va_start(va, format);
3439 msg = ralloc_vasprintf(mem_ctx, format, va);
3440 va_end(va);
3441 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3442
3443 this->fail_msg = msg;
3444
3445 if (debug_flag) {
3446 fprintf(stderr, "%s", msg);
3447 }
3448 }
3449
3450 } /* namespace brw */