src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "glsl/ir_uniform.h"
  26 extern "C" {
  27 #include "program/sampler.h"
  28 }
  29
  30 namespace brw {
  31
  32 vec4_instruction::vec4_instruction(vec4_visitor *v,
  33                                    enum opcode opcode, dst_reg dst,
  34                                    src_reg src0, src_reg src1, src_reg src2)
  35 {
  36    this->opcode = opcode;
  37    this->dst = dst;
  38    this->src[0] = src0;
  39    this->src[1] = src1;
  40    this->src[2] = src2;
  41    this->saturate = false;
  42    this->force_writemask_all = false;
  43    this->no_dd_clear = false;
  44    this->no_dd_check = false;
  45    this->conditional_mod = BRW_CONDITIONAL_NONE;
  46    this->sampler = 0;
  47    this->texture_offset = 0;
  48    this->target = 0;
  49    this->shadow_compare = false;
  50    this->ir = v->base_ir;
  51    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
  52    this->header_present = false;
  53    this->mlen = 0;
  54    this->base_mrf = 0;
  55    this->offset = 0;
  56    this->annotation = v->current_annotation;
  57 }
  58
  59 vec4_instruction *
  60 vec4_visitor::emit(vec4_instruction *inst)
  61 {
  62    this->instructions.push_tail(inst);
  63
  64    return inst;
  65 }
  66
  67 vec4_instruction *
  68 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
  69 {
  70    new_inst->ir = inst->ir;
  71    new_inst->annotation = inst->annotation;
  72
  73    inst->insert_before(new_inst);
  74
  75    return inst;
  76 }
  77
  78 vec4_instruction *
  79 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
  80                    src_reg src0, src_reg src1, src_reg src2)
  81 {
  82    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
  83                                              src0, src1, src2));
  84 }
  85
  86
  87 vec4_instruction *
  88 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
  89 {
  90    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
  91 }
  92
  93 vec4_instruction *
  94 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
  95 {
  96    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
  97 }
  98
  99 vec4_instruction *
 100 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
 101 {
 102    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
 103 }
 104
 105 vec4_instruction *
 106 vec4_visitor::emit(enum opcode opcode)
 107 {
 108    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
 109 }
 110
 111 #define ALU1(op)                                                        \
 112    vec4_instruction *                                                   \
 113    vec4_visitor::op(dst_reg dst, src_reg src0)                          \
 114    {                                                                    \
 115       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 116                                            src0);                       \
 117    }
 118
 119 #define ALU2(op)                                                        \
 120    vec4_instruction *                                                   \
 121    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)            \
 122    {                                                                    \
 123       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 124                                            src0, src1);                 \
 125    }
 126
 127 #define ALU3(op)                                                        \
 128    vec4_instruction *                                                   \
 129    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
 130    {                                                                    \
 131       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 132                                            src0, src1, src2);           \
 133    }
 134
 135 ALU1(NOT)
 136 ALU1(MOV)
 137 ALU1(FRC)
 138 ALU1(RNDD)
 139 ALU1(RNDE)
 140 ALU1(RNDZ)
 141 ALU1(F32TO16)
 142 ALU1(F16TO32)
 143 ALU2(ADD)
 144 ALU2(MUL)
 145 ALU2(MACH)
 146 ALU2(AND)
 147 ALU2(OR)
 148 ALU2(XOR)
 149 ALU2(DP3)
 150 ALU2(DP4)
 151 ALU2(DPH)
 152 ALU2(SHL)
 153 ALU2(SHR)
 154 ALU2(ASR)
 155 ALU3(LRP)
 156 ALU1(BFREV)
 157 ALU3(BFE)
 158 ALU2(BFI1)
 159 ALU3(BFI2)
 160 ALU1(FBH)
 161 ALU1(FBL)
 162 ALU1(CBIT)
 163 ALU3(MAD)
 164 ALU2(ADDC)
 165 ALU2(SUBB)
 166
 167 /** Gen4 predicated IF. */
 168 vec4_instruction *
 169 vec4_visitor::IF(uint32_t predicate)
 170 {
 171    vec4_instruction *inst;
 172
 173    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
 174    inst->predicate = predicate;
 175
 176    return inst;
 177 }
 178
 179 /** Gen6 IF with embedded comparison. */
 180 vec4_instruction *
 181 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
 182 {
 183    assert(brw->gen == 6);
 184
 185    vec4_instruction *inst;
 186
 187    resolve_ud_negate(&src0);
 188    resolve_ud_negate(&src1);
 189
 190    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
 191                                         src0, src1);
 192    inst->conditional_mod = condition;
 193
 194    return inst;
 195 }
 196
 197 /**
 198  * CMP: Sets the low bit of the destination channels with the result
 199  * of the comparison, while the upper bits are undefined, and updates
 200  * the flag register with the packed 16 bits of the result.
 201  */
 202 vec4_instruction *
 203 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
 204 {
 205    vec4_instruction *inst;
 206
 207    /* original gen4 does type conversion to the destination type
 208     * before before comparison, producing garbage results for floating
 209     * point comparisons.
 210     */
 211    if (brw->gen == 4) {
 212       dst.type = src0.type;
 213       if (dst.file == HW_REG)
 214          dst.fixed_hw_reg.type = dst.type;
 215    }
 216
 217    resolve_ud_negate(&src0);
 218    resolve_ud_negate(&src1);
 219
 220    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
 221    inst->conditional_mod = condition;
 222
 223    return inst;
 224 }
 225
 226 vec4_instruction *
 227 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
 228 {
 229    vec4_instruction *inst;
 230
 231    inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
 232                                         dst, index);
 233    inst->base_mrf = 14;
 234    inst->mlen = 2;
 235
 236    return inst;
 237 }
 238
 239 vec4_instruction *
 240 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
 241 {
 242    vec4_instruction *inst;
 243
 244    inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
 245                                         dst, src, index);
 246    inst->base_mrf = 13;
 247    inst->mlen = 3;
 248
 249    return inst;
 250 }
 251
 252 void
 253 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 254 {
 255    static enum opcode dot_opcodes[] = {
 256       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 257    };
 258
 259    emit(dot_opcodes[elements - 2], dst, src0, src1);
 260 }
 261
 262 src_reg
 263 vec4_visitor::fix_3src_operand(src_reg src)
 264 {
 265    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 266     * able to use vertical stride of zero to replicate the vec4 uniform, like
 267     *
 268     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 269     *
 270     * But you can't, since vertical stride is always four in three-source
 271     * instructions. Instead, insert a MOV instruction to do the replication so
 272     * that the three-source instruction can consume it.
 273     */
 274
 275    /* The MOV is only needed if the source is a uniform or immediate. */
 276    if (src.file != UNIFORM && src.file != IMM)
 277       return src;
 278
 279    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 280    expanded.type = src.type;
 281    emit(MOV(expanded, src));
 282    return src_reg(expanded);
 283 }
 284
 285 src_reg
 286 vec4_visitor::fix_math_operand(src_reg src)
 287 {
 288    /* The gen6 math instruction ignores the source modifiers --
 289     * swizzle, abs, negate, and at least some parts of the register
 290     * region description.
 291     *
 292     * Rather than trying to enumerate all these cases, *always* expand the
 293     * operand to a temp GRF for gen6.
 294     *
 295     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 296     * can't use.
 297     */
 298
 299    if (brw->gen == 7 && src.file != IMM)
 300       return src;
 301
 302    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 303    expanded.type = src.type;
 304    emit(MOV(expanded, src));
 305    return src_reg(expanded);
 306 }
 307
 308 void
 309 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
 310 {
 311    src = fix_math_operand(src);
 312
 313    if (dst.writemask != WRITEMASK_XYZW) {
 314       /* The gen6 math instruction must be align1, so we can't do
 315        * writemasks.
 316        */
 317       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 318
 319       emit(opcode, temp_dst, src);
 320
 321       emit(MOV(dst, src_reg(temp_dst)));
 322    } else {
 323       emit(opcode, dst, src);
 324    }
 325 }
 326
 327 void
 328 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
 329 {
 330    vec4_instruction *inst = emit(opcode, dst, src);
 331    inst->base_mrf = 1;
 332    inst->mlen = 1;
 333 }
 334
 335 void
 336 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
 337 {
 338    switch (opcode) {
 339    case SHADER_OPCODE_RCP:
 340    case SHADER_OPCODE_RSQ:
 341    case SHADER_OPCODE_SQRT:
 342    case SHADER_OPCODE_EXP2:
 343    case SHADER_OPCODE_LOG2:
 344    case SHADER_OPCODE_SIN:
 345    case SHADER_OPCODE_COS:
 346       break;
 347    default:
 348       assert(!"not reached: bad math opcode");
 349       return;
 350    }
 351
 352    if (brw->gen >= 6) {
 353       return emit_math1_gen6(opcode, dst, src);
 354    } else {
 355       return emit_math1_gen4(opcode, dst, src);
 356    }
 357 }
 358
 359 void
 360 vec4_visitor::emit_math2_gen6(enum opcode opcode,
 361                               dst_reg dst, src_reg src0, src_reg src1)
 362 {
 363    src0 = fix_math_operand(src0);
 364    src1 = fix_math_operand(src1);
 365
 366    if (dst.writemask != WRITEMASK_XYZW) {
 367       /* The gen6 math instruction must be align1, so we can't do
 368        * writemasks.
 369        */
 370       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 371       temp_dst.type = dst.type;
 372
 373       emit(opcode, temp_dst, src0, src1);
 374
 375       emit(MOV(dst, src_reg(temp_dst)));
 376    } else {
 377       emit(opcode, dst, src0, src1);
 378    }
 379 }
 380
 381 void
 382 vec4_visitor::emit_math2_gen4(enum opcode opcode,
 383                               dst_reg dst, src_reg src0, src_reg src1)
 384 {
 385    vec4_instruction *inst = emit(opcode, dst, src0, src1);
 386    inst->base_mrf = 1;
 387    inst->mlen = 2;
 388 }
 389
 390 void
 391 vec4_visitor::emit_math(enum opcode opcode,
 392                         dst_reg dst, src_reg src0, src_reg src1)
 393 {
 394    switch (opcode) {
 395    case SHADER_OPCODE_POW:
 396    case SHADER_OPCODE_INT_QUOTIENT:
 397    case SHADER_OPCODE_INT_REMAINDER:
 398       break;
 399    default:
 400       assert(!"not reached: unsupported binary math opcode");
 401       return;
 402    }
 403
 404    if (brw->gen >= 6) {
 405       return emit_math2_gen6(opcode, dst, src0, src1);
 406    } else {
 407       return emit_math2_gen4(opcode, dst, src0, src1);
 408    }
 409 }
 410
 411 void
 412 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 413 {
 414    if (brw->gen < 7)
 415       assert(!"ir_unop_pack_half_2x16 should be lowered");
 416
 417    assert(dst.type == BRW_REGISTER_TYPE_UD);
 418    assert(src0.type == BRW_REGISTER_TYPE_F);
 419
 420    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 421     *
 422     *   Because this instruction does not have a 16-bit floating-point type,
 423     *   the destination data type must be Word (W).
 424     *
 425     *   The destination must be DWord-aligned and specify a horizontal stride
 426     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 427     *   each destination channel and the upper word is not modified.
 428     *
 429     * The above restriction implies that the f32to16 instruction must use
 430     * align1 mode, because only in align1 mode is it possible to specify
 431     * horizontal stride.  We choose here to defy the hardware docs and emit
 432     * align16 instructions.
 433     *
 434     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 435     * instructions. I was partially successful in that the code passed all
 436     * tests.  However, the code was dubiously correct and fragile, and the
 437     * tests were not harsh enough to probe that frailty. Not trusting the
 438     * code, I chose instead to remain in align16 mode in defiance of the hw
 439     * docs).
 440     *
 441     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 442     * simulator, emitting a f32to16 in align16 mode with UD as destination
 443     * data type is safe. The behavior differs from that specified in the PRM
 444     * in that the upper word of each destination channel is cleared to 0.
 445     */
 446
 447    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 448    src_reg tmp_src(tmp_dst);
 449
 450 #if 0
 451    /* Verify the undocumented behavior on which the following instructions
 452     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 453     * then the result of the bit-or instruction below will be incorrect.
 454     *
 455     * You should inspect the disasm output in order to verify that the MOV is
 456     * not optimized away.
 457     */
 458    emit(MOV(tmp_dst, src_reg(0x12345678u)));
 459 #endif
 460
 461    /* Give tmp the form below, where "." means untouched.
 462     *
 463     *     w z          y          x w z          y          x
 464     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 465     *
 466     * That the upper word of each write-channel be 0 is required for the
 467     * following bit-shift and bit-or instructions to work. Note that this
 468     * relies on the undocumented hardware behavior mentioned above.
 469     */
 470    tmp_dst.writemask = WRITEMASK_XY;
 471    emit(F32TO16(tmp_dst, src0));
 472
 473    /* Give the write-channels of dst the form:
 474     *   0xhhhh0000
 475     */
 476    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
 477    emit(SHL(dst, tmp_src, src_reg(16u)));
 478
 479    /* Finally, give the write-channels of dst the form of packHalf2x16's
 480     * output:
 481     *   0xhhhhllll
 482     */
 483    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
 484    emit(OR(dst, src_reg(dst), tmp_src));
 485 }
 486
 487 void
 488 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 489 {
 490    if (brw->gen < 7)
 491       assert(!"ir_unop_unpack_half_2x16 should be lowered");
 492
 493    assert(dst.type == BRW_REGISTER_TYPE_F);
 494    assert(src0.type == BRW_REGISTER_TYPE_UD);
 495
 496    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 497     *
 498     *   Because this instruction does not have a 16-bit floating-point type,
 499     *   the source data type must be Word (W). The destination type must be
 500     *   F (Float).
 501     *
 502     * To use W as the source data type, we must adjust horizontal strides,
 503     * which is only possible in align1 mode. All my [chadv] attempts at
 504     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 505     * Piglit tests, so I gave up.
 506     *
 507     * I've verified that, on gen7 hardware and the simulator, it is safe to
 508     * emit f16to32 in align16 mode with UD as source data type.
 509     */
 510
 511    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 512    src_reg tmp_src(tmp_dst);
 513
 514    tmp_dst.writemask = WRITEMASK_X;
 515    emit(AND(tmp_dst, src0, src_reg(0xffffu)));
 516
 517    tmp_dst.writemask = WRITEMASK_Y;
 518    emit(SHR(tmp_dst, src0, src_reg(16u)));
 519
 520    dst.writemask = WRITEMASK_XY;
 521    emit(F16TO32(dst, tmp_src));
 522 }
 523
 524 void
 525 vec4_visitor::visit_instructions(const exec_list *list)
 526 {
 527    foreach_list(node, list) {
 528       ir_instruction *ir = (ir_instruction *)node;
 529
 530       base_ir = ir;
 531       ir->accept(this);
 532    }
 533 }
 534
 535
 536 static int
 537 type_size(const struct glsl_type *type)
 538 {
 539    unsigned int i;
 540    int size;
 541
 542    switch (type->base_type) {
 543    case GLSL_TYPE_UINT:
 544    case GLSL_TYPE_INT:
 545    case GLSL_TYPE_FLOAT:
 546    case GLSL_TYPE_BOOL:
 547       if (type->is_matrix()) {
 548          return type->matrix_columns;
 549       } else {
 550          /* Regardless of size of vector, it gets a vec4. This is bad
 551           * packing for things like floats, but otherwise arrays become a
 552           * mess.  Hopefully a later pass over the code can pack scalars
 553           * down if appropriate.
 554           */
 555          return 1;
 556       }
 557    case GLSL_TYPE_ARRAY:
 558       assert(type->length > 0);
 559       return type_size(type->fields.array) * type->length;
 560    case GLSL_TYPE_STRUCT:
 561       size = 0;
 562       for (i = 0; i < type->length; i++) {
 563          size += type_size(type->fields.structure[i].type);
 564       }
 565       return size;
 566    case GLSL_TYPE_SAMPLER:
 567       /* Samplers take up one slot in UNIFORMS[], but they're baked in
 568        * at link time.
 569        */
 570       return 1;
 571    case GLSL_TYPE_ATOMIC_UINT:
 572       return 0;
 573    case GLSL_TYPE_IMAGE:
 574    case GLSL_TYPE_VOID:
 575    case GLSL_TYPE_ERROR:
 576    case GLSL_TYPE_INTERFACE:
 577       assert(0);
 578       break;
 579    }
 580
 581    return 0;
 582 }
 583
 584 int
 585 vec4_visitor::virtual_grf_alloc(int size)
 586 {
 587    if (virtual_grf_array_size <= virtual_grf_count) {
 588       if (virtual_grf_array_size == 0)
 589          virtual_grf_array_size = 16;
 590       else
 591          virtual_grf_array_size *= 2;
 592       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 593                                    virtual_grf_array_size);
 594       virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
 595                                      virtual_grf_array_size);
 596    }
 597    virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
 598    virtual_grf_reg_count += size;
 599    virtual_grf_sizes[virtual_grf_count] = size;
 600    return virtual_grf_count++;
 601 }
 602
 603 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 604 {
 605    init();
 606
 607    this->file = GRF;
 608    this->reg = v->virtual_grf_alloc(type_size(type));
 609
 610    if (type->is_array() || type->is_record()) {
 611       this->swizzle = BRW_SWIZZLE_NOOP;
 612    } else {
 613       this->swizzle = swizzle_for_size(type->vector_elements);
 614    }
 615
 616    this->type = brw_type_for_base_type(type);
 617 }
 618
 619 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 620 {
 621    init();
 622
 623    this->file = GRF;
 624    this->reg = v->virtual_grf_alloc(type_size(type));
 625
 626    if (type->is_array() || type->is_record()) {
 627       this->writemask = WRITEMASK_XYZW;
 628    } else {
 629       this->writemask = (1 << type->vector_elements) - 1;
 630    }
 631
 632    this->type = brw_type_for_base_type(type);
 633 }
 634
 635 /* Our support for uniforms is piggy-backed on the struct
 636  * gl_fragment_program, because that's where the values actually
 637  * get stored, rather than in some global gl_shader_program uniform
 638  * store.
 639  */
 640 void
 641 vec4_visitor::setup_uniform_values(ir_variable *ir)
 642 {
 643    int namelen = strlen(ir->name);
 644
 645    /* The data for our (non-builtin) uniforms is stored in a series of
 646     * gl_uniform_driver_storage structs for each subcomponent that
 647     * glGetUniformLocation() could name.  We know it's been set up in the same
 648     * order we'd walk the type, so walk the list of storage and find anything
 649     * with our name, or the prefix of a component that starts with our name.
 650     */
 651    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
 652       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 653
 654       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 655           (storage->name[namelen] != 0 &&
 656            storage->name[namelen] != '.' &&
 657            storage->name[namelen] != '[')) {
 658          continue;
 659       }
 660
 661       gl_constant_value *components = storage->storage;
 662       unsigned vector_count = (MAX2(storage->array_elements, 1) *
 663                                storage->type->matrix_columns);
 664
 665       for (unsigned s = 0; s < vector_count; s++) {
 666          uniform_vector_size[uniforms] = storage->type->vector_elements;
 667
 668          int i;
 669          for (i = 0; i < uniform_vector_size[uniforms]; i++) {
 670             stage_prog_data->param[uniforms * 4 + i] = &components->f;
 671             components++;
 672          }
 673          for (; i < 4; i++) {
 674             static float zero = 0;
 675             stage_prog_data->param[uniforms * 4 + i] = &zero;
 676          }
 677
 678          uniforms++;
 679       }
 680    }
 681 }
 682
 683 void
 684 vec4_visitor::setup_uniform_clipplane_values()
 685 {
 686    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
 687
 688    for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
 689       this->uniform_vector_size[this->uniforms] = 4;
 690       this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
 691       this->userplane[i].type = BRW_REGISTER_TYPE_F;
 692       for (int j = 0; j < 4; ++j) {
 693          stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
 694       }
 695       ++this->uniforms;
 696    }
 697 }
 698
 699 /* Our support for builtin uniforms is even scarier than non-builtin.
 700  * It sits on top of the PROG_STATE_VAR parameters that are
 701  * automatically updated from GL context state.
 702  */
 703 void
 704 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
 705 {
 706    const ir_state_slot *const slots = ir->state_slots;
 707    assert(ir->state_slots != NULL);
 708
 709    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 710       /* This state reference has already been setup by ir_to_mesa,
 711        * but we'll get the same index back here.  We can reference
 712        * ParameterValues directly, since unlike brw_fs.cpp, we never
 713        * add new state references during compile.
 714        */
 715       int index = _mesa_add_state_reference(this->prog->Parameters,
 716                                             (gl_state_index *)slots[i].tokens);
 717       float *values = &this->prog->Parameters->ParameterValues[index][0].f;
 718
 719       this->uniform_vector_size[this->uniforms] = 0;
 720       /* Add each of the unique swizzled channels of the element.
 721        * This will end up matching the size of the glsl_type of this field.
 722        */
 723       int last_swiz = -1;
 724       for (unsigned int j = 0; j < 4; j++) {
 725          int swiz = GET_SWZ(slots[i].swizzle, j);
 726          last_swiz = swiz;
 727
 728          stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
 729          if (swiz <= last_swiz)
 730             this->uniform_vector_size[this->uniforms]++;
 731       }
 732       this->uniforms++;
 733    }
 734 }
 735
 736 dst_reg *
 737 vec4_visitor::variable_storage(ir_variable *var)
 738 {
 739    return (dst_reg *)hash_table_find(this->variable_ht, var);
 740 }
 741
 742 void
 743 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
 744 {
 745    ir_expression *expr = ir->as_expression();
 746
 747    *predicate = BRW_PREDICATE_NORMAL;
 748
 749    if (expr) {
 750       src_reg op[2];
 751       vec4_instruction *inst;
 752
 753       assert(expr->get_num_operands() <= 2);
 754       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 755          expr->operands[i]->accept(this);
 756          op[i] = this->result;
 757
 758          resolve_ud_negate(&op[i]);
 759       }
 760
 761       switch (expr->operation) {
 762       case ir_unop_logic_not:
 763          inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
 764          inst->conditional_mod = BRW_CONDITIONAL_Z;
 765          break;
 766
 767       case ir_binop_logic_xor:
 768          inst = emit(XOR(dst_null_d(), op[0], op[1]));
 769          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 770          break;
 771
 772       case ir_binop_logic_or:
 773          inst = emit(OR(dst_null_d(), op[0], op[1]));
 774          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 775          break;
 776
 777       case ir_binop_logic_and:
 778          inst = emit(AND(dst_null_d(), op[0], op[1]));
 779          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 780          break;
 781
 782       case ir_unop_f2b:
 783          if (brw->gen >= 6) {
 784             emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
 785          } else {
 786             inst = emit(MOV(dst_null_f(), op[0]));
 787             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 788          }
 789          break;
 790
 791       case ir_unop_i2b:
 792          if (brw->gen >= 6) {
 793             emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 794          } else {
 795             inst = emit(MOV(dst_null_d(), op[0]));
 796             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 797          }
 798          break;
 799
 800       case ir_binop_all_equal:
 801          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 802          *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
 803          break;
 804
 805       case ir_binop_any_nequal:
 806          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 807          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 808          break;
 809
 810       case ir_unop_any:
 811          inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 812          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 813          break;
 814
 815       case ir_binop_greater:
 816       case ir_binop_gequal:
 817       case ir_binop_less:
 818       case ir_binop_lequal:
 819       case ir_binop_equal:
 820       case ir_binop_nequal:
 821          emit(CMP(dst_null_d(), op[0], op[1],
 822                   brw_conditional_for_comparison(expr->operation)));
 823          break;
 824
 825       default:
 826          assert(!"not reached");
 827          break;
 828       }
 829       return;
 830    }
 831
 832    ir->accept(this);
 833
 834    resolve_ud_negate(&this->result);
 835
 836    if (brw->gen >= 6) {
 837       vec4_instruction *inst = emit(AND(dst_null_d(),
 838                                         this->result, src_reg(1)));
 839       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 840    } else {
 841       vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
 842       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 843    }
 844 }
 845
 846 /**
 847  * Emit a gen6 IF statement with the comparison folded into the IF
 848  * instruction.
 849  */
 850 void
 851 vec4_visitor::emit_if_gen6(ir_if *ir)
 852 {
 853    ir_expression *expr = ir->condition->as_expression();
 854
 855    if (expr) {
 856       src_reg op[2];
 857       dst_reg temp;
 858
 859       assert(expr->get_num_operands() <= 2);
 860       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 861          expr->operands[i]->accept(this);
 862          op[i] = this->result;
 863       }
 864
 865       switch (expr->operation) {
 866       case ir_unop_logic_not:
 867          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
 868          return;
 869
 870       case ir_binop_logic_xor:
 871          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
 872          return;
 873
 874       case ir_binop_logic_or:
 875          temp = dst_reg(this, glsl_type::bool_type);
 876          emit(OR(temp, op[0], op[1]));
 877          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 878          return;
 879
 880       case ir_binop_logic_and:
 881          temp = dst_reg(this, glsl_type::bool_type);
 882          emit(AND(temp, op[0], op[1]));
 883          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 884          return;
 885
 886       case ir_unop_f2b:
 887          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 888          return;
 889
 890       case ir_unop_i2b:
 891          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 892          return;
 893
 894       case ir_binop_greater:
 895       case ir_binop_gequal:
 896       case ir_binop_less:
 897       case ir_binop_lequal:
 898       case ir_binop_equal:
 899       case ir_binop_nequal:
 900          emit(IF(op[0], op[1],
 901                  brw_conditional_for_comparison(expr->operation)));
 902          return;
 903
 904       case ir_binop_all_equal:
 905          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 906          emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
 907          return;
 908
 909       case ir_binop_any_nequal:
 910          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 911          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 912          return;
 913
 914       case ir_unop_any:
 915          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 916          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 917          return;
 918
 919       default:
 920          assert(!"not reached");
 921          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 922          return;
 923       }
 924       return;
 925    }
 926
 927    ir->condition->accept(this);
 928
 929    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
 930 }
 931
 932 void
 933 vec4_visitor::visit(ir_variable *ir)
 934 {
 935    dst_reg *reg = NULL;
 936
 937    if (variable_storage(ir))
 938       return;
 939
 940    switch (ir->data.mode) {
 941    case ir_var_shader_in:
 942       reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
 943       break;
 944
 945    case ir_var_shader_out:
 946       reg = new(mem_ctx) dst_reg(this, ir->type);
 947
 948       for (int i = 0; i < type_size(ir->type); i++) {
 949          output_reg[ir->data.location + i] = *reg;
 950          output_reg[ir->data.location + i].reg_offset = i;
 951          output_reg[ir->data.location + i].type =
 952             brw_type_for_base_type(ir->type->get_scalar_type());
 953          output_reg_annotation[ir->data.location + i] = ir->name;
 954       }
 955       break;
 956
 957    case ir_var_auto:
 958    case ir_var_temporary:
 959       reg = new(mem_ctx) dst_reg(this, ir->type);
 960       break;
 961
 962    case ir_var_uniform:
 963       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
 964
 965       /* Thanks to the lower_ubo_reference pass, we will see only
 966        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
 967        * variables, so no need for them to be in variable_ht.
 968        *
 969        * Atomic counters take no uniform storage, no need to do
 970        * anything here.
 971        */
 972       if (ir->is_in_uniform_block() || ir->type->contains_atomic())
 973          return;
 974
 975       /* Track how big the whole uniform variable is, in case we need to put a
 976        * copy of its data into pull constants for array access.
 977        */
 978       this->uniform_size[this->uniforms] = type_size(ir->type);
 979
 980       if (!strncmp(ir->name, "gl_", 3)) {
 981          setup_builtin_uniform_values(ir);
 982       } else {
 983          setup_uniform_values(ir);
 984       }
 985       break;
 986
 987    case ir_var_system_value:
 988       reg = make_reg_for_system_value(ir);
 989       break;
 990
 991    default:
 992       assert(!"not reached");
 993    }
 994
 995    reg->type = brw_type_for_base_type(ir->type);
 996    hash_table_insert(this->variable_ht, reg, ir);
 997 }
 998
 999 void
1000 vec4_visitor::visit(ir_loop *ir)
1001 {
1002    /* We don't want debugging output to print the whole body of the
1003     * loop as the annotation.
1004     */
1005    this->base_ir = NULL;
1006
1007    emit(BRW_OPCODE_DO);
1008
1009    visit_instructions(&ir->body_instructions);
1010
1011    emit(BRW_OPCODE_WHILE);
1012 }
1013
1014 void
1015 vec4_visitor::visit(ir_loop_jump *ir)
1016 {
1017    switch (ir->mode) {
1018    case ir_loop_jump::jump_break:
1019       emit(BRW_OPCODE_BREAK);
1020       break;
1021    case ir_loop_jump::jump_continue:
1022       emit(BRW_OPCODE_CONTINUE);
1023       break;
1024    }
1025 }
1026
1027
1028 void
1029 vec4_visitor::visit(ir_function_signature *ir)
1030 {
1031    assert(0);
1032    (void)ir;
1033 }
1034
1035 void
1036 vec4_visitor::visit(ir_function *ir)
1037 {
1038    /* Ignore function bodies other than main() -- we shouldn't see calls to
1039     * them since they should all be inlined.
1040     */
1041    if (strcmp(ir->name, "main") == 0) {
1042       const ir_function_signature *sig;
1043       exec_list empty;
1044
1045       sig = ir->matching_signature(NULL, &empty);
1046
1047       assert(sig);
1048
1049       visit_instructions(&sig->body);
1050    }
1051 }
1052
1053 bool
1054 vec4_visitor::try_emit_sat(ir_expression *ir)
1055 {
1056    ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1057    if (!sat_src)
1058       return false;
1059
1060    sat_src->accept(this);
1061    src_reg src = this->result;
1062
1063    this->result = src_reg(this, ir->type);
1064    vec4_instruction *inst;
1065    inst = emit(MOV(dst_reg(this->result), src));
1066    inst->saturate = true;
1067
1068    return true;
1069 }
1070
1071 bool
1072 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1073 {
1074    /* 3-src instructions were introduced in gen6. */
1075    if (brw->gen < 6)
1076       return false;
1077
1078    /* MAD can only handle floating-point data. */
1079    if (ir->type->base_type != GLSL_TYPE_FLOAT)
1080       return false;
1081
1082    ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1083    ir_expression *mul = ir->operands[mul_arg]->as_expression();
1084
1085    if (!mul || mul->operation != ir_binop_mul)
1086       return false;
1087
1088    nonmul->accept(this);
1089    src_reg src0 = fix_3src_operand(this->result);
1090
1091    mul->operands[0]->accept(this);
1092    src_reg src1 = fix_3src_operand(this->result);
1093
1094    mul->operands[1]->accept(this);
1095    src_reg src2 = fix_3src_operand(this->result);
1096
1097    this->result = src_reg(this, ir->type);
1098    emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1099
1100    return true;
1101 }
1102
1103 void
1104 vec4_visitor::emit_bool_comparison(unsigned int op,
1105                                  dst_reg dst, src_reg src0, src_reg src1)
1106 {
1107    /* original gen4 does destination conversion before comparison. */
1108    if (brw->gen < 5)
1109       dst.type = src0.type;
1110
1111    emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1112
1113    dst.type = BRW_REGISTER_TYPE_D;
1114    emit(AND(dst, src_reg(dst), src_reg(0x1)));
1115 }
1116
1117 void
1118 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1119                           src_reg src0, src_reg src1)
1120 {
1121    vec4_instruction *inst;
1122
1123    if (brw->gen >= 6) {
1124       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1125       inst->conditional_mod = conditionalmod;
1126    } else {
1127       emit(CMP(dst, src0, src1, conditionalmod));
1128
1129       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1130       inst->predicate = BRW_PREDICATE_NORMAL;
1131    }
1132 }
1133
1134 static bool
1135 is_16bit_constant(ir_rvalue *rvalue)
1136 {
1137    ir_constant *constant = rvalue->as_constant();
1138    if (!constant)
1139       return false;
1140
1141    if (constant->type != glsl_type::int_type &&
1142        constant->type != glsl_type::uint_type)
1143       return false;
1144
1145    return constant->value.u[0] < (1 << 16);
1146 }
1147
1148 void
1149 vec4_visitor::visit(ir_expression *ir)
1150 {
1151    unsigned int operand;
1152    src_reg op[Elements(ir->operands)];
1153    src_reg result_src;
1154    dst_reg result_dst;
1155    vec4_instruction *inst;
1156
1157    if (try_emit_sat(ir))
1158       return;
1159
1160    if (ir->operation == ir_binop_add) {
1161       if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1162          return;
1163    }
1164
1165    for (operand = 0; operand < ir->get_num_operands(); operand++) {
1166       this->result.file = BAD_FILE;
1167       ir->operands[operand]->accept(this);
1168       if (this->result.file == BAD_FILE) {
1169          printf("Failed to get tree for expression operand:\n");
1170          ir->operands[operand]->print();
1171          exit(1);
1172       }
1173       op[operand] = this->result;
1174
1175       /* Matrix expression operands should have been broken down to vector
1176        * operations already.
1177        */
1178       assert(!ir->operands[operand]->type->is_matrix());
1179    }
1180
1181    int vector_elements = ir->operands[0]->type->vector_elements;
1182    if (ir->operands[1]) {
1183       vector_elements = MAX2(vector_elements,
1184                              ir->operands[1]->type->vector_elements);
1185    }
1186
1187    this->result.file = BAD_FILE;
1188
1189    /* Storage for our result.  Ideally for an assignment we'd be using
1190     * the actual storage for the result here, instead.
1191     */
1192    result_src = src_reg(this, ir->type);
1193    /* convenience for the emit functions below. */
1194    result_dst = dst_reg(result_src);
1195    /* If nothing special happens, this is the result. */
1196    this->result = result_src;
1197    /* Limit writes to the channels that will be used by result_src later.
1198     * This does limit this temp's use as a temporary for multi-instruction
1199     * sequences.
1200     */
1201    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1202
1203    switch (ir->operation) {
1204    case ir_unop_logic_not:
1205       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1206        * ones complement of the whole register, not just bit 0.
1207        */
1208       emit(XOR(result_dst, op[0], src_reg(1)));
1209       break;
1210    case ir_unop_neg:
1211       op[0].negate = !op[0].negate;
1212       emit(MOV(result_dst, op[0]));
1213       break;
1214    case ir_unop_abs:
1215       op[0].abs = true;
1216       op[0].negate = false;
1217       emit(MOV(result_dst, op[0]));
1218       break;
1219
1220    case ir_unop_sign:
1221       if (ir->type->is_float()) {
1222          /* AND(val, 0x80000000) gives the sign bit.
1223           *
1224           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1225           * zero.
1226           */
1227          emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1228
1229          op[0].type = BRW_REGISTER_TYPE_UD;
1230          result_dst.type = BRW_REGISTER_TYPE_UD;
1231          emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1232
1233          inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1234          inst->predicate = BRW_PREDICATE_NORMAL;
1235
1236          this->result.type = BRW_REGISTER_TYPE_F;
1237       } else {
1238          /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1239           *               -> non-negative val generates 0x00000000.
1240           *  Predicated OR sets 1 if val is positive.
1241           */
1242          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1243
1244          emit(ASR(result_dst, op[0], src_reg(31)));
1245
1246          inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1247          inst->predicate = BRW_PREDICATE_NORMAL;
1248       }
1249       break;
1250
1251    case ir_unop_rcp:
1252       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1253       break;
1254
1255    case ir_unop_exp2:
1256       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1257       break;
1258    case ir_unop_log2:
1259       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1260       break;
1261    case ir_unop_exp:
1262    case ir_unop_log:
1263       assert(!"not reached: should be handled by ir_explog_to_explog2");
1264       break;
1265    case ir_unop_sin:
1266    case ir_unop_sin_reduced:
1267       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1268       break;
1269    case ir_unop_cos:
1270    case ir_unop_cos_reduced:
1271       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1272       break;
1273
1274    case ir_unop_dFdx:
1275    case ir_unop_dFdy:
1276       assert(!"derivatives not valid in vertex shader");
1277       break;
1278
1279    case ir_unop_bitfield_reverse:
1280       emit(BFREV(result_dst, op[0]));
1281       break;
1282    case ir_unop_bit_count:
1283       emit(CBIT(result_dst, op[0]));
1284       break;
1285    case ir_unop_find_msb: {
1286       src_reg temp = src_reg(this, glsl_type::uint_type);
1287
1288       inst = emit(FBH(dst_reg(temp), op[0]));
1289       inst->dst.writemask = WRITEMASK_XYZW;
1290
1291       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1292        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1293        * subtract the result from 31 to convert the MSB count into an LSB count.
1294        */
1295
1296       /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1297       temp.swizzle = BRW_SWIZZLE_NOOP;
1298       emit(MOV(result_dst, temp));
1299
1300       src_reg src_tmp = src_reg(result_dst);
1301       emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1302
1303       src_tmp.negate = true;
1304       inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1305       inst->predicate = BRW_PREDICATE_NORMAL;
1306       break;
1307    }
1308    case ir_unop_find_lsb:
1309       emit(FBL(result_dst, op[0]));
1310       break;
1311
1312    case ir_unop_noise:
1313       assert(!"not reached: should be handled by lower_noise");
1314       break;
1315
1316    case ir_binop_add:
1317       emit(ADD(result_dst, op[0], op[1]));
1318       break;
1319    case ir_binop_sub:
1320       assert(!"not reached: should be handled by ir_sub_to_add_neg");
1321       break;
1322
1323    case ir_binop_mul:
1324       if (brw->gen < 8 && ir->type->is_integer()) {
1325          /* For integer multiplication, the MUL uses the low 16 bits of one of
1326           * the operands (src0 through SNB, src1 on IVB and later).  The MACH
1327           * accumulates in the contribution of the upper 16 bits of that
1328           * operand.  If we can determine that one of the args is in the low
1329           * 16 bits, though, we can just emit a single MUL.
1330           */
1331          if (is_16bit_constant(ir->operands[0])) {
1332             if (brw->gen < 7)
1333                emit(MUL(result_dst, op[0], op[1]));
1334             else
1335                emit(MUL(result_dst, op[1], op[0]));
1336          } else if (is_16bit_constant(ir->operands[1])) {
1337             if (brw->gen < 7)
1338                emit(MUL(result_dst, op[1], op[0]));
1339             else
1340                emit(MUL(result_dst, op[0], op[1]));
1341          } else {
1342             struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1343
1344             emit(MUL(acc, op[0], op[1]));
1345             emit(MACH(dst_null_d(), op[0], op[1]));
1346             emit(MOV(result_dst, src_reg(acc)));
1347          }
1348       } else {
1349          emit(MUL(result_dst, op[0], op[1]));
1350       }
1351       break;
1352    case ir_binop_imul_high: {
1353       struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1354
1355       emit(MUL(acc, op[0], op[1]));
1356       emit(MACH(result_dst, op[0], op[1]));
1357       break;
1358    }
1359    case ir_binop_div:
1360       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1361       assert(ir->type->is_integer());
1362       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1363       break;
1364    case ir_binop_carry: {
1365       struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1366
1367       emit(ADDC(dst_null_ud(), op[0], op[1]));
1368       emit(MOV(result_dst, src_reg(acc)));
1369       break;
1370    }
1371    case ir_binop_borrow: {
1372       struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1373
1374       emit(SUBB(dst_null_ud(), op[0], op[1]));
1375       emit(MOV(result_dst, src_reg(acc)));
1376       break;
1377    }
1378    case ir_binop_mod:
1379       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1380       assert(ir->type->is_integer());
1381       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1382       break;
1383
1384    case ir_binop_less:
1385    case ir_binop_greater:
1386    case ir_binop_lequal:
1387    case ir_binop_gequal:
1388    case ir_binop_equal:
1389    case ir_binop_nequal: {
1390       emit(CMP(result_dst, op[0], op[1],
1391                brw_conditional_for_comparison(ir->operation)));
1392       emit(AND(result_dst, result_src, src_reg(0x1)));
1393       break;
1394    }
1395
1396    case ir_binop_all_equal:
1397       /* "==" operator producing a scalar boolean. */
1398       if (ir->operands[0]->type->is_vector() ||
1399           ir->operands[1]->type->is_vector()) {
1400          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1401          emit(MOV(result_dst, src_reg(0)));
1402          inst = emit(MOV(result_dst, src_reg(1)));
1403          inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1404       } else {
1405          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1406          emit(AND(result_dst, result_src, src_reg(0x1)));
1407       }
1408       break;
1409    case ir_binop_any_nequal:
1410       /* "!=" operator producing a scalar boolean. */
1411       if (ir->operands[0]->type->is_vector() ||
1412           ir->operands[1]->type->is_vector()) {
1413          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1414
1415          emit(MOV(result_dst, src_reg(0)));
1416          inst = emit(MOV(result_dst, src_reg(1)));
1417          inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1418       } else {
1419          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1420          emit(AND(result_dst, result_src, src_reg(0x1)));
1421       }
1422       break;
1423
1424    case ir_unop_any:
1425       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1426       emit(MOV(result_dst, src_reg(0)));
1427
1428       inst = emit(MOV(result_dst, src_reg(1)));
1429       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1430       break;
1431
1432    case ir_binop_logic_xor:
1433       emit(XOR(result_dst, op[0], op[1]));
1434       break;
1435
1436    case ir_binop_logic_or:
1437       emit(OR(result_dst, op[0], op[1]));
1438       break;
1439
1440    case ir_binop_logic_and:
1441       emit(AND(result_dst, op[0], op[1]));
1442       break;
1443
1444    case ir_binop_dot:
1445       assert(ir->operands[0]->type->is_vector());
1446       assert(ir->operands[0]->type == ir->operands[1]->type);
1447       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1448       break;
1449
1450    case ir_unop_sqrt:
1451       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1452       break;
1453    case ir_unop_rsq:
1454       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1455       break;
1456
1457    case ir_unop_bitcast_i2f:
1458    case ir_unop_bitcast_u2f:
1459       this->result = op[0];
1460       this->result.type = BRW_REGISTER_TYPE_F;
1461       break;
1462
1463    case ir_unop_bitcast_f2i:
1464       this->result = op[0];
1465       this->result.type = BRW_REGISTER_TYPE_D;
1466       break;
1467
1468    case ir_unop_bitcast_f2u:
1469       this->result = op[0];
1470       this->result.type = BRW_REGISTER_TYPE_UD;
1471       break;
1472
1473    case ir_unop_i2f:
1474    case ir_unop_i2u:
1475    case ir_unop_u2i:
1476    case ir_unop_u2f:
1477    case ir_unop_b2f:
1478    case ir_unop_b2i:
1479    case ir_unop_f2i:
1480    case ir_unop_f2u:
1481       emit(MOV(result_dst, op[0]));
1482       break;
1483    case ir_unop_f2b:
1484    case ir_unop_i2b: {
1485       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1486       emit(AND(result_dst, result_src, src_reg(1)));
1487       break;
1488    }
1489
1490    case ir_unop_trunc:
1491       emit(RNDZ(result_dst, op[0]));
1492       break;
1493    case ir_unop_ceil:
1494       op[0].negate = !op[0].negate;
1495       inst = emit(RNDD(result_dst, op[0]));
1496       this->result.negate = true;
1497       break;
1498    case ir_unop_floor:
1499       inst = emit(RNDD(result_dst, op[0]));
1500       break;
1501    case ir_unop_fract:
1502       inst = emit(FRC(result_dst, op[0]));
1503       break;
1504    case ir_unop_round_even:
1505       emit(RNDE(result_dst, op[0]));
1506       break;
1507
1508    case ir_binop_min:
1509       emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1510       break;
1511    case ir_binop_max:
1512       emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1513       break;
1514
1515    case ir_binop_pow:
1516       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1517       break;
1518
1519    case ir_unop_bit_not:
1520       inst = emit(NOT(result_dst, op[0]));
1521       break;
1522    case ir_binop_bit_and:
1523       inst = emit(AND(result_dst, op[0], op[1]));
1524       break;
1525    case ir_binop_bit_xor:
1526       inst = emit(XOR(result_dst, op[0], op[1]));
1527       break;
1528    case ir_binop_bit_or:
1529       inst = emit(OR(result_dst, op[0], op[1]));
1530       break;
1531
1532    case ir_binop_lshift:
1533       inst = emit(SHL(result_dst, op[0], op[1]));
1534       break;
1535
1536    case ir_binop_rshift:
1537       if (ir->type->base_type == GLSL_TYPE_INT)
1538          inst = emit(ASR(result_dst, op[0], op[1]));
1539       else
1540          inst = emit(SHR(result_dst, op[0], op[1]));
1541       break;
1542
1543    case ir_binop_bfm:
1544       emit(BFI1(result_dst, op[0], op[1]));
1545       break;
1546
1547    case ir_binop_ubo_load: {
1548       ir_constant *uniform_block = ir->operands[0]->as_constant();
1549       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1550       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1551       src_reg offset;
1552
1553       /* Now, load the vector from that offset. */
1554       assert(ir->type->is_vector() || ir->type->is_scalar());
1555
1556       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1557       packed_consts.type = result.type;
1558       src_reg surf_index =
1559          src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1560       if (const_offset_ir) {
1561          if (brw->gen >= 8) {
1562             /* Store the offset in a GRF so we can send-from-GRF. */
1563             offset = src_reg(this, glsl_type::int_type);
1564             emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1565          } else {
1566             /* Immediates are fine on older generations since they'll be moved
1567              * to a (potentially fake) MRF at the generator level.
1568              */
1569             offset = src_reg(const_offset / 16);
1570          }
1571       } else {
1572          offset = src_reg(this, glsl_type::uint_type);
1573          emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1574       }
1575
1576       if (brw->gen >= 7) {
1577          dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1578          grf_offset.type = offset.type;
1579
1580          emit(MOV(grf_offset, offset));
1581
1582          emit(new(mem_ctx) vec4_instruction(this,
1583                                             VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1584                                             dst_reg(packed_consts),
1585                                             surf_index,
1586                                             src_reg(grf_offset)));
1587       } else {
1588          vec4_instruction *pull =
1589             emit(new(mem_ctx) vec4_instruction(this,
1590                                                VS_OPCODE_PULL_CONSTANT_LOAD,
1591                                                dst_reg(packed_consts),
1592                                                surf_index,
1593                                                offset));
1594          pull->base_mrf = 14;
1595          pull->mlen = 1;
1596       }
1597
1598       packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1599       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1600                                             const_offset % 16 / 4,
1601                                             const_offset % 16 / 4,
1602                                             const_offset % 16 / 4);
1603
1604       /* UBO bools are any nonzero int.  We store bools as either 0 or 1. */
1605       if (ir->type->base_type == GLSL_TYPE_BOOL) {
1606          emit(CMP(result_dst, packed_consts, src_reg(0u),
1607                   BRW_CONDITIONAL_NZ));
1608          emit(AND(result_dst, result, src_reg(0x1)));
1609       } else {
1610          emit(MOV(result_dst, packed_consts));
1611       }
1612       break;
1613    }
1614
1615    case ir_binop_vector_extract:
1616       assert(!"should have been lowered by vec_index_to_cond_assign");
1617       break;
1618
1619    case ir_triop_fma:
1620       op[0] = fix_3src_operand(op[0]);
1621       op[1] = fix_3src_operand(op[1]);
1622       op[2] = fix_3src_operand(op[2]);
1623       /* Note that the instruction's argument order is reversed from GLSL
1624        * and the IR.
1625        */
1626       emit(MAD(result_dst, op[2], op[1], op[0]));
1627       break;
1628
1629    case ir_triop_lrp:
1630       op[0] = fix_3src_operand(op[0]);
1631       op[1] = fix_3src_operand(op[1]);
1632       op[2] = fix_3src_operand(op[2]);
1633       /* Note that the instruction's argument order is reversed from GLSL
1634        * and the IR.
1635        */
1636       emit(LRP(result_dst, op[2], op[1], op[0]));
1637       break;
1638
1639    case ir_triop_csel:
1640       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1641       inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1642       inst->predicate = BRW_PREDICATE_NORMAL;
1643       break;
1644
1645    case ir_triop_bfi:
1646       op[0] = fix_3src_operand(op[0]);
1647       op[1] = fix_3src_operand(op[1]);
1648       op[2] = fix_3src_operand(op[2]);
1649       emit(BFI2(result_dst, op[0], op[1], op[2]));
1650       break;
1651
1652    case ir_triop_bitfield_extract:
1653       op[0] = fix_3src_operand(op[0]);
1654       op[1] = fix_3src_operand(op[1]);
1655       op[2] = fix_3src_operand(op[2]);
1656       /* Note that the instruction's argument order is reversed from GLSL
1657        * and the IR.
1658        */
1659       emit(BFE(result_dst, op[2], op[1], op[0]));
1660       break;
1661
1662    case ir_triop_vector_insert:
1663       assert(!"should have been lowered by lower_vector_insert");
1664       break;
1665
1666    case ir_quadop_bitfield_insert:
1667       assert(!"not reached: should be handled by "
1668               "bitfield_insert_to_bfm_bfi\n");
1669       break;
1670
1671    case ir_quadop_vector:
1672       assert(!"not reached: should be handled by lower_quadop_vector");
1673       break;
1674
1675    case ir_unop_pack_half_2x16:
1676       emit_pack_half_2x16(result_dst, op[0]);
1677       break;
1678    case ir_unop_unpack_half_2x16:
1679       emit_unpack_half_2x16(result_dst, op[0]);
1680       break;
1681    case ir_unop_pack_snorm_2x16:
1682    case ir_unop_pack_snorm_4x8:
1683    case ir_unop_pack_unorm_2x16:
1684    case ir_unop_pack_unorm_4x8:
1685    case ir_unop_unpack_snorm_2x16:
1686    case ir_unop_unpack_snorm_4x8:
1687    case ir_unop_unpack_unorm_2x16:
1688    case ir_unop_unpack_unorm_4x8:
1689       assert(!"not reached: should be handled by lower_packing_builtins");
1690       break;
1691    case ir_unop_unpack_half_2x16_split_x:
1692    case ir_unop_unpack_half_2x16_split_y:
1693    case ir_binop_pack_half_2x16_split:
1694       assert(!"not reached: should not occur in vertex shader");
1695       break;
1696    case ir_binop_ldexp:
1697       assert(!"not reached: should be handled by ldexp_to_arith()");
1698       break;
1699    }
1700 }
1701
1702
1703 void
1704 vec4_visitor::visit(ir_swizzle *ir)
1705 {
1706    src_reg src;
1707    int i = 0;
1708    int swizzle[4];
1709
1710    /* Note that this is only swizzles in expressions, not those on the left
1711     * hand side of an assignment, which do write masking.  See ir_assignment
1712     * for that.
1713     */
1714
1715    ir->val->accept(this);
1716    src = this->result;
1717    assert(src.file != BAD_FILE);
1718
1719    for (i = 0; i < ir->type->vector_elements; i++) {
1720       switch (i) {
1721       case 0:
1722          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1723          break;
1724       case 1:
1725          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1726          break;
1727       case 2:
1728          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1729          break;
1730       case 3:
1731          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1732             break;
1733       }
1734    }
1735    for (; i < 4; i++) {
1736       /* Replicate the last channel out. */
1737       swizzle[i] = swizzle[ir->type->vector_elements - 1];
1738    }
1739
1740    src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1741
1742    this->result = src;
1743 }
1744
1745 void
1746 vec4_visitor::visit(ir_dereference_variable *ir)
1747 {
1748    const struct glsl_type *type = ir->type;
1749    dst_reg *reg = variable_storage(ir->var);
1750
1751    if (!reg) {
1752       fail("Failed to find variable storage for %s\n", ir->var->name);
1753       this->result = src_reg(brw_null_reg());
1754       return;
1755    }
1756
1757    this->result = src_reg(*reg);
1758
1759    /* System values get their swizzle from the dst_reg writemask */
1760    if (ir->var->data.mode == ir_var_system_value)
1761       return;
1762
1763    if (type->is_scalar() || type->is_vector() || type->is_matrix())
1764       this->result.swizzle = swizzle_for_size(type->vector_elements);
1765 }
1766
1767
1768 int
1769 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1770 {
1771    /* Under normal circumstances array elements are stored consecutively, so
1772     * the stride is equal to the size of the array element.
1773     */
1774    return type_size(ir->type);
1775 }
1776
1777
1778 void
1779 vec4_visitor::visit(ir_dereference_array *ir)
1780 {
1781    ir_constant *constant_index;
1782    src_reg src;
1783    int array_stride = compute_array_stride(ir);
1784
1785    constant_index = ir->array_index->constant_expression_value();
1786
1787    ir->array->accept(this);
1788    src = this->result;
1789
1790    if (constant_index) {
1791       src.reg_offset += constant_index->value.i[0] * array_stride;
1792    } else {
1793       /* Variable index array dereference.  It eats the "vec4" of the
1794        * base of the array and an index that offsets the Mesa register
1795        * index.
1796        */
1797       ir->array_index->accept(this);
1798
1799       src_reg index_reg;
1800
1801       if (array_stride == 1) {
1802          index_reg = this->result;
1803       } else {
1804          index_reg = src_reg(this, glsl_type::int_type);
1805
1806          emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1807       }
1808
1809       if (src.reladdr) {
1810          src_reg temp = src_reg(this, glsl_type::int_type);
1811
1812          emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1813
1814          index_reg = temp;
1815       }
1816
1817       src.reladdr = ralloc(mem_ctx, src_reg);
1818       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1819    }
1820
1821    /* If the type is smaller than a vec4, replicate the last channel out. */
1822    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1823       src.swizzle = swizzle_for_size(ir->type->vector_elements);
1824    else
1825       src.swizzle = BRW_SWIZZLE_NOOP;
1826    src.type = brw_type_for_base_type(ir->type);
1827
1828    this->result = src;
1829 }
1830
1831 void
1832 vec4_visitor::visit(ir_dereference_record *ir)
1833 {
1834    unsigned int i;
1835    const glsl_type *struct_type = ir->record->type;
1836    int offset = 0;
1837
1838    ir->record->accept(this);
1839
1840    for (i = 0; i < struct_type->length; i++) {
1841       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1842          break;
1843       offset += type_size(struct_type->fields.structure[i].type);
1844    }
1845
1846    /* If the type is smaller than a vec4, replicate the last channel out. */
1847    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1848       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1849    else
1850       this->result.swizzle = BRW_SWIZZLE_NOOP;
1851    this->result.type = brw_type_for_base_type(ir->type);
1852
1853    this->result.reg_offset += offset;
1854 }
1855
1856 /**
1857  * We want to be careful in assignment setup to hit the actual storage
1858  * instead of potentially using a temporary like we might with the
1859  * ir_dereference handler.
1860  */
1861 static dst_reg
1862 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1863 {
1864    /* The LHS must be a dereference.  If the LHS is a variable indexed array
1865     * access of a vector, it must be separated into a series conditional moves
1866     * before reaching this point (see ir_vec_index_to_cond_assign).
1867     */
1868    assert(ir->as_dereference());
1869    ir_dereference_array *deref_array = ir->as_dereference_array();
1870    if (deref_array) {
1871       assert(!deref_array->array->type->is_vector());
1872    }
1873
1874    /* Use the rvalue deref handler for the most part.  We'll ignore
1875     * swizzles in it and write swizzles using writemask, though.
1876     */
1877    ir->accept(v);
1878    return dst_reg(v->result);
1879 }
1880
1881 void
1882 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1883                               const struct glsl_type *type, uint32_t predicate)
1884 {
1885    if (type->base_type == GLSL_TYPE_STRUCT) {
1886       for (unsigned int i = 0; i < type->length; i++) {
1887          emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1888       }
1889       return;
1890    }
1891
1892    if (type->is_array()) {
1893       for (unsigned int i = 0; i < type->length; i++) {
1894          emit_block_move(dst, src, type->fields.array, predicate);
1895       }
1896       return;
1897    }
1898
1899    if (type->is_matrix()) {
1900       const struct glsl_type *vec_type;
1901
1902       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1903                                          type->vector_elements, 1);
1904
1905       for (int i = 0; i < type->matrix_columns; i++) {
1906          emit_block_move(dst, src, vec_type, predicate);
1907       }
1908       return;
1909    }
1910
1911    assert(type->is_scalar() || type->is_vector());
1912
1913    dst->type = brw_type_for_base_type(type);
1914    src->type = dst->type;
1915
1916    dst->writemask = (1 << type->vector_elements) - 1;
1917
1918    src->swizzle = swizzle_for_size(type->vector_elements);
1919
1920    vec4_instruction *inst = emit(MOV(*dst, *src));
1921    inst->predicate = predicate;
1922
1923    dst->reg_offset++;
1924    src->reg_offset++;
1925 }
1926
1927
1928 /* If the RHS processing resulted in an instruction generating a
1929  * temporary value, and it would be easy to rewrite the instruction to
1930  * generate its result right into the LHS instead, do so.  This ends
1931  * up reliably removing instructions where it can be tricky to do so
1932  * later without real UD chain information.
1933  */
1934 bool
1935 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1936                                      dst_reg dst,
1937                                      src_reg src,
1938                                      vec4_instruction *pre_rhs_inst,
1939                                      vec4_instruction *last_rhs_inst)
1940 {
1941    /* This could be supported, but it would take more smarts. */
1942    if (ir->condition)
1943       return false;
1944
1945    if (pre_rhs_inst == last_rhs_inst)
1946       return false; /* No instructions generated to work with. */
1947
1948    /* Make sure the last instruction generated our source reg. */
1949    if (src.file != GRF ||
1950        src.file != last_rhs_inst->dst.file ||
1951        src.reg != last_rhs_inst->dst.reg ||
1952        src.reg_offset != last_rhs_inst->dst.reg_offset ||
1953        src.reladdr ||
1954        src.abs ||
1955        src.negate ||
1956        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1957       return false;
1958
1959    /* Check that that last instruction fully initialized the channels
1960     * we want to use, in the order we want to use them.  We could
1961     * potentially reswizzle the operands of many instructions so that
1962     * we could handle out of order channels, but don't yet.
1963     */
1964
1965    for (unsigned i = 0; i < 4; i++) {
1966       if (dst.writemask & (1 << i)) {
1967          if (!(last_rhs_inst->dst.writemask & (1 << i)))
1968             return false;
1969
1970          if (BRW_GET_SWZ(src.swizzle, i) != i)
1971             return false;
1972       }
1973    }
1974
1975    /* Success!  Rewrite the instruction. */
1976    last_rhs_inst->dst.file = dst.file;
1977    last_rhs_inst->dst.reg = dst.reg;
1978    last_rhs_inst->dst.reg_offset = dst.reg_offset;
1979    last_rhs_inst->dst.reladdr = dst.reladdr;
1980    last_rhs_inst->dst.writemask &= dst.writemask;
1981
1982    return true;
1983 }
1984
1985 void
1986 vec4_visitor::visit(ir_assignment *ir)
1987 {
1988    dst_reg dst = get_assignment_lhs(ir->lhs, this);
1989    uint32_t predicate = BRW_PREDICATE_NONE;
1990
1991    if (!ir->lhs->type->is_scalar() &&
1992        !ir->lhs->type->is_vector()) {
1993       ir->rhs->accept(this);
1994       src_reg src = this->result;
1995
1996       if (ir->condition) {
1997          emit_bool_to_cond_code(ir->condition, &predicate);
1998       }
1999
2000       /* emit_block_move doesn't account for swizzles in the source register.
2001        * This should be ok, since the source register is a structure or an
2002        * array, and those can't be swizzled.  But double-check to be sure.
2003        */
2004       assert(src.swizzle ==
2005              (ir->rhs->type->is_matrix()
2006               ? swizzle_for_size(ir->rhs->type->vector_elements)
2007               : BRW_SWIZZLE_NOOP));
2008
2009       emit_block_move(&dst, &src, ir->rhs->type, predicate);
2010       return;
2011    }
2012
2013    /* Now we're down to just a scalar/vector with writemasks. */
2014    int i;
2015
2016    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2017    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2018
2019    ir->rhs->accept(this);
2020
2021    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2022
2023    src_reg src = this->result;
2024
2025    int swizzles[4];
2026    int first_enabled_chan = 0;
2027    int src_chan = 0;
2028
2029    assert(ir->lhs->type->is_vector() ||
2030           ir->lhs->type->is_scalar());
2031    dst.writemask = ir->write_mask;
2032
2033    for (int i = 0; i < 4; i++) {
2034       if (dst.writemask & (1 << i)) {
2035          first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2036          break;
2037       }
2038    }
2039
2040    /* Swizzle a small RHS vector into the channels being written.
2041     *
2042     * glsl ir treats write_mask as dictating how many channels are
2043     * present on the RHS while in our instructions we need to make
2044     * those channels appear in the slots of the vec4 they're written to.
2045     */
2046    for (int i = 0; i < 4; i++) {
2047       if (dst.writemask & (1 << i))
2048          swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2049       else
2050          swizzles[i] = first_enabled_chan;
2051    }
2052    src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2053                               swizzles[2], swizzles[3]);
2054
2055    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2056       return;
2057    }
2058
2059    if (ir->condition) {
2060       emit_bool_to_cond_code(ir->condition, &predicate);
2061    }
2062
2063    for (i = 0; i < type_size(ir->lhs->type); i++) {
2064       vec4_instruction *inst = emit(MOV(dst, src));
2065       inst->predicate = predicate;
2066
2067       dst.reg_offset++;
2068       src.reg_offset++;
2069    }
2070 }
2071
2072 void
2073 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2074 {
2075    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2076       foreach_list(node, &ir->components) {
2077          ir_constant *field_value = (ir_constant *)node;
2078
2079          emit_constant_values(dst, field_value);
2080       }
2081       return;
2082    }
2083
2084    if (ir->type->is_array()) {
2085       for (unsigned int i = 0; i < ir->type->length; i++) {
2086          emit_constant_values(dst, ir->array_elements[i]);
2087       }
2088       return;
2089    }
2090
2091    if (ir->type->is_matrix()) {
2092       for (int i = 0; i < ir->type->matrix_columns; i++) {
2093          float *vec = &ir->value.f[i * ir->type->vector_elements];
2094
2095          for (int j = 0; j < ir->type->vector_elements; j++) {
2096             dst->writemask = 1 << j;
2097             dst->type = BRW_REGISTER_TYPE_F;
2098
2099             emit(MOV(*dst, src_reg(vec[j])));
2100          }
2101          dst->reg_offset++;
2102       }
2103       return;
2104    }
2105
2106    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2107
2108    for (int i = 0; i < ir->type->vector_elements; i++) {
2109       if (!(remaining_writemask & (1 << i)))
2110          continue;
2111
2112       dst->writemask = 1 << i;
2113       dst->type = brw_type_for_base_type(ir->type);
2114
2115       /* Find other components that match the one we're about to
2116        * write.  Emits fewer instructions for things like vec4(0.5,
2117        * 1.5, 1.5, 1.5).
2118        */
2119       for (int j = i + 1; j < ir->type->vector_elements; j++) {
2120          if (ir->type->base_type == GLSL_TYPE_BOOL) {
2121             if (ir->value.b[i] == ir->value.b[j])
2122                dst->writemask |= (1 << j);
2123          } else {
2124             /* u, i, and f storage all line up, so no need for a
2125              * switch case for comparing each type.
2126              */
2127             if (ir->value.u[i] == ir->value.u[j])
2128                dst->writemask |= (1 << j);
2129          }
2130       }
2131
2132       switch (ir->type->base_type) {
2133       case GLSL_TYPE_FLOAT:
2134          emit(MOV(*dst, src_reg(ir->value.f[i])));
2135          break;
2136       case GLSL_TYPE_INT:
2137          emit(MOV(*dst, src_reg(ir->value.i[i])));
2138          break;
2139       case GLSL_TYPE_UINT:
2140          emit(MOV(*dst, src_reg(ir->value.u[i])));
2141          break;
2142       case GLSL_TYPE_BOOL:
2143          emit(MOV(*dst, src_reg(ir->value.b[i])));
2144          break;
2145       default:
2146          assert(!"Non-float/uint/int/bool constant");
2147          break;
2148       }
2149
2150       remaining_writemask &= ~dst->writemask;
2151    }
2152    dst->reg_offset++;
2153 }
2154
2155 void
2156 vec4_visitor::visit(ir_constant *ir)
2157 {
2158    dst_reg dst = dst_reg(this, ir->type);
2159    this->result = src_reg(dst);
2160
2161    emit_constant_values(&dst, ir);
2162 }
2163
2164 void
2165 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2166 {
2167    ir_dereference *deref = static_cast<ir_dereference *>(
2168       ir->actual_parameters.get_head());
2169    ir_variable *location = deref->variable_referenced();
2170    unsigned surf_index = (prog_data->base.binding_table.abo_start +
2171                           location->data.atomic.buffer_index);
2172
2173    /* Calculate the surface offset */
2174    src_reg offset(this, glsl_type::uint_type);
2175    ir_dereference_array *deref_array = deref->as_dereference_array();
2176    if (deref_array) {
2177       deref_array->array_index->accept(this);
2178
2179       src_reg tmp(this, glsl_type::uint_type);
2180       emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2181       emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2182    } else {
2183       offset = location->data.atomic.offset;
2184    }
2185
2186    /* Emit the appropriate machine instruction */
2187    const char *callee = ir->callee->function_name();
2188    dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2189
2190    if (!strcmp("__intrinsic_atomic_read", callee)) {
2191       emit_untyped_surface_read(surf_index, dst, offset);
2192
2193    } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2194       emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2195                           src_reg(), src_reg());
2196
2197    } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2198       emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2199                           src_reg(), src_reg());
2200    }
2201 }
2202
2203 void
2204 vec4_visitor::visit(ir_call *ir)
2205 {
2206    const char *callee = ir->callee->function_name();
2207
2208    if (!strcmp("__intrinsic_atomic_read", callee) ||
2209        !strcmp("__intrinsic_atomic_increment", callee) ||
2210        !strcmp("__intrinsic_atomic_predecrement", callee)) {
2211       visit_atomic_counter_intrinsic(ir);
2212    } else {
2213       assert(!"Unsupported intrinsic.");
2214    }
2215 }
2216
2217 src_reg
2218 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2219 {
2220    vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2221    inst->base_mrf = 2;
2222    inst->mlen = 1;
2223    inst->sampler = sampler;
2224    inst->dst = dst_reg(this, glsl_type::uvec4_type);
2225    inst->dst.writemask = WRITEMASK_XYZW;
2226
2227    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2228    int param_base = inst->base_mrf;
2229    int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2230    int zero_mask = 0xf & ~coord_mask;
2231
2232    emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2233             coordinate));
2234
2235    emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2236             src_reg(0)));
2237
2238    emit(inst);
2239    return src_reg(inst->dst);
2240 }
2241
2242 void
2243 vec4_visitor::visit(ir_texture *ir)
2244 {
2245    int sampler =
2246       _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2247
2248    /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2249     * emitting anything other than setting up the constant result.
2250     */
2251    if (ir->op == ir_tg4) {
2252       ir_constant *chan = ir->lod_info.component->as_constant();
2253       int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2254       if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2255          dst_reg result(this, ir->type);
2256          this->result = src_reg(result);
2257          emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2258          return;
2259       }
2260    }
2261
2262    /* Should be lowered by do_lower_texture_projection */
2263    assert(!ir->projector);
2264
2265    /* Should be lowered */
2266    assert(!ir->offset || !ir->offset->type->is_array());
2267
2268    /* Generate code to compute all the subexpression trees.  This has to be
2269     * done before loading any values into MRFs for the sampler message since
2270     * generating these values may involve SEND messages that need the MRFs.
2271     */
2272    src_reg coordinate;
2273    if (ir->coordinate) {
2274       ir->coordinate->accept(this);
2275       coordinate = this->result;
2276    }
2277
2278    src_reg shadow_comparitor;
2279    if (ir->shadow_comparitor) {
2280       ir->shadow_comparitor->accept(this);
2281       shadow_comparitor = this->result;
2282    }
2283
2284    bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2285    src_reg offset_value;
2286    if (has_nonconstant_offset) {
2287       ir->offset->accept(this);
2288       offset_value = src_reg(this->result);
2289    }
2290
2291    const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2292    src_reg lod, dPdx, dPdy, sample_index, mcs;
2293    switch (ir->op) {
2294    case ir_tex:
2295       lod = src_reg(0.0f);
2296       lod_type = glsl_type::float_type;
2297       break;
2298    case ir_txf:
2299    case ir_txl:
2300    case ir_txs:
2301       ir->lod_info.lod->accept(this);
2302       lod = this->result;
2303       lod_type = ir->lod_info.lod->type;
2304       break;
2305    case ir_query_levels:
2306       lod = src_reg(0);
2307       lod_type = glsl_type::int_type;
2308       break;
2309    case ir_txf_ms:
2310       ir->lod_info.sample_index->accept(this);
2311       sample_index = this->result;
2312       sample_index_type = ir->lod_info.sample_index->type;
2313
2314       if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2315          mcs = emit_mcs_fetch(ir, coordinate, sampler);
2316       else
2317          mcs = src_reg(0u);
2318       break;
2319    case ir_txd:
2320       ir->lod_info.grad.dPdx->accept(this);
2321       dPdx = this->result;
2322
2323       ir->lod_info.grad.dPdy->accept(this);
2324       dPdy = this->result;
2325
2326       lod_type = ir->lod_info.grad.dPdx->type;
2327       break;
2328    case ir_txb:
2329    case ir_lod:
2330    case ir_tg4:
2331       break;
2332    }
2333
2334    vec4_instruction *inst = NULL;
2335    switch (ir->op) {
2336    case ir_tex:
2337    case ir_txl:
2338       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2339       break;
2340    case ir_txd:
2341       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2342       break;
2343    case ir_txf:
2344       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2345       break;
2346    case ir_txf_ms:
2347       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2348       break;
2349    case ir_txs:
2350       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2351       break;
2352    case ir_tg4:
2353       if (has_nonconstant_offset)
2354          inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2355       else
2356          inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2357       break;
2358    case ir_query_levels:
2359       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2360       break;
2361    case ir_txb:
2362       assert(!"TXB is not valid for vertex shaders.");
2363       break;
2364    case ir_lod:
2365       assert(!"LOD is not valid for vertex shaders.");
2366       break;
2367    default:
2368       assert(!"Unrecognized tex op");
2369    }
2370
2371    if (ir->offset != NULL && ir->op != ir_txf)
2372       inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2373
2374    /* Stuff the channel select bits in the top of the texture offset */
2375    if (ir->op == ir_tg4)
2376       inst->texture_offset |= gather_channel(ir, sampler) << 16;
2377
2378    /* The message header is necessary for:
2379     * - Gen4 (always)
2380     * - Texel offsets
2381     * - Gather channel selection
2382     * - Sampler indices too large to fit in a 4-bit value.
2383     */
2384    inst->header_present =
2385       brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2386       sampler >= 16;
2387    inst->base_mrf = 2;
2388    inst->mlen = inst->header_present + 1; /* always at least one */
2389    inst->sampler = sampler;
2390    inst->dst = dst_reg(this, ir->type);
2391    inst->dst.writemask = WRITEMASK_XYZW;
2392    inst->shadow_compare = ir->shadow_comparitor != NULL;
2393
2394    /* MRF for the first parameter */
2395    int param_base = inst->base_mrf + inst->header_present;
2396
2397    if (ir->op == ir_txs || ir->op == ir_query_levels) {
2398       int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2399       emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2400    } else {
2401       /* Load the coordinate */
2402       /* FINISHME: gl_clamp_mask and saturate */
2403       int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2404       int zero_mask = 0xf & ~coord_mask;
2405
2406       emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2407                coordinate));
2408
2409       if (zero_mask != 0) {
2410          emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2411                   src_reg(0)));
2412       }
2413       /* Load the shadow comparitor */
2414       if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2415          emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2416                           WRITEMASK_X),
2417                   shadow_comparitor));
2418          inst->mlen++;
2419       }
2420
2421       /* Load the LOD info */
2422       if (ir->op == ir_tex || ir->op == ir_txl) {
2423          int mrf, writemask;
2424          if (brw->gen >= 5) {
2425             mrf = param_base + 1;
2426             if (ir->shadow_comparitor) {
2427                writemask = WRITEMASK_Y;
2428                /* mlen already incremented */
2429             } else {
2430                writemask = WRITEMASK_X;
2431                inst->mlen++;
2432             }
2433          } else /* brw->gen == 4 */ {
2434             mrf = param_base;
2435             writemask = WRITEMASK_W;
2436          }
2437          emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2438       } else if (ir->op == ir_txf) {
2439          emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2440       } else if (ir->op == ir_txf_ms) {
2441          emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2442                   sample_index));
2443          if (brw->gen >= 7)
2444             /* MCS data is in the first channel of `mcs`, but we need to get it into
2445              * the .y channel of the second vec4 of params, so replicate .x across
2446              * the whole vec4 and then mask off everything except .y
2447              */
2448             mcs.swizzle = BRW_SWIZZLE_XXXX;
2449             emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2450                      mcs));
2451          inst->mlen++;
2452       } else if (ir->op == ir_txd) {
2453          const glsl_type *type = lod_type;
2454
2455          if (brw->gen >= 5) {
2456             dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2457             dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2458             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2459             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2460             inst->mlen++;
2461
2462             if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2463                dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2464                dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2465                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2466                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2467                inst->mlen++;
2468
2469                if (ir->shadow_comparitor) {
2470                   emit(MOV(dst_reg(MRF, param_base + 2,
2471                                    ir->shadow_comparitor->type, WRITEMASK_Z),
2472                            shadow_comparitor));
2473                }
2474             }
2475          } else /* brw->gen == 4 */ {
2476             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2477             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2478             inst->mlen += 2;
2479          }
2480       } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2481          if (ir->shadow_comparitor) {
2482             emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2483                      shadow_comparitor));
2484          }
2485
2486          emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2487                   offset_value));
2488          inst->mlen++;
2489       }
2490    }
2491
2492    emit(inst);
2493
2494    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2495     * spec requires layers.
2496     */
2497    if (ir->op == ir_txs) {
2498       glsl_type const *type = ir->sampler->type;
2499       if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2500           type->sampler_array) {
2501          emit_math(SHADER_OPCODE_INT_QUOTIENT,
2502                    writemask(inst->dst, WRITEMASK_Z),
2503                    src_reg(inst->dst), src_reg(6));
2504       }
2505    }
2506
2507    if (brw->gen == 6 && ir->op == ir_tg4) {
2508       emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2509    }
2510
2511    swizzle_result(ir, src_reg(inst->dst), sampler);
2512 }
2513
2514 /**
2515  * Apply workarounds for Gen6 gather with UINT/SINT
2516  */
2517 void
2518 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2519 {
2520    if (!wa)
2521       return;
2522
2523    int width = (wa & WA_8BIT) ? 8 : 16;
2524    dst_reg dst_f = dst;
2525    dst_f.type = BRW_REGISTER_TYPE_F;
2526
2527    /* Convert from UNORM to UINT */
2528    emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2529    emit(MOV(dst, src_reg(dst_f)));
2530
2531    if (wa & WA_SIGN) {
2532       /* Reinterpret the UINT value as a signed INT value by
2533        * shifting the sign bit into place, then shifting back
2534        * preserving sign.
2535        */
2536       emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2537       emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2538    }
2539 }
2540
2541 /**
2542  * Set up the gather channel based on the swizzle, for gather4.
2543  */
2544 uint32_t
2545 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2546 {
2547    ir_constant *chan = ir->lod_info.component->as_constant();
2548    int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2549    switch (swiz) {
2550       case SWIZZLE_X: return 0;
2551       case SWIZZLE_Y:
2552          /* gather4 sampler is broken for green channel on RG32F --
2553           * we must ask for blue instead.
2554           */
2555          if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2556             return 2;
2557          return 1;
2558       case SWIZZLE_Z: return 2;
2559       case SWIZZLE_W: return 3;
2560       default:
2561          assert(!"Not reached"); /* zero, one swizzles handled already */
2562          return 0;
2563    }
2564 }
2565
2566 void
2567 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2568 {
2569    int s = key->tex.swizzles[sampler];
2570
2571    this->result = src_reg(this, ir->type);
2572    dst_reg swizzled_result(this->result);
2573
2574    if (ir->op == ir_query_levels) {
2575       /* # levels is in .w */
2576       orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2577       emit(MOV(swizzled_result, orig_val));
2578       return;
2579    }
2580
2581    if (ir->op == ir_txs || ir->type == glsl_type::float_type
2582                         || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2583       emit(MOV(swizzled_result, orig_val));
2584       return;
2585    }
2586
2587
2588    int zero_mask = 0, one_mask = 0, copy_mask = 0;
2589    int swizzle[4] = {0};
2590
2591    for (int i = 0; i < 4; i++) {
2592       switch (GET_SWZ(s, i)) {
2593       case SWIZZLE_ZERO:
2594          zero_mask |= (1 << i);
2595          break;
2596       case SWIZZLE_ONE:
2597          one_mask |= (1 << i);
2598          break;
2599       default:
2600          copy_mask |= (1 << i);
2601          swizzle[i] = GET_SWZ(s, i);
2602          break;
2603       }
2604    }
2605
2606    if (copy_mask) {
2607       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2608       swizzled_result.writemask = copy_mask;
2609       emit(MOV(swizzled_result, orig_val));
2610    }
2611
2612    if (zero_mask) {
2613       swizzled_result.writemask = zero_mask;
2614       emit(MOV(swizzled_result, src_reg(0.0f)));
2615    }
2616
2617    if (one_mask) {
2618       swizzled_result.writemask = one_mask;
2619       emit(MOV(swizzled_result, src_reg(1.0f)));
2620    }
2621 }
2622
2623 void
2624 vec4_visitor::visit(ir_return *ir)
2625 {
2626    assert(!"not reached");
2627 }
2628
2629 void
2630 vec4_visitor::visit(ir_discard *ir)
2631 {
2632    assert(!"not reached");
2633 }
2634
2635 void
2636 vec4_visitor::visit(ir_if *ir)
2637 {
2638    /* Don't point the annotation at the if statement, because then it plus
2639     * the then and else blocks get printed.
2640     */
2641    this->base_ir = ir->condition;
2642
2643    if (brw->gen == 6) {
2644       emit_if_gen6(ir);
2645    } else {
2646       uint32_t predicate;
2647       emit_bool_to_cond_code(ir->condition, &predicate);
2648       emit(IF(predicate));
2649    }
2650
2651    visit_instructions(&ir->then_instructions);
2652
2653    if (!ir->else_instructions.is_empty()) {
2654       this->base_ir = ir->condition;
2655       emit(BRW_OPCODE_ELSE);
2656
2657       visit_instructions(&ir->else_instructions);
2658    }
2659
2660    this->base_ir = ir->condition;
2661    emit(BRW_OPCODE_ENDIF);
2662 }
2663
2664 void
2665 vec4_visitor::visit(ir_emit_vertex *)
2666 {
2667    assert(!"not reached");
2668 }
2669
2670 void
2671 vec4_visitor::visit(ir_end_primitive *)
2672 {
2673    assert(!"not reached");
2674 }
2675
2676 void
2677 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2678                                   dst_reg dst, src_reg offset,
2679                                   src_reg src0, src_reg src1)
2680 {
2681    unsigned mlen = 0;
2682
2683    /* Set the atomic operation offset. */
2684    emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2685    mlen++;
2686
2687    /* Set the atomic operation arguments. */
2688    if (src0.file != BAD_FILE) {
2689       emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2690       mlen++;
2691    }
2692
2693    if (src1.file != BAD_FILE) {
2694       emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2695       mlen++;
2696    }
2697
2698    /* Emit the instruction.  Note that this maps to the normal SIMD8
2699     * untyped atomic message on Ivy Bridge, but that's OK because
2700     * unused channels will be masked out.
2701     */
2702    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2703                                  src_reg(atomic_op), src_reg(surf_index));
2704    inst->base_mrf = 0;
2705    inst->mlen = mlen;
2706 }
2707
2708 void
2709 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2710                                         src_reg offset)
2711 {
2712    /* Set the surface read offset. */
2713    emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2714
2715    /* Emit the instruction.  Note that this maps to the normal SIMD8
2716     * untyped surface read message, but that's OK because unused
2717     * channels will be masked out.
2718     */
2719    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2720                                  dst, src_reg(surf_index));
2721    inst->base_mrf = 0;
2722    inst->mlen = 1;
2723 }
2724
2725 void
2726 vec4_visitor::emit_ndc_computation()
2727 {
2728    /* Get the position */
2729    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2730
2731    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2732    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2733    output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2734
2735    current_annotation = "NDC";
2736    dst_reg ndc_w = ndc;
2737    ndc_w.writemask = WRITEMASK_W;
2738    src_reg pos_w = pos;
2739    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2740    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2741
2742    dst_reg ndc_xyz = ndc;
2743    ndc_xyz.writemask = WRITEMASK_XYZ;
2744
2745    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2746 }
2747
2748 void
2749 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2750 {
2751    if (brw->gen < 6 &&
2752        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2753         key->userclip_active || brw->has_negative_rhw_bug)) {
2754       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2755       dst_reg header1_w = header1;
2756       header1_w.writemask = WRITEMASK_W;
2757
2758       emit(MOV(header1, 0u));
2759
2760       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2761          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2762
2763          current_annotation = "Point size";
2764          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2765          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2766       }
2767
2768       if (key->userclip_active) {
2769          current_annotation = "Clipping flags";
2770          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2771          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2772
2773          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2774          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2775          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2776
2777          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2778          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2779          emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2780          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2781       }
2782
2783       /* i965 clipping workaround:
2784        * 1) Test for -ve rhw
2785        * 2) If set,
2786        *      set ndc = (0,0,0,0)
2787        *      set ucp[6] = 1
2788        *
2789        * Later, clipping will detect ucp[6] and ensure the primitive is
2790        * clipped against all fixed planes.
2791        */
2792       if (brw->has_negative_rhw_bug) {
2793          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2794          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2795          emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2796          vec4_instruction *inst;
2797          inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2798          inst->predicate = BRW_PREDICATE_NORMAL;
2799          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2800          inst->predicate = BRW_PREDICATE_NORMAL;
2801       }
2802
2803       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2804    } else if (brw->gen < 6) {
2805       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2806    } else {
2807       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2808       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2809          emit(MOV(brw_writemask(reg, WRITEMASK_W),
2810                   src_reg(output_reg[VARYING_SLOT_PSIZ])));
2811       }
2812       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2813          emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2814                   src_reg(output_reg[VARYING_SLOT_LAYER])));
2815       }
2816       if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2817          emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2818                   src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2819       }
2820    }
2821 }
2822
2823 void
2824 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2825 {
2826    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2827     *
2828     *     "If a linked set of shaders forming the vertex stage contains no
2829     *     static write to gl_ClipVertex or gl_ClipDistance, but the
2830     *     application has requested clipping against user clip planes through
2831     *     the API, then the coordinate written to gl_Position is used for
2832     *     comparison against the user clip planes."
2833     *
2834     * This function is only called if the shader didn't write to
2835     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
2836     * if the user wrote to it; otherwise we use gl_Position.
2837     */
2838    gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2839    if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2840       clip_vertex = VARYING_SLOT_POS;
2841    }
2842
2843    for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2844         ++i) {
2845       reg.writemask = 1 << i;
2846       emit(DP4(reg,
2847                src_reg(output_reg[clip_vertex]),
2848                src_reg(this->userplane[i + offset])));
2849    }
2850 }
2851
2852 void
2853 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2854 {
2855    assert (varying < VARYING_SLOT_MAX);
2856    reg.type = output_reg[varying].type;
2857    current_annotation = output_reg_annotation[varying];
2858    /* Copy the register, saturating if necessary */
2859    vec4_instruction *inst = emit(MOV(reg,
2860                                      src_reg(output_reg[varying])));
2861    if ((varying == VARYING_SLOT_COL0 ||
2862         varying == VARYING_SLOT_COL1 ||
2863         varying == VARYING_SLOT_BFC0 ||
2864         varying == VARYING_SLOT_BFC1) &&
2865        key->clamp_vertex_color) {
2866       inst->saturate = true;
2867    }
2868 }
2869
2870 void
2871 vec4_visitor::emit_urb_slot(int mrf, int varying)
2872 {
2873    struct brw_reg hw_reg = brw_message_reg(mrf);
2874    dst_reg reg = dst_reg(MRF, mrf);
2875    reg.type = BRW_REGISTER_TYPE_F;
2876
2877    switch (varying) {
2878    case VARYING_SLOT_PSIZ:
2879       /* PSIZ is always in slot 0, and is coupled with other flags. */
2880       current_annotation = "indices, point width, clip flags";
2881       emit_psiz_and_flags(hw_reg);
2882       break;
2883    case BRW_VARYING_SLOT_NDC:
2884       current_annotation = "NDC";
2885       emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2886       break;
2887    case VARYING_SLOT_POS:
2888       current_annotation = "gl_Position";
2889       emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2890       break;
2891    case VARYING_SLOT_EDGE:
2892       /* This is present when doing unfilled polygons.  We're supposed to copy
2893        * the edge flag from the user-provided vertex array
2894        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2895        * of that attribute (starts as 1.0f).  This is then used in clipping to
2896        * determine which edges should be drawn as wireframe.
2897        */
2898       current_annotation = "edge flag";
2899       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2900                                     glsl_type::float_type, WRITEMASK_XYZW))));
2901       break;
2902    case BRW_VARYING_SLOT_PAD:
2903       /* No need to write to this slot */
2904       break;
2905    default:
2906       emit_generic_urb_slot(reg, varying);
2907       break;
2908    }
2909 }
2910
2911 static int
2912 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2913 {
2914    if (brw->gen >= 6) {
2915       /* URB data written (does not include the message header reg) must
2916        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
2917        * section 5.4.3.2.2: URB_INTERLEAVED.
2918        *
2919        * URB entries are allocated on a multiple of 1024 bits, so an
2920        * extra 128 bits written here to make the end align to 256 is
2921        * no problem.
2922        */
2923       if ((mlen % 2) != 1)
2924          mlen++;
2925    }
2926
2927    return mlen;
2928 }
2929
2930
2931 /**
2932  * Generates the VUE payload plus the necessary URB write instructions to
2933  * output it.
2934  *
2935  * The VUE layout is documented in Volume 2a.
2936  */
2937 void
2938 vec4_visitor::emit_vertex()
2939 {
2940    /* MRF 0 is reserved for the debugger, so start with message header
2941     * in MRF 1.
2942     */
2943    int base_mrf = 1;
2944    int mrf = base_mrf;
2945    /* In the process of generating our URB write message contents, we
2946     * may need to unspill a register or load from an array.  Those
2947     * reads would use MRFs 14-15.
2948     */
2949    int max_usable_mrf = 13;
2950
2951    /* The following assertion verifies that max_usable_mrf causes an
2952     * even-numbered amount of URB write data, which will meet gen6's
2953     * requirements for length alignment.
2954     */
2955    assert ((max_usable_mrf - base_mrf) % 2 == 0);
2956
2957    /* First mrf is the g0-based message header containing URB handles and
2958     * such.
2959     */
2960    emit_urb_write_header(mrf++);
2961
2962    if (brw->gen < 6) {
2963       emit_ndc_computation();
2964    }
2965
2966    /* Lower legacy ff and ClipVertex clipping to clip distances */
2967    if (key->userclip_active && !prog->UsesClipDistanceOut) {
2968       current_annotation = "user clip distances";
2969
2970       output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2971       output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2972
2973       emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2974       emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2975    }
2976
2977    /* We may need to split this up into several URB writes, so do them in a
2978     * loop.
2979     */
2980    int slot = 0;
2981    bool complete = false;
2982    do {
2983       /* URB offset is in URB row increments, and each of our MRFs is half of
2984        * one of those, since we're doing interleaved writes.
2985        */
2986       int offset = slot / 2;
2987
2988       mrf = base_mrf + 1;
2989       for (; slot < prog_data->vue_map.num_slots; ++slot) {
2990          emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2991
2992          /* If this was max_usable_mrf, we can't fit anything more into this
2993           * URB WRITE.
2994           */
2995          if (mrf > max_usable_mrf) {
2996             slot++;
2997             break;
2998          }
2999       }
3000
3001       complete = slot >= prog_data->vue_map.num_slots;
3002       current_annotation = "URB write";
3003       vec4_instruction *inst = emit_urb_write_opcode(complete);
3004       inst->base_mrf = base_mrf;
3005       inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3006       inst->offset += offset;
3007    } while(!complete);
3008 }
3009
3010
3011 src_reg
3012 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3013                                  src_reg *reladdr, int reg_offset)
3014 {
3015    /* Because we store the values to scratch interleaved like our
3016     * vertex data, we need to scale the vec4 index by 2.
3017     */
3018    int message_header_scale = 2;
3019
3020    /* Pre-gen6, the message header uses byte offsets instead of vec4
3021     * (16-byte) offset units.
3022     */
3023    if (brw->gen < 6)
3024       message_header_scale *= 16;
3025
3026    if (reladdr) {
3027       src_reg index = src_reg(this, glsl_type::int_type);
3028
3029       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3030       emit_before(inst, MUL(dst_reg(index),
3031                             index, src_reg(message_header_scale)));
3032
3033       return index;
3034    } else {
3035       return src_reg(reg_offset * message_header_scale);
3036    }
3037 }
3038
3039 src_reg
3040 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3041                                        src_reg *reladdr, int reg_offset)
3042 {
3043    if (reladdr) {
3044       src_reg index = src_reg(this, glsl_type::int_type);
3045
3046       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3047
3048       /* Pre-gen6, the message header uses byte offsets instead of vec4
3049        * (16-byte) offset units.
3050        */
3051       if (brw->gen < 6) {
3052          emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3053       }
3054
3055       return index;
3056    } else if (brw->gen >= 8) {
3057       /* Store the offset in a GRF so we can send-from-GRF. */
3058       src_reg offset = src_reg(this, glsl_type::int_type);
3059       emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3060       return offset;
3061    } else {
3062       int message_header_scale = brw->gen < 6 ? 16 : 1;
3063       return src_reg(reg_offset * message_header_scale);
3064    }
3065 }
3066
3067 /**
3068  * Emits an instruction before @inst to load the value named by @orig_src
3069  * from scratch space at @base_offset to @temp.
3070  *
3071  * @base_offset is measured in 32-byte units (the size of a register).
3072  */
3073 void
3074 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3075                                 dst_reg temp, src_reg orig_src,
3076                                 int base_offset)
3077 {
3078    int reg_offset = base_offset + orig_src.reg_offset;
3079    src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3080
3081    emit_before(inst, SCRATCH_READ(temp, index));
3082 }
3083
3084 /**
3085  * Emits an instruction after @inst to store the value to be written
3086  * to @orig_dst to scratch space at @base_offset, from @temp.
3087  *
3088  * @base_offset is measured in 32-byte units (the size of a register).
3089  */
3090 void
3091 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3092 {
3093    int reg_offset = base_offset + inst->dst.reg_offset;
3094    src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3095
3096    /* Create a temporary register to store *inst's result in.
3097     *
3098     * We have to be careful in MOVing from our temporary result register in
3099     * the scratch write.  If we swizzle from channels of the temporary that
3100     * weren't initialized, it will confuse live interval analysis, which will
3101     * make spilling fail to make progress.
3102     */
3103    src_reg temp = src_reg(this, glsl_type::vec4_type);
3104    temp.type = inst->dst.type;
3105    int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3106    int swizzles[4];
3107    for (int i = 0; i < 4; i++)
3108       if (inst->dst.writemask & (1 << i))
3109          swizzles[i] = i;
3110       else
3111          swizzles[i] = first_writemask_chan;
3112    temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3113                                swizzles[2], swizzles[3]);
3114
3115    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3116                                        inst->dst.writemask));
3117    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3118    write->predicate = inst->predicate;
3119    write->ir = inst->ir;
3120    write->annotation = inst->annotation;
3121    inst->insert_after(write);
3122
3123    inst->dst.file = temp.file;
3124    inst->dst.reg = temp.reg;
3125    inst->dst.reg_offset = temp.reg_offset;
3126    inst->dst.reladdr = NULL;
3127 }
3128
3129 /**
3130  * We can't generally support array access in GRF space, because a
3131  * single instruction's destination can only span 2 contiguous
3132  * registers.  So, we send all GRF arrays that get variable index
3133  * access to scratch space.
3134  */
3135 void
3136 vec4_visitor::move_grf_array_access_to_scratch()
3137 {
3138    int scratch_loc[this->virtual_grf_count];
3139
3140    for (int i = 0; i < this->virtual_grf_count; i++) {
3141       scratch_loc[i] = -1;
3142    }
3143
3144    /* First, calculate the set of virtual GRFs that need to be punted
3145     * to scratch due to having any array access on them, and where in
3146     * scratch.
3147     */
3148    foreach_list(node, &this->instructions) {
3149       vec4_instruction *inst = (vec4_instruction *)node;
3150
3151       if (inst->dst.file == GRF && inst->dst.reladdr &&
3152           scratch_loc[inst->dst.reg] == -1) {
3153          scratch_loc[inst->dst.reg] = c->last_scratch;
3154          c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3155       }
3156
3157       for (int i = 0 ; i < 3; i++) {
3158          src_reg *src = &inst->src[i];
3159
3160          if (src->file == GRF && src->reladdr &&
3161              scratch_loc[src->reg] == -1) {
3162             scratch_loc[src->reg] = c->last_scratch;
3163             c->last_scratch += this->virtual_grf_sizes[src->reg];
3164          }
3165       }
3166    }
3167
3168    /* Now, for anything that will be accessed through scratch, rewrite
3169     * it to load/store.  Note that this is a _safe list walk, because
3170     * we may generate a new scratch_write instruction after the one
3171     * we're processing.
3172     */
3173    foreach_list_safe(node, &this->instructions) {
3174       vec4_instruction *inst = (vec4_instruction *)node;
3175
3176       /* Set up the annotation tracking for new generated instructions. */
3177       base_ir = inst->ir;
3178       current_annotation = inst->annotation;
3179
3180       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3181          emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3182       }
3183
3184       for (int i = 0 ; i < 3; i++) {
3185          if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3186             continue;
3187
3188          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3189
3190          emit_scratch_read(inst, temp, inst->src[i],
3191                            scratch_loc[inst->src[i].reg]);
3192
3193          inst->src[i].file = temp.file;
3194          inst->src[i].reg = temp.reg;
3195          inst->src[i].reg_offset = temp.reg_offset;
3196          inst->src[i].reladdr = NULL;
3197       }
3198    }
3199 }
3200
3201 /**
3202  * Emits an instruction before @inst to load the value named by @orig_src
3203  * from the pull constant buffer (surface) at @base_offset to @temp.
3204  */
3205 void
3206 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3207                                       dst_reg temp, src_reg orig_src,
3208                                       int base_offset)
3209 {
3210    int reg_offset = base_offset + orig_src.reg_offset;
3211    src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3212    src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3213    vec4_instruction *load;
3214
3215    if (brw->gen >= 7) {
3216       dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3217       grf_offset.type = offset.type;
3218       emit_before(inst, MOV(grf_offset, offset));
3219
3220       load = new(mem_ctx) vec4_instruction(this,
3221                                            VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3222                                            temp, index, src_reg(grf_offset));
3223    } else {
3224       load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3225                                            temp, index, offset);
3226       load->base_mrf = 14;
3227       load->mlen = 1;
3228    }
3229    emit_before(inst, load);
3230 }
3231
3232 /**
3233  * Implements array access of uniforms by inserting a
3234  * PULL_CONSTANT_LOAD instruction.
3235  *
3236  * Unlike temporary GRF array access (where we don't support it due to
3237  * the difficulty of doing relative addressing on instruction
3238  * destinations), we could potentially do array access of uniforms
3239  * that were loaded in GRF space as push constants.  In real-world
3240  * usage we've seen, though, the arrays being used are always larger
3241  * than we could load as push constants, so just always move all
3242  * uniform array access out to a pull constant buffer.
3243  */
3244 void
3245 vec4_visitor::move_uniform_array_access_to_pull_constants()
3246 {
3247    int pull_constant_loc[this->uniforms];
3248
3249    for (int i = 0; i < this->uniforms; i++) {
3250       pull_constant_loc[i] = -1;
3251    }
3252
3253    /* Walk through and find array access of uniforms.  Put a copy of that
3254     * uniform in the pull constant buffer.
3255     *
3256     * Note that we don't move constant-indexed accesses to arrays.  No
3257     * testing has been done of the performance impact of this choice.
3258     */
3259    foreach_list_safe(node, &this->instructions) {
3260       vec4_instruction *inst = (vec4_instruction *)node;
3261
3262       for (int i = 0 ; i < 3; i++) {
3263          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3264             continue;
3265
3266          int uniform = inst->src[i].reg;
3267
3268          /* If this array isn't already present in the pull constant buffer,
3269           * add it.
3270           */
3271          if (pull_constant_loc[uniform] == -1) {
3272             const float **values = &stage_prog_data->param[uniform * 4];
3273
3274             pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3275
3276             for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3277                stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3278                   = values[j];
3279             }
3280          }
3281
3282          /* Set up the annotation tracking for new generated instructions. */
3283          base_ir = inst->ir;
3284          current_annotation = inst->annotation;
3285
3286          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3287
3288          emit_pull_constant_load(inst, temp, inst->src[i],
3289                                  pull_constant_loc[uniform]);
3290
3291          inst->src[i].file = temp.file;
3292          inst->src[i].reg = temp.reg;
3293          inst->src[i].reg_offset = temp.reg_offset;
3294          inst->src[i].reladdr = NULL;
3295       }
3296    }
3297
3298    /* Now there are no accesses of the UNIFORM file with a reladdr, so
3299     * no need to track them as larger-than-vec4 objects.  This will be
3300     * relied on in cutting out unused uniform vectors from push
3301     * constants.
3302     */
3303    split_uniform_registers();
3304 }
3305
3306 void
3307 vec4_visitor::resolve_ud_negate(src_reg *reg)
3308 {
3309    if (reg->type != BRW_REGISTER_TYPE_UD ||
3310        !reg->negate)
3311       return;
3312
3313    src_reg temp = src_reg(this, glsl_type::uvec4_type);
3314    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3315    *reg = temp;
3316 }
3317
3318 vec4_visitor::vec4_visitor(struct brw_context *brw,
3319                            struct brw_vec4_compile *c,
3320                            struct gl_program *prog,
3321                            const struct brw_vec4_prog_key *key,
3322                            struct brw_vec4_prog_data *prog_data,
3323                            struct gl_shader_program *shader_prog,
3324                            struct brw_shader *shader,
3325                            void *mem_ctx,
3326                            bool debug_flag,
3327                            bool no_spills,
3328                            shader_time_shader_type st_base,
3329                            shader_time_shader_type st_written,
3330                            shader_time_shader_type st_reset)
3331    : sanity_param_count(0),
3332      fail_msg(NULL),
3333      first_non_payload_grf(0),
3334      need_all_constants_in_pull_buffer(false),
3335      debug_flag(debug_flag),
3336      no_spills(no_spills),
3337      st_base(st_base),
3338      st_written(st_written),
3339      st_reset(st_reset)
3340 {
3341    this->brw = brw;
3342    this->ctx = &brw->ctx;
3343    this->shader_prog = shader_prog;
3344    this->shader = shader;
3345
3346    this->mem_ctx = mem_ctx;
3347    this->failed = false;
3348
3349    this->base_ir = NULL;
3350    this->current_annotation = NULL;
3351    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3352
3353    this->c = c;
3354    this->prog = prog;
3355    this->key = key;
3356    this->prog_data = prog_data;
3357    this->stage_prog_data = &prog_data->base;
3358
3359    this->variable_ht = hash_table_ctor(0,
3360                                        hash_table_pointer_hash,
3361                                        hash_table_pointer_compare);
3362
3363    this->virtual_grf_start = NULL;
3364    this->virtual_grf_end = NULL;
3365    this->virtual_grf_sizes = NULL;
3366    this->virtual_grf_count = 0;
3367    this->virtual_grf_reg_map = NULL;
3368    this->virtual_grf_reg_count = 0;
3369    this->virtual_grf_array_size = 0;
3370    this->live_intervals_valid = false;
3371
3372    this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3373
3374    this->uniforms = 0;
3375 }
3376
3377 vec4_visitor::~vec4_visitor()
3378 {
3379    hash_table_dtor(this->variable_ht);
3380 }
3381
3382
3383 void
3384 vec4_visitor::fail(const char *format, ...)
3385 {
3386    va_list va;
3387    char *msg;
3388
3389    if (failed)
3390       return;
3391
3392    failed = true;
3393
3394    va_start(va, format);
3395    msg = ralloc_vasprintf(mem_ctx, format, va);
3396    va_end(va);
3397    msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3398
3399    this->fail_msg = msg;
3400
3401    if (debug_flag) {
3402       fprintf(stderr, "%s",  msg);
3403    }
3404 }
3405
3406 } /* namespace brw */