i965/vec4: Mark invariant members as constants in vec4_visitor
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
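/* Construct a vec4 IR instruction with conservative defaults: no saturate,
 * no conditional mod, and an empty message setup (mlen/base_mrf of 0).  The
 * IR pointer and annotation are captured from the visitor for debug output;
 * the emit() helpers below fill in whatever the specific opcode needs.
 */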
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->sampler = 0;
47 this->texture_offset = 0;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = v->base_ir;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_present = false;
53 this->mlen = 0;
54 this->base_mrf = 0;
55 this->offset = 0;
56 this->annotation = v->current_annotation;
57 }
58
59 vec4_instruction *
60 vec4_visitor::emit(vec4_instruction *inst)
61 {
62 this->instructions.push_tail(inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
69 {
70 new_inst->ir = inst->ir;
71 new_inst->annotation = inst->annotation;
72
73 inst->insert_before(new_inst);
74
75 return inst;
76 }
77
78 vec4_instruction *
79 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
80 src_reg src0, src_reg src1, src_reg src2)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
83 src0, src1, src2));
84 }
85
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
89 {
90 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
91 }
92
93 vec4_instruction *
94 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
95 {
96 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
101 {
102 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode)
107 {
108 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
109 }
110
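/* Helpers that build a single ALU instruction of the given opcode.  Note
 * that they only allocate the instruction; they do not append it to the
 * instruction stream, so callers wrap them in emit(), e.g.
 *
 *    emit(MOV(dst, src));
 *
 * ALU3 additionally asserts gen >= 6, since three-source instructions do
 * not exist on earlier hardware.
 */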
111 #define ALU1(op) \
112 vec4_instruction * \
113 vec4_visitor::op(dst_reg dst, src_reg src0) \
114 { \
115 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
116 src0); \
117 }
118
119 #define ALU2(op) \
120 vec4_instruction * \
121 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
122 { \
123 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
124 src0, src1); \
125 }
126
127 #define ALU3(op) \
128 vec4_instruction * \
129 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
130 { \
131 assert(brw->gen >= 6); \
132 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
133 src0, src1, src2); \
134 }
135
136 ALU1(NOT)
137 ALU1(MOV)
138 ALU1(FRC)
139 ALU1(RNDD)
140 ALU1(RNDE)
141 ALU1(RNDZ)
142 ALU1(F32TO16)
143 ALU1(F16TO32)
144 ALU2(ADD)
145 ALU2(MUL)
146 ALU2(MACH)
147 ALU2(AND)
148 ALU2(OR)
149 ALU2(XOR)
150 ALU2(DP3)
151 ALU2(DP4)
152 ALU2(DPH)
153 ALU2(SHL)
154 ALU2(SHR)
155 ALU2(ASR)
156 ALU3(LRP)
157 ALU1(BFREV)
158 ALU3(BFE)
159 ALU2(BFI1)
160 ALU3(BFI2)
161 ALU1(FBH)
162 ALU1(FBL)
163 ALU1(CBIT)
164 ALU3(MAD)
165 ALU2(ADDC)
166 ALU2(SUBB)
167
168 /** Gen4 predicated IF. */
169 vec4_instruction *
170 vec4_visitor::IF(uint32_t predicate)
171 {
172 vec4_instruction *inst;
173
174 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
175 inst->predicate = predicate;
176
177 return inst;
178 }
179
180 /** Gen6 IF with embedded comparison. */
181 vec4_instruction *
182 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
183 {
184 assert(brw->gen == 6);
185
186 vec4_instruction *inst;
187
188 resolve_ud_negate(&src0);
189 resolve_ud_negate(&src1);
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
192 src0, src1);
193 inst->conditional_mod = condition;
194
195 return inst;
196 }
197
198 /**
199 * CMP: Sets the low bit of the destination channels with the result
200 * of the comparison, while the upper bits are undefined, and updates
201 * the flag register with the packed 16 bits of the result.
202 */
203 vec4_instruction *
204 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
205 {
206 vec4_instruction *inst;
207
208 /* original gen4 does type conversion to the destination type
209    * before comparison, producing garbage results for floating
210 * point comparisons.
211 */
212 if (brw->gen == 4) {
213 dst.type = src0.type;
214 if (dst.file == HW_REG)
215 dst.fixed_hw_reg.type = dst.type;
216 }
217
218 resolve_ud_negate(&src0);
219 resolve_ud_negate(&src1);
220
221 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
222 inst->conditional_mod = condition;
223
224 return inst;
225 }
226
227 vec4_instruction *
228 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
229 {
230 vec4_instruction *inst;
231
232 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
233 dst, index);
234 inst->base_mrf = 14;
235 inst->mlen = 2;
236
237 return inst;
238 }
239
240 vec4_instruction *
241 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
242 {
243 vec4_instruction *inst;
244
245 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
246 dst, src, index);
247 inst->base_mrf = 13;
248 inst->mlen = 3;
249
250 return inst;
251 }
252
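/* Emit a dot product of the first `elements` components of src0 and src1.
 * `elements` must be 2, 3 or 4, selecting DP2, DP3 or DP4 respectively;
 * e.g. emit_dp(dst, a, b, 3) emits DP3 dst, a, b.
 */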
253 void
254 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
255 {
256 static enum opcode dot_opcodes[] = {
257 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
258 };
259
260 emit(dot_opcodes[elements - 2], dst, src0, src1);
261 }
262
263 src_reg
264 vec4_visitor::fix_3src_operand(src_reg src)
265 {
266 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
267 * able to use vertical stride of zero to replicate the vec4 uniform, like
268 *
269 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
270 *
271 * But you can't, since vertical stride is always four in three-source
272 * instructions. Instead, insert a MOV instruction to do the replication so
273 * that the three-source instruction can consume it.
274 */
275
276 /* The MOV is only needed if the source is a uniform or immediate. */
277 if (src.file != UNIFORM && src.file != IMM)
278 return src;
279
280 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
281 expanded.type = src.type;
282 emit(MOV(expanded, src));
283 return src_reg(expanded);
284 }
285
286 src_reg
287 vec4_visitor::fix_math_operand(src_reg src)
288 {
289 /* The gen6 math instruction ignores the source modifiers --
290 * swizzle, abs, negate, and at least some parts of the register
291 * region description.
292 *
293 * Rather than trying to enumerate all these cases, *always* expand the
294 * operand to a temp GRF for gen6.
295 *
296 * For gen7, keep the operand as-is, except if immediate, which gen7 still
297 * can't use.
298 */
299
300 if (brw->gen == 7 && src.file != IMM)
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(MOV(expanded, src));
306 return src_reg(expanded);
307 }
308
309 void
310 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
311 {
312 src = fix_math_operand(src);
313
314 if (dst.writemask != WRITEMASK_XYZW) {
315 /* The gen6 math instruction must be align1, so we can't do
316 * writemasks.
317 */
318 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
319
320 emit(opcode, temp_dst, src);
321
322 emit(MOV(dst, src_reg(temp_dst)));
323 } else {
324 emit(opcode, dst, src);
325 }
326 }
327
328 void
329 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
330 {
331 vec4_instruction *inst = emit(opcode, dst, src);
332 inst->base_mrf = 1;
333 inst->mlen = 1;
334 }
335
336 void
337 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
338 {
339 switch (opcode) {
340 case SHADER_OPCODE_RCP:
341 case SHADER_OPCODE_RSQ:
342 case SHADER_OPCODE_SQRT:
343 case SHADER_OPCODE_EXP2:
344 case SHADER_OPCODE_LOG2:
345 case SHADER_OPCODE_SIN:
346 case SHADER_OPCODE_COS:
347 break;
348 default:
349 assert(!"not reached: bad math opcode");
350 return;
351 }
352
353 if (brw->gen >= 6) {
354 return emit_math1_gen6(opcode, dst, src);
355 } else {
356 return emit_math1_gen4(opcode, dst, src);
357 }
358 }
359
360 void
361 vec4_visitor::emit_math2_gen6(enum opcode opcode,
362 dst_reg dst, src_reg src0, src_reg src1)
363 {
364 src0 = fix_math_operand(src0);
365 src1 = fix_math_operand(src1);
366
367 if (dst.writemask != WRITEMASK_XYZW) {
368 /* The gen6 math instruction must be align1, so we can't do
369 * writemasks.
370 */
371 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
372 temp_dst.type = dst.type;
373
374 emit(opcode, temp_dst, src0, src1);
375
376 emit(MOV(dst, src_reg(temp_dst)));
377 } else {
378 emit(opcode, dst, src0, src1);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen4(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 vec4_instruction *inst = emit(opcode, dst, src0, src1);
387 inst->base_mrf = 1;
388 inst->mlen = 2;
389 }
390
391 void
392 vec4_visitor::emit_math(enum opcode opcode,
393 dst_reg dst, src_reg src0, src_reg src1)
394 {
395 switch (opcode) {
396 case SHADER_OPCODE_POW:
397 case SHADER_OPCODE_INT_QUOTIENT:
398 case SHADER_OPCODE_INT_REMAINDER:
399 break;
400 default:
401 assert(!"not reached: unsupported binary math opcode");
402 return;
403 }
404
405 if (brw->gen >= 6) {
406 return emit_math2_gen6(opcode, dst, src0, src1);
407 } else {
408 return emit_math2_gen4(opcode, dst, src0, src1);
409 }
410 }
411
412 void
413 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
414 {
415 if (brw->gen < 7)
416 assert(!"ir_unop_pack_half_2x16 should be lowered");
417
418 assert(dst.type == BRW_REGISTER_TYPE_UD);
419 assert(src0.type == BRW_REGISTER_TYPE_F);
420
421 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
422 *
423 * Because this instruction does not have a 16-bit floating-point type,
424 * the destination data type must be Word (W).
425 *
426 * The destination must be DWord-aligned and specify a horizontal stride
427 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
428 * each destination channel and the upper word is not modified.
429 *
430 * The above restriction implies that the f32to16 instruction must use
431 * align1 mode, because only in align1 mode is it possible to specify
432 * horizontal stride. We choose here to defy the hardware docs and emit
433 * align16 instructions.
434 *
435 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
436 * instructions. I was partially successful in that the code passed all
437 * tests. However, the code was dubiously correct and fragile, and the
438 * tests were not harsh enough to probe that frailty. Not trusting the
439 * code, I chose instead to remain in align16 mode in defiance of the hw
440 * docs).
441 *
442 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
443 * simulator, emitting a f32to16 in align16 mode with UD as destination
444 * data type is safe. The behavior differs from that specified in the PRM
445 * in that the upper word of each destination channel is cleared to 0.
446 */
447
448 dst_reg tmp_dst(this, glsl_type::uvec2_type);
449 src_reg tmp_src(tmp_dst);
450
451 #if 0
452 /* Verify the undocumented behavior on which the following instructions
453 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
454 * then the result of the bit-or instruction below will be incorrect.
455 *
456 * You should inspect the disasm output in order to verify that the MOV is
457 * not optimized away.
458 */
459 emit(MOV(tmp_dst, src_reg(0x12345678u)));
460 #endif
461
462 /* Give tmp the form below, where "." means untouched.
463 *
464 * w z y x w z y x
465 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
466 *
467 * That the upper word of each write-channel be 0 is required for the
468 * following bit-shift and bit-or instructions to work. Note that this
469 * relies on the undocumented hardware behavior mentioned above.
470 */
471 tmp_dst.writemask = WRITEMASK_XY;
472 emit(F32TO16(tmp_dst, src0));
473
474 /* Give the write-channels of dst the form:
475 * 0xhhhh0000
476 */
477 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
478 emit(SHL(dst, tmp_src, src_reg(16u)));
479
480 /* Finally, give the write-channels of dst the form of packHalf2x16's
481 * output:
482 * 0xhhhhllll
483 */
484 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
485 emit(OR(dst, src_reg(dst), tmp_src));
486 }
487
488 void
489 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
490 {
491 if (brw->gen < 7)
492 assert(!"ir_unop_unpack_half_2x16 should be lowered");
493
494 assert(dst.type == BRW_REGISTER_TYPE_F);
495 assert(src0.type == BRW_REGISTER_TYPE_UD);
496
497 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
498 *
499 * Because this instruction does not have a 16-bit floating-point type,
500 * the source data type must be Word (W). The destination type must be
501 * F (Float).
502 *
503 * To use W as the source data type, we must adjust horizontal strides,
504 * which is only possible in align1 mode. All my [chadv] attempts at
505 * emitting align1 instructions for unpackHalf2x16 failed to pass the
506 * Piglit tests, so I gave up.
507 *
508 * I've verified that, on gen7 hardware and the simulator, it is safe to
509 * emit f16to32 in align16 mode with UD as source data type.
510 */
511
512 dst_reg tmp_dst(this, glsl_type::uvec2_type);
513 src_reg tmp_src(tmp_dst);
514
515 tmp_dst.writemask = WRITEMASK_X;
516 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
517
518 tmp_dst.writemask = WRITEMASK_Y;
519 emit(SHR(tmp_dst, src0, src_reg(16u)));
520
521 dst.writemask = WRITEMASK_XY;
522 emit(F16TO32(dst, tmp_src));
523 }
524
525 void
526 vec4_visitor::visit_instructions(const exec_list *list)
527 {
528 foreach_list(node, list) {
529 ir_instruction *ir = (ir_instruction *)node;
530
531 base_ir = ir;
532 ir->accept(this);
533 }
534 }
535
536
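/* Returns the size of a type in vec4 slots: one per scalar/vector or per
 * matrix column, the summed sizes of the members for arrays and structs,
 * one slot for a sampler, and zero for atomic counters.
 */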
537 static int
538 type_size(const struct glsl_type *type)
539 {
540 unsigned int i;
541 int size;
542
543 switch (type->base_type) {
544 case GLSL_TYPE_UINT:
545 case GLSL_TYPE_INT:
546 case GLSL_TYPE_FLOAT:
547 case GLSL_TYPE_BOOL:
548 if (type->is_matrix()) {
549 return type->matrix_columns;
550 } else {
551 	 /* Regardless of the size of the vector, it gets a vec4. This is bad
552 * packing for things like floats, but otherwise arrays become a
553 * mess. Hopefully a later pass over the code can pack scalars
554 * down if appropriate.
555 */
556 return 1;
557 }
558 case GLSL_TYPE_ARRAY:
559 assert(type->length > 0);
560 return type_size(type->fields.array) * type->length;
561 case GLSL_TYPE_STRUCT:
562 size = 0;
563 for (i = 0; i < type->length; i++) {
564 size += type_size(type->fields.structure[i].type);
565 }
566 return size;
567 case GLSL_TYPE_SAMPLER:
568 /* Samplers take up one slot in UNIFORMS[], but they're baked in
569 * at link time.
570 */
571 return 1;
572 case GLSL_TYPE_ATOMIC_UINT:
573 return 0;
574 case GLSL_TYPE_IMAGE:
575 case GLSL_TYPE_VOID:
576 case GLSL_TYPE_ERROR:
577 case GLSL_TYPE_INTERFACE:
578 assert(0);
579 break;
580 }
581
582 return 0;
583 }
584
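/* Allocate `size` consecutive vec4 slots as a new virtual GRF, growing the
 * size/offset tracking arrays geometrically as needed, and return the new
 * virtual register number.
 */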
585 int
586 vec4_visitor::virtual_grf_alloc(int size)
587 {
588 if (virtual_grf_array_size <= virtual_grf_count) {
589 if (virtual_grf_array_size == 0)
590 virtual_grf_array_size = 16;
591 else
592 virtual_grf_array_size *= 2;
593 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
594 virtual_grf_array_size);
595 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
596 virtual_grf_array_size);
597 }
598 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
599 virtual_grf_reg_count += size;
600 virtual_grf_sizes[virtual_grf_count] = size;
601 return virtual_grf_count++;
602 }
603
604 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
605 {
606 init();
607
608 this->file = GRF;
609 this->reg = v->virtual_grf_alloc(type_size(type));
610
611 if (type->is_array() || type->is_record()) {
612 this->swizzle = BRW_SWIZZLE_NOOP;
613 } else {
614 this->swizzle = swizzle_for_size(type->vector_elements);
615 }
616
617 this->type = brw_type_for_base_type(type);
618 }
619
620 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
621 {
622 init();
623
624 this->file = GRF;
625 this->reg = v->virtual_grf_alloc(type_size(type));
626
627 if (type->is_array() || type->is_record()) {
628 this->writemask = WRITEMASK_XYZW;
629 } else {
630 this->writemask = (1 << type->vector_elements) - 1;
631 }
632
633 this->type = brw_type_for_base_type(type);
634 }
635
636 /* Our support for uniforms is piggy-backed on the struct
637 * gl_fragment_program, because that's where the values actually
638 * get stored, rather than in some global gl_shader_program uniform
639 * store.
640 */
641 void
642 vec4_visitor::setup_uniform_values(ir_variable *ir)
643 {
644 int namelen = strlen(ir->name);
645
646 /* The data for our (non-builtin) uniforms is stored in a series of
647 * gl_uniform_driver_storage structs for each subcomponent that
648 * glGetUniformLocation() could name. We know it's been set up in the same
649 * order we'd walk the type, so walk the list of storage and find anything
650 * with our name, or the prefix of a component that starts with our name.
651 */
652 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
653 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
654
655 if (strncmp(ir->name, storage->name, namelen) != 0 ||
656 (storage->name[namelen] != 0 &&
657 storage->name[namelen] != '.' &&
658 storage->name[namelen] != '[')) {
659 continue;
660 }
661
662 gl_constant_value *components = storage->storage;
663 unsigned vector_count = (MAX2(storage->array_elements, 1) *
664 storage->type->matrix_columns);
665
666 for (unsigned s = 0; s < vector_count; s++) {
667 assert(uniforms < uniform_array_size);
668 uniform_vector_size[uniforms] = storage->type->vector_elements;
669
670 int i;
671 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
672 stage_prog_data->param[uniforms * 4 + i] = &components->f;
673 components++;
674 }
675 for (; i < 4; i++) {
676 static float zero = 0;
677 stage_prog_data->param[uniforms * 4 + i] = &zero;
678 }
679
680 uniforms++;
681 }
682 }
683 }
684
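/* Upload each user clip plane named in the compile key as its own vec4
 * uniform, recording the uniform register in this->userplane[] so later
 * code can reference the plane values.
 */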
685 void
686 vec4_visitor::setup_uniform_clipplane_values()
687 {
688 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
689
690 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
691 assert(this->uniforms < uniform_array_size);
692 this->uniform_vector_size[this->uniforms] = 4;
693 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
694 this->userplane[i].type = BRW_REGISTER_TYPE_F;
695 for (int j = 0; j < 4; ++j) {
696 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
697 }
698 ++this->uniforms;
699 }
700 }
701
702 /* Our support for builtin uniforms is even scarier than non-builtin.
703 * It sits on top of the PROG_STATE_VAR parameters that are
704 * automatically updated from GL context state.
705 */
706 void
707 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
708 {
709 const ir_state_slot *const slots = ir->state_slots;
710 assert(ir->state_slots != NULL);
711
712 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
713 /* This state reference has already been setup by ir_to_mesa,
714 * but we'll get the same index back here. We can reference
715 * ParameterValues directly, since unlike brw_fs.cpp, we never
716 * add new state references during compile.
717 */
718 int index = _mesa_add_state_reference(this->prog->Parameters,
719 (gl_state_index *)slots[i].tokens);
720 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
721
722 assert(this->uniforms < uniform_array_size);
723 this->uniform_vector_size[this->uniforms] = 0;
724 /* Add each of the unique swizzled channels of the element.
725 * This will end up matching the size of the glsl_type of this field.
726 */
727 int last_swiz = -1;
728 for (unsigned int j = 0; j < 4; j++) {
729 int swiz = GET_SWZ(slots[i].swizzle, j);
730 last_swiz = swiz;
731
732 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
733 assert(this->uniforms < uniform_array_size);
734 if (swiz <= last_swiz)
735 this->uniform_vector_size[this->uniforms]++;
736 }
737 this->uniforms++;
738 }
739 }
740
741 dst_reg *
742 vec4_visitor::variable_storage(ir_variable *var)
743 {
744 return (dst_reg *)hash_table_find(this->variable_ht, var);
745 }
746
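/* Evaluate a boolean rvalue into the flag register and report the predicate
 * the caller should use: ALIGN16_ALL4H / ALIGN16_ANY4H for the vector
 * comparisons (all_equal, any_nequal, any), BRW_PREDICATE_NORMAL otherwise.
 */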
747 void
748 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
749 {
750 ir_expression *expr = ir->as_expression();
751
752 *predicate = BRW_PREDICATE_NORMAL;
753
754 if (expr) {
755 src_reg op[2];
756 vec4_instruction *inst;
757
758 assert(expr->get_num_operands() <= 2);
759 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
760 expr->operands[i]->accept(this);
761 op[i] = this->result;
762
763 resolve_ud_negate(&op[i]);
764 }
765
766 switch (expr->operation) {
767 case ir_unop_logic_not:
768 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
769 inst->conditional_mod = BRW_CONDITIONAL_Z;
770 break;
771
772 case ir_binop_logic_xor:
773 inst = emit(XOR(dst_null_d(), op[0], op[1]));
774 inst->conditional_mod = BRW_CONDITIONAL_NZ;
775 break;
776
777 case ir_binop_logic_or:
778 inst = emit(OR(dst_null_d(), op[0], op[1]));
779 inst->conditional_mod = BRW_CONDITIONAL_NZ;
780 break;
781
782 case ir_binop_logic_and:
783 inst = emit(AND(dst_null_d(), op[0], op[1]));
784 inst->conditional_mod = BRW_CONDITIONAL_NZ;
785 break;
786
787 case ir_unop_f2b:
788 if (brw->gen >= 6) {
789 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
790 } else {
791 inst = emit(MOV(dst_null_f(), op[0]));
792 inst->conditional_mod = BRW_CONDITIONAL_NZ;
793 }
794 break;
795
796 case ir_unop_i2b:
797 if (brw->gen >= 6) {
798 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
799 } else {
800 inst = emit(MOV(dst_null_d(), op[0]));
801 inst->conditional_mod = BRW_CONDITIONAL_NZ;
802 }
803 break;
804
805 case ir_binop_all_equal:
806 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
807 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
808 break;
809
810 case ir_binop_any_nequal:
811 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
812 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
813 break;
814
815 case ir_unop_any:
816 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
817 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
818 break;
819
820 case ir_binop_greater:
821 case ir_binop_gequal:
822 case ir_binop_less:
823 case ir_binop_lequal:
824 case ir_binop_equal:
825 case ir_binop_nequal:
826 emit(CMP(dst_null_d(), op[0], op[1],
827 brw_conditional_for_comparison(expr->operation)));
828 break;
829
830 default:
831 assert(!"not reached");
832 break;
833 }
834 return;
835 }
836
837 ir->accept(this);
838
839 resolve_ud_negate(&this->result);
840
841 if (brw->gen >= 6) {
842 vec4_instruction *inst = emit(AND(dst_null_d(),
843 this->result, src_reg(1)));
844 inst->conditional_mod = BRW_CONDITIONAL_NZ;
845 } else {
846 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
847 inst->conditional_mod = BRW_CONDITIONAL_NZ;
848 }
849 }
850
851 /**
852 * Emit a gen6 IF statement with the comparison folded into the IF
853 * instruction.
854 */
855 void
856 vec4_visitor::emit_if_gen6(ir_if *ir)
857 {
858 ir_expression *expr = ir->condition->as_expression();
859
860 if (expr) {
861 src_reg op[2];
862 dst_reg temp;
863
864 assert(expr->get_num_operands() <= 2);
865 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
866 expr->operands[i]->accept(this);
867 op[i] = this->result;
868 }
869
870 switch (expr->operation) {
871 case ir_unop_logic_not:
872 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
873 return;
874
875 case ir_binop_logic_xor:
876 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
877 return;
878
879 case ir_binop_logic_or:
880 temp = dst_reg(this, glsl_type::bool_type);
881 emit(OR(temp, op[0], op[1]));
882 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
883 return;
884
885 case ir_binop_logic_and:
886 temp = dst_reg(this, glsl_type::bool_type);
887 emit(AND(temp, op[0], op[1]));
888 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
889 return;
890
891 case ir_unop_f2b:
892 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
893 return;
894
895 case ir_unop_i2b:
896 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
897 return;
898
899 case ir_binop_greater:
900 case ir_binop_gequal:
901 case ir_binop_less:
902 case ir_binop_lequal:
903 case ir_binop_equal:
904 case ir_binop_nequal:
905 emit(IF(op[0], op[1],
906 brw_conditional_for_comparison(expr->operation)));
907 return;
908
909 case ir_binop_all_equal:
910 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
911 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
912 return;
913
914 case ir_binop_any_nequal:
915 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
916 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
917 return;
918
919 case ir_unop_any:
920 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
921 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
922 return;
923
924 default:
925 assert(!"not reached");
926 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
927 return;
928 }
929 return;
930 }
931
932 ir->condition->accept(this);
933
934 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
935 }
936
937 void
938 vec4_visitor::visit(ir_variable *ir)
939 {
940 dst_reg *reg = NULL;
941
942 if (variable_storage(ir))
943 return;
944
945 switch (ir->data.mode) {
946 case ir_var_shader_in:
947 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
948 break;
949
950 case ir_var_shader_out:
951 reg = new(mem_ctx) dst_reg(this, ir->type);
952
953 for (int i = 0; i < type_size(ir->type); i++) {
954 output_reg[ir->data.location + i] = *reg;
955 output_reg[ir->data.location + i].reg_offset = i;
956 output_reg[ir->data.location + i].type =
957 brw_type_for_base_type(ir->type->get_scalar_type());
958 output_reg_annotation[ir->data.location + i] = ir->name;
959 }
960 break;
961
962 case ir_var_auto:
963 case ir_var_temporary:
964 reg = new(mem_ctx) dst_reg(this, ir->type);
965 break;
966
967 case ir_var_uniform:
968 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
969
970 /* Thanks to the lower_ubo_reference pass, we will see only
971 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
972 * variables, so no need for them to be in variable_ht.
973 *
974 * Atomic counters take no uniform storage, no need to do
975 * anything here.
976 */
977 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
978 return;
979
980 /* Track how big the whole uniform variable is, in case we need to put a
981 * copy of its data into pull constants for array access.
982 */
983 assert(this->uniforms < uniform_array_size);
984 this->uniform_size[this->uniforms] = type_size(ir->type);
985
986 if (!strncmp(ir->name, "gl_", 3)) {
987 setup_builtin_uniform_values(ir);
988 } else {
989 setup_uniform_values(ir);
990 }
991 break;
992
993 case ir_var_system_value:
994 reg = make_reg_for_system_value(ir);
995 break;
996
997 default:
998 assert(!"not reached");
999 }
1000
1001 reg->type = brw_type_for_base_type(ir->type);
1002 hash_table_insert(this->variable_ht, reg, ir);
1003 }
1004
1005 void
1006 vec4_visitor::visit(ir_loop *ir)
1007 {
1008 /* We don't want debugging output to print the whole body of the
1009 * loop as the annotation.
1010 */
1011 this->base_ir = NULL;
1012
1013 emit(BRW_OPCODE_DO);
1014
1015 visit_instructions(&ir->body_instructions);
1016
1017 emit(BRW_OPCODE_WHILE);
1018 }
1019
1020 void
1021 vec4_visitor::visit(ir_loop_jump *ir)
1022 {
1023 switch (ir->mode) {
1024 case ir_loop_jump::jump_break:
1025 emit(BRW_OPCODE_BREAK);
1026 break;
1027 case ir_loop_jump::jump_continue:
1028 emit(BRW_OPCODE_CONTINUE);
1029 break;
1030 }
1031 }
1032
1033
1034 void
1035 vec4_visitor::visit(ir_function_signature *ir)
1036 {
1037 assert(0);
1038 (void)ir;
1039 }
1040
1041 void
1042 vec4_visitor::visit(ir_function *ir)
1043 {
1044 /* Ignore function bodies other than main() -- we shouldn't see calls to
1045 * them since they should all be inlined.
1046 */
1047 if (strcmp(ir->name, "main") == 0) {
1048 const ir_function_signature *sig;
1049 exec_list empty;
1050
1051 sig = ir->matching_signature(NULL, &empty);
1052
1053 assert(sig);
1054
1055 visit_instructions(&sig->body);
1056 }
1057 }
1058
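/* If this expression is one whose result is about to be saturated, emit the
 * value being saturated and fold the clamp into the saturate bit of the
 * final MOV.  Returns false if there is nothing to saturate here.
 */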
1059 bool
1060 vec4_visitor::try_emit_sat(ir_expression *ir)
1061 {
1062 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1063 if (!sat_src)
1064 return false;
1065
1066 sat_src->accept(this);
1067 src_reg src = this->result;
1068
1069 this->result = src_reg(this, ir->type);
1070 vec4_instruction *inst;
1071 inst = emit(MOV(dst_reg(this->result), src));
1072 inst->saturate = true;
1073
1074 return true;
1075 }
1076
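/* Try to fuse an add whose operand `mul_arg` is a multiply into a single
 * MAD.  Only possible on gen6+ and only for float types; returns false if
 * the pattern does not match.
 */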
1077 bool
1078 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1079 {
1080 /* 3-src instructions were introduced in gen6. */
1081 if (brw->gen < 6)
1082 return false;
1083
1084 /* MAD can only handle floating-point data. */
1085 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1086 return false;
1087
1088 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1089 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1090
1091 if (!mul || mul->operation != ir_binop_mul)
1092 return false;
1093
1094 nonmul->accept(this);
1095 src_reg src0 = fix_3src_operand(this->result);
1096
1097 mul->operands[0]->accept(this);
1098 src_reg src1 = fix_3src_operand(this->result);
1099
1100 mul->operands[1]->accept(this);
1101 src_reg src2 = fix_3src_operand(this->result);
1102
1103 this->result = src_reg(this, ir->type);
1104 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1105
1106 return true;
1107 }
1108
1109 void
1110 vec4_visitor::emit_bool_comparison(unsigned int op,
1111 dst_reg dst, src_reg src0, src_reg src1)
1112 {
1113 /* original gen4 does destination conversion before comparison. */
1114 if (brw->gen < 5)
1115 dst.type = src0.type;
1116
1117 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1118
1119 dst.type = BRW_REGISTER_TYPE_D;
1120 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1121 }
1122
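/* min/max: on gen6+ a single SEL with the given conditional mod suffices;
 * earlier generations need an explicit CMP followed by a predicated SEL.
 */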
1123 void
1124 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1125 src_reg src0, src_reg src1)
1126 {
1127 vec4_instruction *inst;
1128
1129 if (brw->gen >= 6) {
1130 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1131 inst->conditional_mod = conditionalmod;
1132 } else {
1133 emit(CMP(dst, src0, src1, conditionalmod));
1134
1135 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1136 inst->predicate = BRW_PREDICATE_NORMAL;
1137 }
1138 }
1139
1140 void
1141 vec4_visitor::emit_lrp(const dst_reg &dst,
1142 const src_reg &x, const src_reg &y, const src_reg &a)
1143 {
1144 if (brw->gen >= 6) {
1145 /* Note that the instruction's argument order is reversed from GLSL
1146 * and the IR.
1147 */
1148 emit(LRP(dst,
1149 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1150 } else {
1151 /* Earlier generations don't support three source operations, so we
1152 * need to emit x*(1-a) + y*a.
1153 *
1154 * A better way to do this would be:
1155 * ADD one_minus_a, negate(a), 1.0f
1156 * MUL null, y, a
1157 * MAC dst, x, one_minus_a
1158 * but we would need to support MAC and implicit accumulator.
1159 */
1160 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1161 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1162 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1163 y_times_a.writemask = dst.writemask;
1164 one_minus_a.writemask = dst.writemask;
1165 x_times_one_minus_a.writemask = dst.writemask;
1166
1167 emit(MUL(y_times_a, y, a));
1168 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1169 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1170 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1171 }
1172 }
1173
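/* Returns true if the rvalue is an int/uint constant small enough to fit in
 * 16 bits, which lets integer multiplies by it be emitted as a single MUL
 * instead of the MUL/MACH/MOV sequence (see ir_binop_mul below).
 */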
1174 static bool
1175 is_16bit_constant(ir_rvalue *rvalue)
1176 {
1177 ir_constant *constant = rvalue->as_constant();
1178 if (!constant)
1179 return false;
1180
1181 if (constant->type != glsl_type::int_type &&
1182 constant->type != glsl_type::uint_type)
1183 return false;
1184
1185 return constant->value.u[0] < (1 << 16);
1186 }
1187
1188 void
1189 vec4_visitor::visit(ir_expression *ir)
1190 {
1191 unsigned int operand;
1192 src_reg op[Elements(ir->operands)];
1193 src_reg result_src;
1194 dst_reg result_dst;
1195 vec4_instruction *inst;
1196
1197 if (try_emit_sat(ir))
1198 return;
1199
1200 if (ir->operation == ir_binop_add) {
1201 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1202 return;
1203 }
1204
1205 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1206 this->result.file = BAD_FILE;
1207 ir->operands[operand]->accept(this);
1208 if (this->result.file == BAD_FILE) {
1209 fprintf(stderr, "Failed to get tree for expression operand:\n");
1210 ir->operands[operand]->fprint(stderr);
1211 exit(1);
1212 }
1213 op[operand] = this->result;
1214
1215 /* Matrix expression operands should have been broken down to vector
1216 * operations already.
1217 */
1218 assert(!ir->operands[operand]->type->is_matrix());
1219 }
1220
1221 int vector_elements = ir->operands[0]->type->vector_elements;
1222 if (ir->operands[1]) {
1223 vector_elements = MAX2(vector_elements,
1224 ir->operands[1]->type->vector_elements);
1225 }
1226
1227 this->result.file = BAD_FILE;
1228
1229 /* Storage for our result. Ideally for an assignment we'd be using
1230 * the actual storage for the result here, instead.
1231 */
1232 result_src = src_reg(this, ir->type);
1233 /* convenience for the emit functions below. */
1234 result_dst = dst_reg(result_src);
1235 /* If nothing special happens, this is the result. */
1236 this->result = result_src;
1237 /* Limit writes to the channels that will be used by result_src later.
1238 * This does limit this temp's use as a temporary for multi-instruction
1239 * sequences.
1240 */
1241 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1242
1243 switch (ir->operation) {
1244 case ir_unop_logic_not:
1245 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1246 * ones complement of the whole register, not just bit 0.
1247 */
1248 emit(XOR(result_dst, op[0], src_reg(1)));
1249 break;
1250 case ir_unop_neg:
1251 op[0].negate = !op[0].negate;
1252 emit(MOV(result_dst, op[0]));
1253 break;
1254 case ir_unop_abs:
1255 op[0].abs = true;
1256 op[0].negate = false;
1257 emit(MOV(result_dst, op[0]));
1258 break;
1259
1260 case ir_unop_sign:
1261 if (ir->type->is_float()) {
1262 /* AND(val, 0x80000000) gives the sign bit.
1263 *
1264 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1265 * zero.
1266 */
1267 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1268
1269 op[0].type = BRW_REGISTER_TYPE_UD;
1270 result_dst.type = BRW_REGISTER_TYPE_UD;
1271 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1272
1273 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1274 inst->predicate = BRW_PREDICATE_NORMAL;
1275
1276 this->result.type = BRW_REGISTER_TYPE_F;
1277 } else {
1278 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1279 * -> non-negative val generates 0x00000000.
1280 * Predicated OR sets 1 if val is positive.
1281 */
1282 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1283
1284 emit(ASR(result_dst, op[0], src_reg(31)));
1285
1286 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1287 inst->predicate = BRW_PREDICATE_NORMAL;
1288 }
1289 break;
1290
1291 case ir_unop_rcp:
1292 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1293 break;
1294
1295 case ir_unop_exp2:
1296 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1297 break;
1298 case ir_unop_log2:
1299 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1300 break;
1301 case ir_unop_exp:
1302 case ir_unop_log:
1303 assert(!"not reached: should be handled by ir_explog_to_explog2");
1304 break;
1305 case ir_unop_sin:
1306 case ir_unop_sin_reduced:
1307 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1308 break;
1309 case ir_unop_cos:
1310 case ir_unop_cos_reduced:
1311 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1312 break;
1313
1314 case ir_unop_dFdx:
1315 case ir_unop_dFdy:
1316 assert(!"derivatives not valid in vertex shader");
1317 break;
1318
1319 case ir_unop_bitfield_reverse:
1320 emit(BFREV(result_dst, op[0]));
1321 break;
1322 case ir_unop_bit_count:
1323 emit(CBIT(result_dst, op[0]));
1324 break;
1325 case ir_unop_find_msb: {
1326 src_reg temp = src_reg(this, glsl_type::uint_type);
1327
1328 inst = emit(FBH(dst_reg(temp), op[0]));
1329 inst->dst.writemask = WRITEMASK_XYZW;
1330
1331 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1332 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1333 * subtract the result from 31 to convert the MSB count into an LSB count.
1334 */
1335
1336 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1337 temp.swizzle = BRW_SWIZZLE_NOOP;
1338 emit(MOV(result_dst, temp));
1339
1340 src_reg src_tmp = src_reg(result_dst);
1341 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1342
1343 src_tmp.negate = true;
1344 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1345 inst->predicate = BRW_PREDICATE_NORMAL;
1346 break;
1347 }
1348 case ir_unop_find_lsb:
1349 emit(FBL(result_dst, op[0]));
1350 break;
1351
1352 case ir_unop_noise:
1353 assert(!"not reached: should be handled by lower_noise");
1354 break;
1355
1356 case ir_binop_add:
1357 emit(ADD(result_dst, op[0], op[1]));
1358 break;
1359 case ir_binop_sub:
1360 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1361 break;
1362
1363 case ir_binop_mul:
1364 if (brw->gen < 8 && ir->type->is_integer()) {
1365 /* For integer multiplication, the MUL uses the low 16 bits of one of
1366 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1367 * accumulates in the contribution of the upper 16 bits of that
1368 * operand. If we can determine that one of the args is in the low
1369 * 16 bits, though, we can just emit a single MUL.
1370 */
1371 if (is_16bit_constant(ir->operands[0])) {
1372 if (brw->gen < 7)
1373 emit(MUL(result_dst, op[0], op[1]));
1374 else
1375 emit(MUL(result_dst, op[1], op[0]));
1376 } else if (is_16bit_constant(ir->operands[1])) {
1377 if (brw->gen < 7)
1378 emit(MUL(result_dst, op[1], op[0]));
1379 else
1380 emit(MUL(result_dst, op[0], op[1]));
1381 } else {
1382 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1383
1384 emit(MUL(acc, op[0], op[1]));
1385 emit(MACH(dst_null_d(), op[0], op[1]));
1386 emit(MOV(result_dst, src_reg(acc)));
1387 }
1388 } else {
1389 emit(MUL(result_dst, op[0], op[1]));
1390 }
1391 break;
1392 case ir_binop_imul_high: {
1393 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1394
1395 emit(MUL(acc, op[0], op[1]));
1396 emit(MACH(result_dst, op[0], op[1]));
1397 break;
1398 }
1399 case ir_binop_div:
1400 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1401 assert(ir->type->is_integer());
1402 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1403 break;
1404 case ir_binop_carry: {
1405 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1406
1407 emit(ADDC(dst_null_ud(), op[0], op[1]));
1408 emit(MOV(result_dst, src_reg(acc)));
1409 break;
1410 }
1411 case ir_binop_borrow: {
1412 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1413
1414 emit(SUBB(dst_null_ud(), op[0], op[1]));
1415 emit(MOV(result_dst, src_reg(acc)));
1416 break;
1417 }
1418 case ir_binop_mod:
1419 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1420 assert(ir->type->is_integer());
1421 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1422 break;
1423
1424 case ir_binop_less:
1425 case ir_binop_greater:
1426 case ir_binop_lequal:
1427 case ir_binop_gequal:
1428 case ir_binop_equal:
1429 case ir_binop_nequal: {
1430 emit(CMP(result_dst, op[0], op[1],
1431 brw_conditional_for_comparison(ir->operation)));
1432 emit(AND(result_dst, result_src, src_reg(0x1)));
1433 break;
1434 }
1435
1436 case ir_binop_all_equal:
1437 /* "==" operator producing a scalar boolean. */
1438 if (ir->operands[0]->type->is_vector() ||
1439 ir->operands[1]->type->is_vector()) {
1440 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1441 emit(MOV(result_dst, src_reg(0)));
1442 inst = emit(MOV(result_dst, src_reg(1)));
1443 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1444 } else {
1445 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1446 emit(AND(result_dst, result_src, src_reg(0x1)));
1447 }
1448 break;
1449 case ir_binop_any_nequal:
1450 /* "!=" operator producing a scalar boolean. */
1451 if (ir->operands[0]->type->is_vector() ||
1452 ir->operands[1]->type->is_vector()) {
1453 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1454
1455 emit(MOV(result_dst, src_reg(0)));
1456 inst = emit(MOV(result_dst, src_reg(1)));
1457 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1458 } else {
1459 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1460 emit(AND(result_dst, result_src, src_reg(0x1)));
1461 }
1462 break;
1463
1464 case ir_unop_any:
1465 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1466 emit(MOV(result_dst, src_reg(0)));
1467
1468 inst = emit(MOV(result_dst, src_reg(1)));
1469 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1470 break;
1471
1472 case ir_binop_logic_xor:
1473 emit(XOR(result_dst, op[0], op[1]));
1474 break;
1475
1476 case ir_binop_logic_or:
1477 emit(OR(result_dst, op[0], op[1]));
1478 break;
1479
1480 case ir_binop_logic_and:
1481 emit(AND(result_dst, op[0], op[1]));
1482 break;
1483
1484 case ir_binop_dot:
1485 assert(ir->operands[0]->type->is_vector());
1486 assert(ir->operands[0]->type == ir->operands[1]->type);
1487 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1488 break;
1489
1490 case ir_unop_sqrt:
1491 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1492 break;
1493 case ir_unop_rsq:
1494 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1495 break;
1496
1497 case ir_unop_bitcast_i2f:
1498 case ir_unop_bitcast_u2f:
1499 this->result = op[0];
1500 this->result.type = BRW_REGISTER_TYPE_F;
1501 break;
1502
1503 case ir_unop_bitcast_f2i:
1504 this->result = op[0];
1505 this->result.type = BRW_REGISTER_TYPE_D;
1506 break;
1507
1508 case ir_unop_bitcast_f2u:
1509 this->result = op[0];
1510 this->result.type = BRW_REGISTER_TYPE_UD;
1511 break;
1512
1513 case ir_unop_i2f:
1514 case ir_unop_i2u:
1515 case ir_unop_u2i:
1516 case ir_unop_u2f:
1517 case ir_unop_b2f:
1518 case ir_unop_b2i:
1519 case ir_unop_f2i:
1520 case ir_unop_f2u:
1521 emit(MOV(result_dst, op[0]));
1522 break;
1523 case ir_unop_f2b:
1524 case ir_unop_i2b: {
1525 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1526 emit(AND(result_dst, result_src, src_reg(1)));
1527 break;
1528 }
1529
1530 case ir_unop_trunc:
1531 emit(RNDZ(result_dst, op[0]));
1532 break;
1533 case ir_unop_ceil:
1534 op[0].negate = !op[0].negate;
1535 inst = emit(RNDD(result_dst, op[0]));
1536 this->result.negate = true;
1537 break;
1538 case ir_unop_floor:
1539 inst = emit(RNDD(result_dst, op[0]));
1540 break;
1541 case ir_unop_fract:
1542 inst = emit(FRC(result_dst, op[0]));
1543 break;
1544 case ir_unop_round_even:
1545 emit(RNDE(result_dst, op[0]));
1546 break;
1547
1548 case ir_binop_min:
1549 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1550 break;
1551 case ir_binop_max:
1552 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1553 break;
1554
1555 case ir_binop_pow:
1556 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1557 break;
1558
1559 case ir_unop_bit_not:
1560 inst = emit(NOT(result_dst, op[0]));
1561 break;
1562 case ir_binop_bit_and:
1563 inst = emit(AND(result_dst, op[0], op[1]));
1564 break;
1565 case ir_binop_bit_xor:
1566 inst = emit(XOR(result_dst, op[0], op[1]));
1567 break;
1568 case ir_binop_bit_or:
1569 inst = emit(OR(result_dst, op[0], op[1]));
1570 break;
1571
1572 case ir_binop_lshift:
1573 inst = emit(SHL(result_dst, op[0], op[1]));
1574 break;
1575
1576 case ir_binop_rshift:
1577 if (ir->type->base_type == GLSL_TYPE_INT)
1578 inst = emit(ASR(result_dst, op[0], op[1]));
1579 else
1580 inst = emit(SHR(result_dst, op[0], op[1]));
1581 break;
1582
1583 case ir_binop_bfm:
1584 emit(BFI1(result_dst, op[0], op[1]));
1585 break;
1586
1587 case ir_binop_ubo_load: {
1588 ir_constant *uniform_block = ir->operands[0]->as_constant();
1589 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1590 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1591 src_reg offset;
1592
1593 /* Now, load the vector from that offset. */
1594 assert(ir->type->is_vector() || ir->type->is_scalar());
1595
1596 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1597 packed_consts.type = result.type;
1598 src_reg surf_index =
1599 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1600 if (const_offset_ir) {
1601 if (brw->gen >= 8) {
1602 /* Store the offset in a GRF so we can send-from-GRF. */
1603 offset = src_reg(this, glsl_type::int_type);
1604 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1605 } else {
1606 /* Immediates are fine on older generations since they'll be moved
1607 * to a (potentially fake) MRF at the generator level.
1608 */
1609 offset = src_reg(const_offset / 16);
1610 }
1611 } else {
1612 offset = src_reg(this, glsl_type::uint_type);
1613 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1614 }
1615
1616 if (brw->gen >= 7) {
1617 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1618 grf_offset.type = offset.type;
1619
1620 emit(MOV(grf_offset, offset));
1621
1622 emit(new(mem_ctx) vec4_instruction(this,
1623 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1624 dst_reg(packed_consts),
1625 surf_index,
1626 src_reg(grf_offset)));
1627 } else {
1628 vec4_instruction *pull =
1629 emit(new(mem_ctx) vec4_instruction(this,
1630 VS_OPCODE_PULL_CONSTANT_LOAD,
1631 dst_reg(packed_consts),
1632 surf_index,
1633 offset));
1634 pull->base_mrf = 14;
1635 pull->mlen = 1;
1636 }
1637
1638 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1639 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1640 const_offset % 16 / 4,
1641 const_offset % 16 / 4,
1642 const_offset % 16 / 4);
1643
1644 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1645 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1646 emit(CMP(result_dst, packed_consts, src_reg(0u),
1647 BRW_CONDITIONAL_NZ));
1648 emit(AND(result_dst, result, src_reg(0x1)));
1649 } else {
1650 emit(MOV(result_dst, packed_consts));
1651 }
1652 break;
1653 }
1654
1655 case ir_binop_vector_extract:
1656 assert(!"should have been lowered by vec_index_to_cond_assign");
1657 break;
1658
1659 case ir_triop_fma:
1660 op[0] = fix_3src_operand(op[0]);
1661 op[1] = fix_3src_operand(op[1]);
1662 op[2] = fix_3src_operand(op[2]);
1663 /* Note that the instruction's argument order is reversed from GLSL
1664 * and the IR.
1665 */
1666 emit(MAD(result_dst, op[2], op[1], op[0]));
1667 break;
1668
1669 case ir_triop_lrp:
1670 emit_lrp(result_dst, op[0], op[1], op[2]);
1671 break;
1672
1673 case ir_triop_csel:
1674 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1675 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1676 inst->predicate = BRW_PREDICATE_NORMAL;
1677 break;
1678
1679 case ir_triop_bfi:
1680 op[0] = fix_3src_operand(op[0]);
1681 op[1] = fix_3src_operand(op[1]);
1682 op[2] = fix_3src_operand(op[2]);
1683 emit(BFI2(result_dst, op[0], op[1], op[2]));
1684 break;
1685
1686 case ir_triop_bitfield_extract:
1687 op[0] = fix_3src_operand(op[0]);
1688 op[1] = fix_3src_operand(op[1]);
1689 op[2] = fix_3src_operand(op[2]);
1690 /* Note that the instruction's argument order is reversed from GLSL
1691 * and the IR.
1692 */
1693 emit(BFE(result_dst, op[2], op[1], op[0]));
1694 break;
1695
1696 case ir_triop_vector_insert:
1697 assert(!"should have been lowered by lower_vector_insert");
1698 break;
1699
1700 case ir_quadop_bitfield_insert:
1701 assert(!"not reached: should be handled by "
1702 "bitfield_insert_to_bfm_bfi\n");
1703 break;
1704
1705 case ir_quadop_vector:
1706 assert(!"not reached: should be handled by lower_quadop_vector");
1707 break;
1708
1709 case ir_unop_pack_half_2x16:
1710 emit_pack_half_2x16(result_dst, op[0]);
1711 break;
1712 case ir_unop_unpack_half_2x16:
1713 emit_unpack_half_2x16(result_dst, op[0]);
1714 break;
1715 case ir_unop_pack_snorm_2x16:
1716 case ir_unop_pack_snorm_4x8:
1717 case ir_unop_pack_unorm_2x16:
1718 case ir_unop_pack_unorm_4x8:
1719 case ir_unop_unpack_snorm_2x16:
1720 case ir_unop_unpack_snorm_4x8:
1721 case ir_unop_unpack_unorm_2x16:
1722 case ir_unop_unpack_unorm_4x8:
1723 assert(!"not reached: should be handled by lower_packing_builtins");
1724 break;
1725 case ir_unop_unpack_half_2x16_split_x:
1726 case ir_unop_unpack_half_2x16_split_y:
1727 case ir_binop_pack_half_2x16_split:
1728 assert(!"not reached: should not occur in vertex shader");
1729 break;
1730 case ir_binop_ldexp:
1731 assert(!"not reached: should be handled by ldexp_to_arith()");
1732 break;
1733 }
1734 }
1735
1736
1737 void
1738 vec4_visitor::visit(ir_swizzle *ir)
1739 {
1740 src_reg src;
1741 int i = 0;
1742 int swizzle[4];
1743
1744 /* Note that this is only swizzles in expressions, not those on the left
1745 * hand side of an assignment, which do write masking. See ir_assignment
1746 * for that.
1747 */
1748
1749 ir->val->accept(this);
1750 src = this->result;
1751 assert(src.file != BAD_FILE);
1752
1753 for (i = 0; i < ir->type->vector_elements; i++) {
1754 switch (i) {
1755 case 0:
1756 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1757 break;
1758 case 1:
1759 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1760 break;
1761 case 2:
1762 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1763 break;
1764 case 3:
1765 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1766 break;
1767 }
1768 }
1769 for (; i < 4; i++) {
1770 /* Replicate the last channel out. */
1771 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1772 }
1773
1774 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1775
1776 this->result = src;
1777 }
1778
1779 void
1780 vec4_visitor::visit(ir_dereference_variable *ir)
1781 {
1782 const struct glsl_type *type = ir->type;
1783 dst_reg *reg = variable_storage(ir->var);
1784
1785 if (!reg) {
1786 fail("Failed to find variable storage for %s\n", ir->var->name);
1787 this->result = src_reg(brw_null_reg());
1788 return;
1789 }
1790
1791 this->result = src_reg(*reg);
1792
1793 /* System values get their swizzle from the dst_reg writemask */
1794 if (ir->var->data.mode == ir_var_system_value)
1795 return;
1796
1797 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1798 this->result.swizzle = swizzle_for_size(type->vector_elements);
1799 }
1800
1801
1802 int
1803 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1804 {
1805 /* Under normal circumstances array elements are stored consecutively, so
1806 * the stride is equal to the size of the array element.
1807 */
1808 return type_size(ir->type);
1809 }
1810
1811
1812 void
1813 vec4_visitor::visit(ir_dereference_array *ir)
1814 {
1815 ir_constant *constant_index;
1816 src_reg src;
1817 int array_stride = compute_array_stride(ir);
1818
1819 constant_index = ir->array_index->constant_expression_value();
1820
1821 ir->array->accept(this);
1822 src = this->result;
1823
1824 if (constant_index) {
1825 src.reg_offset += constant_index->value.i[0] * array_stride;
1826 } else {
1827 /* Variable index array dereference. It eats the "vec4" of the
1828 * base of the array and an index that offsets the Mesa register
1829 * index.
1830 */
1831 ir->array_index->accept(this);
1832
1833 src_reg index_reg;
1834
1835 if (array_stride == 1) {
1836 index_reg = this->result;
1837 } else {
1838 index_reg = src_reg(this, glsl_type::int_type);
1839
1840 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1841 }
1842
1843 if (src.reladdr) {
1844 src_reg temp = src_reg(this, glsl_type::int_type);
1845
1846 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1847
1848 index_reg = temp;
1849 }
1850
1851 src.reladdr = ralloc(mem_ctx, src_reg);
1852 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1853 }
1854
1855 /* If the type is smaller than a vec4, replicate the last channel out. */
1856 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1857 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1858 else
1859 src.swizzle = BRW_SWIZZLE_NOOP;
1860 src.type = brw_type_for_base_type(ir->type);
1861
1862 this->result = src;
1863 }
1864
1865 void
1866 vec4_visitor::visit(ir_dereference_record *ir)
1867 {
1868 unsigned int i;
1869 const glsl_type *struct_type = ir->record->type;
1870 int offset = 0;
1871
1872 ir->record->accept(this);
1873
1874 for (i = 0; i < struct_type->length; i++) {
1875 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1876 break;
1877 offset += type_size(struct_type->fields.structure[i].type);
1878 }
1879
1880 /* If the type is smaller than a vec4, replicate the last channel out. */
1881 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1882 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1883 else
1884 this->result.swizzle = BRW_SWIZZLE_NOOP;
1885 this->result.type = brw_type_for_base_type(ir->type);
1886
1887 this->result.reg_offset += offset;
1888 }
1889
1890 /**
1891 * We want to be careful in assignment setup to hit the actual storage
1892 * instead of potentially using a temporary like we might with the
1893 * ir_dereference handler.
1894 */
1895 static dst_reg
1896 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1897 {
1898 /* The LHS must be a dereference. If the LHS is a variable indexed array
1899     * access of a vector, it must be separated into a series of conditional moves
1900 * before reaching this point (see ir_vec_index_to_cond_assign).
1901 */
1902 assert(ir->as_dereference());
1903 ir_dereference_array *deref_array = ir->as_dereference_array();
1904 if (deref_array) {
1905 assert(!deref_array->array->type->is_vector());
1906 }
1907
1908 /* Use the rvalue deref handler for the most part. We'll ignore
1909 * swizzles in it and write swizzles using writemask, though.
1910 */
1911 ir->accept(v);
1912 return dst_reg(v->result);
1913 }
1914
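/* Copy an aggregate (struct, array or matrix) one vec4 at a time, recursing
 * through the member types and advancing reg_offset on both registers after
 * each MOV.  The emitted MOVs inherit the caller's predicate.
 */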
1915 void
1916 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1917 const struct glsl_type *type, uint32_t predicate)
1918 {
1919 if (type->base_type == GLSL_TYPE_STRUCT) {
1920 for (unsigned int i = 0; i < type->length; i++) {
1921 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1922 }
1923 return;
1924 }
1925
1926 if (type->is_array()) {
1927 for (unsigned int i = 0; i < type->length; i++) {
1928 emit_block_move(dst, src, type->fields.array, predicate);
1929 }
1930 return;
1931 }
1932
1933 if (type->is_matrix()) {
1934 const struct glsl_type *vec_type;
1935
1936 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1937 type->vector_elements, 1);
1938
1939 for (int i = 0; i < type->matrix_columns; i++) {
1940 emit_block_move(dst, src, vec_type, predicate);
1941 }
1942 return;
1943 }
1944
1945 assert(type->is_scalar() || type->is_vector());
1946
1947 dst->type = brw_type_for_base_type(type);
1948 src->type = dst->type;
1949
1950 dst->writemask = (1 << type->vector_elements) - 1;
1951
1952 src->swizzle = swizzle_for_size(type->vector_elements);
1953
1954 vec4_instruction *inst = emit(MOV(*dst, *src));
1955 inst->predicate = predicate;
1956
1957 dst->reg_offset++;
1958 src->reg_offset++;
1959 }
1960
1961
1962 /* If the RHS processing resulted in an instruction generating a
1963 * temporary value, and it would be easy to rewrite the instruction to
1964 * generate its result right into the LHS instead, do so. This ends
1965 * up reliably removing instructions where it can be tricky to do so
1966 * later without real UD chain information.
1967 */
1968 bool
1969 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1970 dst_reg dst,
1971 src_reg src,
1972 vec4_instruction *pre_rhs_inst,
1973 vec4_instruction *last_rhs_inst)
1974 {
1975 /* This could be supported, but it would take more smarts. */
1976 if (ir->condition)
1977 return false;
1978
1979 if (pre_rhs_inst == last_rhs_inst)
1980 return false; /* No instructions generated to work with. */
1981
1982 /* Make sure the last instruction generated our source reg. */
1983 if (src.file != GRF ||
1984 src.file != last_rhs_inst->dst.file ||
1985 src.reg != last_rhs_inst->dst.reg ||
1986 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1987 src.reladdr ||
1988 src.abs ||
1989 src.negate ||
1990 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1991 return false;
1992
1993 /* Check that the last instruction fully initialized the channels
1994 * we want to use, in the order we want to use them. We could
1995 * potentially reswizzle the operands of many instructions so that
1996 * we could handle out of order channels, but don't yet.
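*
* For example, an RHS result that reaches us with a .yxzw swizzle cannot be
* folded into the destination, because the written channels would need
* reswizzling first.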
1997 */
1998
1999 for (unsigned i = 0; i < 4; i++) {
2000 if (dst.writemask & (1 << i)) {
2001 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2002 return false;
2003
2004 if (BRW_GET_SWZ(src.swizzle, i) != i)
2005 return false;
2006 }
2007 }
2008
2009 /* Success! Rewrite the instruction. */
2010 last_rhs_inst->dst.file = dst.file;
2011 last_rhs_inst->dst.reg = dst.reg;
2012 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2013 last_rhs_inst->dst.reladdr = dst.reladdr;
2014 last_rhs_inst->dst.writemask &= dst.writemask;
2015
2016 return true;
2017 }
2018
2019 void
2020 vec4_visitor::visit(ir_assignment *ir)
2021 {
2022 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2023 uint32_t predicate = BRW_PREDICATE_NONE;
2024
2025 if (!ir->lhs->type->is_scalar() &&
2026 !ir->lhs->type->is_vector()) {
2027 ir->rhs->accept(this);
2028 src_reg src = this->result;
2029
2030 if (ir->condition) {
2031 emit_bool_to_cond_code(ir->condition, &predicate);
2032 }
2033
2034 /* emit_block_move doesn't account for swizzles in the source register.
2035 * This should be ok, since the source register is a structure or an
2036 * array, and those can't be swizzled. But double-check to be sure.
2037 */
2038 assert(src.swizzle ==
2039 (ir->rhs->type->is_matrix()
2040 ? swizzle_for_size(ir->rhs->type->vector_elements)
2041 : BRW_SWIZZLE_NOOP));
2042
2043 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2044 return;
2045 }
2046
2047 /* Now we're down to just a scalar/vector with writemasks. */
2048 int i;
2049
2050 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2051 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2052
2053 ir->rhs->accept(this);
2054
2055 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2056
2057 src_reg src = this->result;
2058
2059 int swizzles[4];
2060 int first_enabled_chan = 0;
2061 int src_chan = 0;
2062
2063 assert(ir->lhs->type->is_vector() ||
2064 ir->lhs->type->is_scalar());
2065 dst.writemask = ir->write_mask;
2066
2067 for (int i = 0; i < 4; i++) {
2068 if (dst.writemask & (1 << i)) {
2069 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2070 break;
2071 }
2072 }
2073
2074 /* Swizzle a small RHS vector into the channels being written.
2075 *
2076 * GLSL IR treats write_mask as dictating how many channels are
2077 * present on the RHS, while in our instructions we need to make
2078 * those channels appear in the slots of the vec4 they're written to.
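*
* For example, with write_mask = ZW the RHS arrives swizzled as .xyyy; the
* loops above and below build the swizzle .yyxy, so the MOV places
* rhs.x in .z and rhs.y in .w.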
2079 */
2080 for (int i = 0; i < 4; i++) {
2081 if (dst.writemask & (1 << i))
2082 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2083 else
2084 swizzles[i] = first_enabled_chan;
2085 }
2086 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2087 swizzles[2], swizzles[3]);
2088
2089 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2090 return;
2091 }
2092
2093 if (ir->condition) {
2094 emit_bool_to_cond_code(ir->condition, &predicate);
2095 }
2096
2097 for (i = 0; i < type_size(ir->lhs->type); i++) {
2098 vec4_instruction *inst = emit(MOV(dst, src));
2099 inst->predicate = predicate;
2100
2101 dst.reg_offset++;
2102 src.reg_offset++;
2103 }
2104 }
2105
2106 void
2107 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2108 {
2109 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2110 foreach_list(node, &ir->components) {
2111 ir_constant *field_value = (ir_constant *)node;
2112
2113 emit_constant_values(dst, field_value);
2114 }
2115 return;
2116 }
2117
2118 if (ir->type->is_array()) {
2119 for (unsigned int i = 0; i < ir->type->length; i++) {
2120 emit_constant_values(dst, ir->array_elements[i]);
2121 }
2122 return;
2123 }
2124
2125 if (ir->type->is_matrix()) {
2126 for (int i = 0; i < ir->type->matrix_columns; i++) {
2127 float *vec = &ir->value.f[i * ir->type->vector_elements];
2128
2129 for (int j = 0; j < ir->type->vector_elements; j++) {
2130 dst->writemask = 1 << j;
2131 dst->type = BRW_REGISTER_TYPE_F;
2132
2133 emit(MOV(*dst, src_reg(vec[j])));
2134 }
2135 dst->reg_offset++;
2136 }
2137 return;
2138 }
2139
2140 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2141
2142 for (int i = 0; i < ir->type->vector_elements; i++) {
2143 if (!(remaining_writemask & (1 << i)))
2144 continue;
2145
2146 dst->writemask = 1 << i;
2147 dst->type = brw_type_for_base_type(ir->type);
2148
2149 /* Find other components that match the one we're about to
2150 * write. Emits fewer instructions for things like vec4(0.5,
2151 * 1.5, 1.5, 1.5).
2152 */
2153 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2154 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2155 if (ir->value.b[i] == ir->value.b[j])
2156 dst->writemask |= (1 << j);
2157 } else {
2158 /* u, i, and f storage all line up, so no need for a
2159 * switch case for comparing each type.
2160 */
2161 if (ir->value.u[i] == ir->value.u[j])
2162 dst->writemask |= (1 << j);
2163 }
2164 }
2165
2166 switch (ir->type->base_type) {
2167 case GLSL_TYPE_FLOAT:
2168 emit(MOV(*dst, src_reg(ir->value.f[i])));
2169 break;
2170 case GLSL_TYPE_INT:
2171 emit(MOV(*dst, src_reg(ir->value.i[i])));
2172 break;
2173 case GLSL_TYPE_UINT:
2174 emit(MOV(*dst, src_reg(ir->value.u[i])));
2175 break;
2176 case GLSL_TYPE_BOOL:
2177 emit(MOV(*dst, src_reg(ir->value.b[i])));
2178 break;
2179 default:
2180 assert(!"Non-float/uint/int/bool constant");
2181 break;
2182 }
2183
2184 remaining_writemask &= ~dst->writemask;
2185 }
2186 dst->reg_offset++;
2187 }
2188
2189 void
2190 vec4_visitor::visit(ir_constant *ir)
2191 {
2192 dst_reg dst = dst_reg(this, ir->type);
2193 this->result = src_reg(dst);
2194
2195 emit_constant_values(&dst, ir);
2196 }
2197
2198 void
2199 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2200 {
2201 ir_dereference *deref = static_cast<ir_dereference *>(
2202 ir->actual_parameters.get_head());
2203 ir_variable *location = deref->variable_referenced();
2204 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2205 location->data.atomic.buffer_index);
2206
2207 /* Calculate the surface offset */
2208 src_reg offset(this, glsl_type::uint_type);
2209 ir_dereference_array *deref_array = deref->as_dereference_array();
2210 if (deref_array) {
2211 deref_array->array_index->accept(this);
2212
2213 src_reg tmp(this, glsl_type::uint_type);
2214 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2215 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2216 } else {
2217 offset = location->data.atomic.offset;
2218 }
2219
2220 /* Emit the appropriate machine instruction */
2221 const char *callee = ir->callee->function_name();
2222 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2223
2224 if (!strcmp("__intrinsic_atomic_read", callee)) {
2225 emit_untyped_surface_read(surf_index, dst, offset);
2226
2227 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2228 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2229 src_reg(), src_reg());
2230
2231 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2232 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2233 src_reg(), src_reg());
2234 }
2235 }
2236
2237 void
2238 vec4_visitor::visit(ir_call *ir)
2239 {
2240 const char *callee = ir->callee->function_name();
2241
2242 if (!strcmp("__intrinsic_atomic_read", callee) ||
2243 !strcmp("__intrinsic_atomic_increment", callee) ||
2244 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2245 visit_atomic_counter_intrinsic(ir);
2246 } else {
2247 assert(!"Unsupported intrinsic.");
2248 }
2249 }
2250
2251 src_reg
2252 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2253 {
2254 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2255 inst->base_mrf = 2;
2256 inst->mlen = 1;
2257 inst->sampler = sampler;
2258 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2259 inst->dst.writemask = WRITEMASK_XYZW;
2260
2261 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2262 int param_base = inst->base_mrf;
2263 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2264 int zero_mask = 0xf & ~coord_mask;
2265
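/* Write the coordinate into the low channels of the parameter vec4 and
 * zero out the remaining channels.
 */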
2266 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2267 coordinate));
2268
2269 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2270 src_reg(0)));
2271
2272 emit(inst);
2273 return src_reg(inst->dst);
2274 }
2275
2276 void
2277 vec4_visitor::visit(ir_texture *ir)
2278 {
2279 int sampler =
2280 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2281
2282 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2283 * emitting anything other than setting up the constant result.
2284 */
2285 if (ir->op == ir_tg4) {
2286 ir_constant *chan = ir->lod_info.component->as_constant();
2287 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2288 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2289 dst_reg result(this, ir->type);
2290 this->result = src_reg(result);
2291 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2292 return;
2293 }
2294 }
2295
2296 /* Should be lowered by do_lower_texture_projection */
2297 assert(!ir->projector);
2298
2299 /* Should be lowered */
2300 assert(!ir->offset || !ir->offset->type->is_array());
2301
2302 /* Generate code to compute all the subexpression trees. This has to be
2303 * done before loading any values into MRFs for the sampler message since
2304 * generating these values may involve SEND messages that need the MRFs.
2305 */
2306 src_reg coordinate;
2307 if (ir->coordinate) {
2308 ir->coordinate->accept(this);
2309 coordinate = this->result;
2310 }
2311
2312 src_reg shadow_comparitor;
2313 if (ir->shadow_comparitor) {
2314 ir->shadow_comparitor->accept(this);
2315 shadow_comparitor = this->result;
2316 }
2317
2318 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2319 src_reg offset_value;
2320 if (has_nonconstant_offset) {
2321 ir->offset->accept(this);
2322 offset_value = src_reg(this->result);
2323 }
2324
2325 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2326 src_reg lod, dPdx, dPdy, sample_index, mcs;
2327 switch (ir->op) {
2328 case ir_tex:
2329 lod = src_reg(0.0f);
2330 lod_type = glsl_type::float_type;
2331 break;
2332 case ir_txf:
2333 case ir_txl:
2334 case ir_txs:
2335 ir->lod_info.lod->accept(this);
2336 lod = this->result;
2337 lod_type = ir->lod_info.lod->type;
2338 break;
2339 case ir_query_levels:
2340 lod = src_reg(0);
2341 lod_type = glsl_type::int_type;
2342 break;
2343 case ir_txf_ms:
2344 ir->lod_info.sample_index->accept(this);
2345 sample_index = this->result;
2346 sample_index_type = ir->lod_info.sample_index->type;
2347
2348 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2349 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2350 else
2351 mcs = src_reg(0u);
2352 break;
2353 case ir_txd:
2354 ir->lod_info.grad.dPdx->accept(this);
2355 dPdx = this->result;
2356
2357 ir->lod_info.grad.dPdy->accept(this);
2358 dPdy = this->result;
2359
2360 lod_type = ir->lod_info.grad.dPdx->type;
2361 break;
2362 case ir_txb:
2363 case ir_lod:
2364 case ir_tg4:
2365 break;
2366 }
2367
2368 vec4_instruction *inst = NULL;
2369 switch (ir->op) {
2370 case ir_tex:
2371 case ir_txl:
2372 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2373 break;
2374 case ir_txd:
2375 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2376 break;
2377 case ir_txf:
2378 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2379 break;
2380 case ir_txf_ms:
2381 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2382 break;
2383 case ir_txs:
2384 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2385 break;
2386 case ir_tg4:
2387 if (has_nonconstant_offset)
2388 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2389 else
2390 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2391 break;
2392 case ir_query_levels:
2393 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2394 break;
2395 case ir_txb:
2396 assert(!"TXB is not valid for vertex shaders.");
2397 break;
2398 case ir_lod:
2399 assert(!"LOD is not valid for vertex shaders.");
2400 break;
2401 default:
2402 assert(!"Unrecognized tex op");
2403 }
2404
2405 if (ir->offset != NULL && ir->op != ir_txf)
2406 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2407
2408 /* Stuff the channel select bits into the top of the texture offset */
2409 if (ir->op == ir_tg4)
2410 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2411
2412 /* The message header is necessary for:
2413 * - Gen4 (always)
2414 * - Texel offsets
2415 * - Gather channel selection
2416 * - Sampler indices too large to fit in a 4-bit value.
2417 */
2418 inst->header_present =
2419 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2420 sampler >= 16;
2421 inst->base_mrf = 2;
2422 inst->mlen = inst->header_present + 1; /* always at least one */
2423 inst->sampler = sampler;
2424 inst->dst = dst_reg(this, ir->type);
2425 inst->dst.writemask = WRITEMASK_XYZW;
2426 inst->shadow_compare = ir->shadow_comparitor != NULL;
2427
2428 /* MRF for the first parameter */
2429 int param_base = inst->base_mrf + inst->header_present;
2430
2431 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2432 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2433 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2434 } else {
2435 /* Load the coordinate */
2436 /* FINISHME: gl_clamp_mask and saturate */
2437 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2438 int zero_mask = 0xf & ~coord_mask;
2439
2440 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2441 coordinate));
2442
2443 if (zero_mask != 0) {
2444 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2445 src_reg(0)));
2446 }
2447 /* Load the shadow comparitor */
2448 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2449 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2450 WRITEMASK_X),
2451 shadow_comparitor));
2452 inst->mlen++;
2453 }
2454
2455 /* Load the LOD info */
2456 if (ir->op == ir_tex || ir->op == ir_txl) {
2457 int mrf, writemask;
2458 if (brw->gen >= 5) {
2459 mrf = param_base + 1;
2460 if (ir->shadow_comparitor) {
2461 writemask = WRITEMASK_Y;
2462 /* mlen already incremented */
2463 } else {
2464 writemask = WRITEMASK_X;
2465 inst->mlen++;
2466 }
2467 } else /* brw->gen == 4 */ {
2468 mrf = param_base;
2469 writemask = WRITEMASK_W;
2470 }
2471 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2472 } else if (ir->op == ir_txf) {
2473 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2474 } else if (ir->op == ir_txf_ms) {
2475 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2476 sample_index));
2477 if (brw->gen >= 7)
2478 /* MCS data is in the first channel of `mcs`, but we need to get it into
2479 * the .y channel of the second vec4 of params, so replicate .x across
2480 * the whole vec4 and then mask off everything except .y
2481 */
2482 mcs.swizzle = BRW_SWIZZLE_XXXX;
2483 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2484 mcs));
2485 inst->mlen++;
2486 } else if (ir->op == ir_txd) {
2487 const glsl_type *type = lod_type;
2488
2489 if (brw->gen >= 5) {
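/* Interleave the gradients: this parameter register ends up holding
 * dPdx.x, dPdy.x, dPdx.y, dPdy.y, so replicate each gradient across
 * channel pairs and write dPdx into .xz and dPdy into .yw.
 */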
2490 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2491 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2492 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2493 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2494 inst->mlen++;
2495
2496 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2497 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2498 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2499 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2500 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2501 inst->mlen++;
2502
2503 if (ir->shadow_comparitor) {
2504 emit(MOV(dst_reg(MRF, param_base + 2,
2505 ir->shadow_comparitor->type, WRITEMASK_Z),
2506 shadow_comparitor));
2507 }
2508 }
2509 } else /* brw->gen == 4 */ {
2510 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2511 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2512 inst->mlen += 2;
2513 }
2514 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2515 if (ir->shadow_comparitor) {
2516 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2517 shadow_comparitor));
2518 }
2519
2520 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2521 offset_value));
2522 inst->mlen++;
2523 }
2524 }
2525
2526 emit(inst);
2527
2528 /* Fix up the number of layers (z) for cube arrays: the hardware returns
2529 * faces * layers, but the spec requires just layers.
2530 */
2531 if (ir->op == ir_txs) {
2532 glsl_type const *type = ir->sampler->type;
2533 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2534 type->sampler_array) {
2535 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2536 writemask(inst->dst, WRITEMASK_Z),
2537 src_reg(inst->dst), src_reg(6));
2538 }
2539 }
2540
2541 if (brw->gen == 6 && ir->op == ir_tg4) {
2542 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2543 }
2544
2545 swizzle_result(ir, src_reg(inst->dst), sampler);
2546 }
2547
2548 /**
2549 * Apply workarounds for Gen6 gather with UINT/SINT
2550 */
2551 void
2552 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2553 {
2554 if (!wa)
2555 return;
2556
2557 int width = (wa & WA_8BIT) ? 8 : 16;
2558 dst_reg dst_f = dst;
2559 dst_f.type = BRW_REGISTER_TYPE_F;
2560
2561 /* Convert from UNORM to UINT */
2562 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2563 emit(MOV(dst, src_reg(dst_f)));
2564
2565 if (wa & WA_SIGN) {
2566 /* Reinterpret the UINT value as a signed INT value by
2567 * shifting the sign bit into place, then shifting back
2568 * preserving sign.
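*
* For an 8-bit surface this is a left shift by 24 followed by an
* arithmetic right shift by 24, which sign-extends the low byte.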
2569 */
2570 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2571 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2572 }
2573 }
2574
2575 /**
2576 * Set up the gather channel based on the swizzle, for gather4.
2577 */
2578 uint32_t
2579 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2580 {
2581 ir_constant *chan = ir->lod_info.component->as_constant();
2582 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2583 switch (swiz) {
2584 case SWIZZLE_X: return 0;
2585 case SWIZZLE_Y:
2586 /* gather4 sampler is broken for green channel on RG32F --
2587 * we must ask for blue instead.
2588 */
2589 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2590 return 2;
2591 return 1;
2592 case SWIZZLE_Z: return 2;
2593 case SWIZZLE_W: return 3;
2594 default:
2595 assert(!"Not reached"); /* zero, one swizzles handled already */
2596 return 0;
2597 }
2598 }
2599
2600 void
2601 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2602 {
2603 int s = key->tex.swizzles[sampler];
2604
2605 this->result = src_reg(this, ir->type);
2606 dst_reg swizzled_result(this->result);
2607
2608 if (ir->op == ir_query_levels) {
2609 /* # levels is in .w */
2610 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2611 emit(MOV(swizzled_result, orig_val));
2612 return;
2613 }
2614
2615 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2616 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2617 emit(MOV(swizzled_result, orig_val));
2618 return;
2619 }
2620
2621
2622 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2623 int swizzle[4] = {0};
2624
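/* Partition the API swizzle into channels copied from the texture result,
 * channels forced to zero, and channels forced to one; each group gets at
 * most one MOV below.
 */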
2625 for (int i = 0; i < 4; i++) {
2626 switch (GET_SWZ(s, i)) {
2627 case SWIZZLE_ZERO:
2628 zero_mask |= (1 << i);
2629 break;
2630 case SWIZZLE_ONE:
2631 one_mask |= (1 << i);
2632 break;
2633 default:
2634 copy_mask |= (1 << i);
2635 swizzle[i] = GET_SWZ(s, i);
2636 break;
2637 }
2638 }
2639
2640 if (copy_mask) {
2641 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2642 swizzled_result.writemask = copy_mask;
2643 emit(MOV(swizzled_result, orig_val));
2644 }
2645
2646 if (zero_mask) {
2647 swizzled_result.writemask = zero_mask;
2648 emit(MOV(swizzled_result, src_reg(0.0f)));
2649 }
2650
2651 if (one_mask) {
2652 swizzled_result.writemask = one_mask;
2653 emit(MOV(swizzled_result, src_reg(1.0f)));
2654 }
2655 }
2656
2657 void
2658 vec4_visitor::visit(ir_return *ir)
2659 {
2660 assert(!"not reached");
2661 }
2662
2663 void
2664 vec4_visitor::visit(ir_discard *ir)
2665 {
2666 assert(!"not reached");
2667 }
2668
2669 void
2670 vec4_visitor::visit(ir_if *ir)
2671 {
2672 /* Don't point the annotation at the if statement, because then it plus
2673 * the then and else blocks get printed.
2674 */
2675 this->base_ir = ir->condition;
2676
2677 if (brw->gen == 6) {
2678 emit_if_gen6(ir);
2679 } else {
2680 uint32_t predicate;
2681 emit_bool_to_cond_code(ir->condition, &predicate);
2682 emit(IF(predicate));
2683 }
2684
2685 visit_instructions(&ir->then_instructions);
2686
2687 if (!ir->else_instructions.is_empty()) {
2688 this->base_ir = ir->condition;
2689 emit(BRW_OPCODE_ELSE);
2690
2691 visit_instructions(&ir->else_instructions);
2692 }
2693
2694 this->base_ir = ir->condition;
2695 emit(BRW_OPCODE_ENDIF);
2696 }
2697
2698 void
2699 vec4_visitor::visit(ir_emit_vertex *)
2700 {
2701 assert(!"not reached");
2702 }
2703
2704 void
2705 vec4_visitor::visit(ir_end_primitive *)
2706 {
2707 assert(!"not reached");
2708 }
2709
2710 void
2711 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2712 dst_reg dst, src_reg offset,
2713 src_reg src0, src_reg src1)
2714 {
2715 unsigned mlen = 0;
2716
2717 /* Set the atomic operation offset. */
2718 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2719 mlen++;
2720
2721 /* Set the atomic operation arguments. */
2722 if (src0.file != BAD_FILE) {
2723 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2724 mlen++;
2725 }
2726
2727 if (src1.file != BAD_FILE) {
2728 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2729 mlen++;
2730 }
2731
2732 /* Emit the instruction. Note that this maps to the normal SIMD8
2733 * untyped atomic message on Ivy Bridge, but that's OK because
2734 * unused channels will be masked out.
2735 */
2736 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2737 src_reg(atomic_op), src_reg(surf_index));
2738 inst->base_mrf = 0;
2739 inst->mlen = mlen;
2740 }
2741
2742 void
2743 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2744 src_reg offset)
2745 {
2746 /* Set the surface read offset. */
2747 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2748
2749 /* Emit the instruction. Note that this maps to the normal SIMD8
2750 * untyped surface read message, but that's OK because unused
2751 * channels will be masked out.
2752 */
2753 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2754 dst, src_reg(surf_index));
2755 inst->base_mrf = 0;
2756 inst->mlen = 1;
2757 }
2758
2759 void
2760 vec4_visitor::emit_ndc_computation()
2761 {
2762 /* Get the position */
2763 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2764
2765 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2766 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2767 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2768
2769 current_annotation = "NDC";
2770 dst_reg ndc_w = ndc;
2771 ndc_w.writemask = WRITEMASK_W;
2772 src_reg pos_w = pos;
2773 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2774 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2775
2776 dst_reg ndc_xyz = ndc;
2777 ndc_xyz.writemask = WRITEMASK_XYZ;
2778
2779 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2780 }
2781
2782 void
2783 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2784 {
2785 if (brw->gen < 6 &&
2786 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2787 key->userclip_active || brw->has_negative_rhw_bug)) {
2788 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2789 dst_reg header1_w = header1;
2790 header1_w.writemask = WRITEMASK_W;
2791
2792 emit(MOV(header1, 0u));
2793
2794 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2795 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2796
2797 current_annotation = "Point size";
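/* Scale the point size by 2^11 and keep bits 8..18, packing it into the
 * header's 11-bit point-width field as a fixed-point value with three
 * fractional bits.
 */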
2798 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2799 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2800 }
2801
2802 if (key->userclip_active) {
2803 current_annotation = "Clipping flags";
2804 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2805 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2806
2807 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2808 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2809 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2810
2811 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2812 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2813 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2814 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2815 }
2816
2817 /* i965 clipping workaround:
2818 * 1) Test for -ve rhw
2819 * 2) If set,
2820 * set ndc = (0,0,0,0)
2821 * set ucp[6] = 1
2822 *
2823 * Later, clipping will detect ucp[6] and ensure the primitive is
2824 * clipped against all fixed planes.
2825 */
2826 if (brw->has_negative_rhw_bug) {
2827 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2828 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2829 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2830 vec4_instruction *inst;
2831 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2832 inst->predicate = BRW_PREDICATE_NORMAL;
2833 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2834 inst->predicate = BRW_PREDICATE_NORMAL;
2835 }
2836
2837 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2838 } else if (brw->gen < 6) {
2839 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2840 } else {
2841 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2842 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2843 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2844 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2845 }
2846 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2847 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2848 src_reg(output_reg[VARYING_SLOT_LAYER])));
2849 }
2850 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2851 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2852 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2853 }
2854 }
2855 }
2856
2857 void
2858 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2859 {
2860 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2861 *
2862 * "If a linked set of shaders forming the vertex stage contains no
2863 * static write to gl_ClipVertex or gl_ClipDistance, but the
2864 * application has requested clipping against user clip planes through
2865 * the API, then the coordinate written to gl_Position is used for
2866 * comparison against the user clip planes."
2867 *
2868 * This function is only called if the shader didn't write to
2869 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2870 * if the user wrote to it; otherwise we use gl_Position.
2871 */
2872 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2873 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2874 clip_vertex = VARYING_SLOT_POS;
2875 }
2876
2877 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2878 ++i) {
2879 reg.writemask = 1 << i;
2880 emit(DP4(reg,
2881 src_reg(output_reg[clip_vertex]),
2882 src_reg(this->userplane[i + offset])));
2883 }
2884 }
2885
2886 void
2887 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2888 {
2889 assert (varying < VARYING_SLOT_MAX);
2890 reg.type = output_reg[varying].type;
2891 current_annotation = output_reg_annotation[varying];
2892 /* Copy the register, saturating if necessary */
2893 vec4_instruction *inst = emit(MOV(reg,
2894 src_reg(output_reg[varying])));
2895 if ((varying == VARYING_SLOT_COL0 ||
2896 varying == VARYING_SLOT_COL1 ||
2897 varying == VARYING_SLOT_BFC0 ||
2898 varying == VARYING_SLOT_BFC1) &&
2899 key->clamp_vertex_color) {
2900 inst->saturate = true;
2901 }
2902 }
2903
2904 void
2905 vec4_visitor::emit_urb_slot(int mrf, int varying)
2906 {
2907 struct brw_reg hw_reg = brw_message_reg(mrf);
2908 dst_reg reg = dst_reg(MRF, mrf);
2909 reg.type = BRW_REGISTER_TYPE_F;
2910
2911 switch (varying) {
2912 case VARYING_SLOT_PSIZ:
2913 /* PSIZ is always in slot 0, and is coupled with other flags. */
2914 current_annotation = "indices, point width, clip flags";
2915 emit_psiz_and_flags(hw_reg);
2916 break;
2917 case BRW_VARYING_SLOT_NDC:
2918 current_annotation = "NDC";
2919 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2920 break;
2921 case VARYING_SLOT_POS:
2922 current_annotation = "gl_Position";
2923 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2924 break;
2925 case VARYING_SLOT_EDGE:
2926 /* This is present when doing unfilled polygons. We're supposed to copy
2927 * the edge flag from the user-provided vertex array
2928 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2929 * of that attribute (starts as 1.0f). This is then used in clipping to
2930 * determine which edges should be drawn as wireframe.
2931 */
2932 current_annotation = "edge flag";
2933 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2934 glsl_type::float_type, WRITEMASK_XYZW))));
2935 break;
2936 case BRW_VARYING_SLOT_PAD:
2937 /* No need to write to this slot */
2938 break;
2939 default:
2940 emit_generic_urb_slot(reg, varying);
2941 break;
2942 }
2943 }
2944
2945 static int
2946 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2947 {
2948 if (brw->gen >= 6) {
2949 /* URB data written (does not include the message header reg) must
2950 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2951 * section 5.4.3.2.2: URB_INTERLEAVED.
2952 *
2953 * URB entries are allocated on a multiple of 1024 bits, so an
2954 * extra 128 bits written here to make the end align to 256 is
2955 * no problem.
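*
* mlen counts the message header plus the data registers, so the data
* length is even exactly when mlen is odd; e.g. a header plus three data
* registers (mlen == 4) is padded out to mlen == 5.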
2956 */
2957 if ((mlen % 2) != 1)
2958 mlen++;
2959 }
2960
2961 return mlen;
2962 }
2963
2964
2965 /**
2966 * Generates the VUE payload plus the necessary URB write instructions to
2967 * output it.
2968 *
2969 * The VUE layout is documented in Volume 2a.
2970 */
2971 void
2972 vec4_visitor::emit_vertex()
2973 {
2974 /* MRF 0 is reserved for the debugger, so start with message header
2975 * in MRF 1.
2976 */
2977 int base_mrf = 1;
2978 int mrf = base_mrf;
2979 /* In the process of generating our URB write message contents, we
2980 * may need to unspill a register or load from an array. Those
2981 * reads would use MRFs 14-15.
2982 */
2983 int max_usable_mrf = 13;
2984
2985 /* The following assertion verifies that max_usable_mrf causes an
2986 * even-numbered amount of URB write data, which will meet gen6's
2987 * requirements for length alignment.
2988 */
2989 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2990
2991 /* First mrf is the g0-based message header containing URB handles and
2992 * such.
2993 */
2994 emit_urb_write_header(mrf++);
2995
2996 if (brw->gen < 6) {
2997 emit_ndc_computation();
2998 }
2999
3000 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3001 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3002 current_annotation = "user clip distances";
3003
3004 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3005 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3006
3007 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3008 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3009 }
3010
3011 /* We may need to split this up into several URB writes, so do them in a
3012 * loop.
3013 */
3014 int slot = 0;
3015 bool complete = false;
3016 do {
3017 /* URB offset is in URB row increments, and each of our MRFs is half of
3018 * one of those, since we're doing interleaved writes.
3019 */
3020 int offset = slot / 2;
3021
3022 mrf = base_mrf + 1;
3023 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3024 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3025
3026 /* If this was max_usable_mrf, we can't fit anything more into this
3027 * URB WRITE.
3028 */
3029 if (mrf > max_usable_mrf) {
3030 slot++;
3031 break;
3032 }
3033 }
3034
3035 complete = slot >= prog_data->vue_map.num_slots;
3036 current_annotation = "URB write";
3037 vec4_instruction *inst = emit_urb_write_opcode(complete);
3038 inst->base_mrf = base_mrf;
3039 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3040 inst->offset += offset;
3041 } while(!complete);
3042 }
3043
3044
3045 src_reg
3046 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3047 src_reg *reladdr, int reg_offset)
3048 {
3049 /* Because we store the values to scratch interleaved like our
3050 * vertex data, we need to scale the vec4 index by 2.
3051 */
3052 int message_header_scale = 2;
3053
3054 /* Pre-gen6, the message header uses byte offsets instead of vec4
3055 * (16-byte) offset units.
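*
* Combined with the interleaving factor of 2 above, each vec4 reg_offset
* becomes 2 rows on gen6+ and 32 bytes on earlier generations.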
3056 */
3057 if (brw->gen < 6)
3058 message_header_scale *= 16;
3059
3060 if (reladdr) {
3061 src_reg index = src_reg(this, glsl_type::int_type);
3062
3063 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3064 emit_before(inst, MUL(dst_reg(index),
3065 index, src_reg(message_header_scale)));
3066
3067 return index;
3068 } else {
3069 return src_reg(reg_offset * message_header_scale);
3070 }
3071 }
3072
3073 src_reg
3074 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3075 src_reg *reladdr, int reg_offset)
3076 {
3077 if (reladdr) {
3078 src_reg index = src_reg(this, glsl_type::int_type);
3079
3080 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3081
3082 /* Pre-gen6, the message header uses byte offsets instead of vec4
3083 * (16-byte) offset units.
3084 */
3085 if (brw->gen < 6) {
3086 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3087 }
3088
3089 return index;
3090 } else if (brw->gen >= 8) {
3091 /* Store the offset in a GRF so we can send-from-GRF. */
3092 src_reg offset = src_reg(this, glsl_type::int_type);
3093 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3094 return offset;
3095 } else {
3096 int message_header_scale = brw->gen < 6 ? 16 : 1;
3097 return src_reg(reg_offset * message_header_scale);
3098 }
3099 }
3100
3101 /**
3102 * Emits an instruction before @inst to load the value named by @orig_src
3103 * from scratch space at @base_offset to @temp.
3104 *
3105 * @base_offset is measured in 32-byte units (the size of a register).
3106 */
3107 void
3108 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3109 dst_reg temp, src_reg orig_src,
3110 int base_offset)
3111 {
3112 int reg_offset = base_offset + orig_src.reg_offset;
3113 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3114
3115 emit_before(inst, SCRATCH_READ(temp, index));
3116 }
3117
3118 /**
3119 * Emits an instruction after @inst to store the value to be written
3120 * to @orig_dst to scratch space at @base_offset, from @temp.
3121 *
3122 * @base_offset is measured in 32-byte units (the size of a register).
3123 */
3124 void
3125 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3126 {
3127 int reg_offset = base_offset + inst->dst.reg_offset;
3128 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3129
3130 /* Create a temporary register to store *inst's result in.
3131 *
3132 * We have to be careful in MOVing from our temporary result register in
3133 * the scratch write. If we swizzle from channels of the temporary that
3134 * weren't initialized, it will confuse live interval analysis, which will
3135 * make spilling fail to make progress.
3136 */
3137 src_reg temp = src_reg(this, glsl_type::vec4_type);
3138 temp.type = inst->dst.type;
3139 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3140 int swizzles[4];
3141 for (int i = 0; i < 4; i++)
3142 if (inst->dst.writemask & (1 << i))
3143 swizzles[i] = i;
3144 else
3145 swizzles[i] = first_writemask_chan;
3146 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3147 swizzles[2], swizzles[3]);
3148
3149 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3150 inst->dst.writemask));
3151 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3152 write->predicate = inst->predicate;
3153 write->ir = inst->ir;
3154 write->annotation = inst->annotation;
3155 inst->insert_after(write);
3156
3157 inst->dst.file = temp.file;
3158 inst->dst.reg = temp.reg;
3159 inst->dst.reg_offset = temp.reg_offset;
3160 inst->dst.reladdr = NULL;
3161 }
3162
3163 /**
3164 * We can't generally support array access in GRF space, because a
3165 * single instruction's destination can only span 2 contiguous
3166 * registers. So, we send all GRF arrays that get variable index
3167 * access to scratch space.
3168 */
3169 void
3170 vec4_visitor::move_grf_array_access_to_scratch()
3171 {
3172 int scratch_loc[this->virtual_grf_count];
3173
3174 for (int i = 0; i < this->virtual_grf_count; i++) {
3175 scratch_loc[i] = -1;
3176 }
3177
3178 /* First, calculate the set of virtual GRFs that need to be punted
3179 * to scratch due to having any array access on them, and where in
3180 * scratch.
3181 */
3182 foreach_list(node, &this->instructions) {
3183 vec4_instruction *inst = (vec4_instruction *)node;
3184
3185 if (inst->dst.file == GRF && inst->dst.reladdr &&
3186 scratch_loc[inst->dst.reg] == -1) {
3187 scratch_loc[inst->dst.reg] = c->last_scratch;
3188 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3189 }
3190
3191 for (int i = 0 ; i < 3; i++) {
3192 src_reg *src = &inst->src[i];
3193
3194 if (src->file == GRF && src->reladdr &&
3195 scratch_loc[src->reg] == -1) {
3196 scratch_loc[src->reg] = c->last_scratch;
3197 c->last_scratch += this->virtual_grf_sizes[src->reg];
3198 }
3199 }
3200 }
3201
3202 /* Now, for anything that will be accessed through scratch, rewrite
3203 * it to load/store. Note that this is a _safe list walk, because
3204 * we may generate a new scratch_write instruction after the one
3205 * we're processing.
3206 */
3207 foreach_list_safe(node, &this->instructions) {
3208 vec4_instruction *inst = (vec4_instruction *)node;
3209
3210 /* Set up the annotation tracking for new generated instructions. */
3211 base_ir = inst->ir;
3212 current_annotation = inst->annotation;
3213
3214 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3215 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3216 }
3217
3218 for (int i = 0 ; i < 3; i++) {
3219 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3220 continue;
3221
3222 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3223
3224 emit_scratch_read(inst, temp, inst->src[i],
3225 scratch_loc[inst->src[i].reg]);
3226
3227 inst->src[i].file = temp.file;
3228 inst->src[i].reg = temp.reg;
3229 inst->src[i].reg_offset = temp.reg_offset;
3230 inst->src[i].reladdr = NULL;
3231 }
3232 }
3233 }
3234
3235 /**
3236 * Emits an instruction before @inst to load the value named by @orig_src
3237 * from the pull constant buffer (surface) at @base_offset to @temp.
3238 */
3239 void
3240 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3241 dst_reg temp, src_reg orig_src,
3242 int base_offset)
3243 {
3244 int reg_offset = base_offset + orig_src.reg_offset;
3245 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3246 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3247 vec4_instruction *load;
3248
3249 if (brw->gen >= 7) {
3250 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3251 grf_offset.type = offset.type;
3252 emit_before(inst, MOV(grf_offset, offset));
3253
3254 load = new(mem_ctx) vec4_instruction(this,
3255 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3256 temp, index, src_reg(grf_offset));
3257 } else {
3258 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3259 temp, index, offset);
3260 load->base_mrf = 14;
3261 load->mlen = 1;
3262 }
3263 emit_before(inst, load);
3264 }
3265
3266 /**
3267 * Implements array access of uniforms by inserting a
3268 * PULL_CONSTANT_LOAD instruction.
3269 *
3270 * Unlike temporary GRF array access, which we don't support due to
3271 * the difficulty of doing relative addressing on instruction
3272 * destinations, we could potentially do array access of uniforms
3273 * that were loaded in GRF space as push constants. In real-world
3274 * usage we've seen, though, the arrays being used are always larger
3275 * than we could load as push constants, so just always move all
3276 * uniform array access out to a pull constant buffer.
3277 */
3278 void
3279 vec4_visitor::move_uniform_array_access_to_pull_constants()
3280 {
3281 int pull_constant_loc[this->uniforms];
3282
3283 for (int i = 0; i < this->uniforms; i++) {
3284 pull_constant_loc[i] = -1;
3285 }
3286
3287 /* Walk through and find array access of uniforms. Put a copy of that
3288 * uniform in the pull constant buffer.
3289 *
3290 * Note that we don't move constant-indexed accesses to arrays. No
3291 * testing has been done of the performance impact of this choice.
3292 */
3293 foreach_list_safe(node, &this->instructions) {
3294 vec4_instruction *inst = (vec4_instruction *)node;
3295
3296 for (int i = 0 ; i < 3; i++) {
3297 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3298 continue;
3299
3300 int uniform = inst->src[i].reg;
3301
3302 /* If this array isn't already present in the pull constant buffer,
3303 * add it.
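* The copy below appends the array's parameter pointers to pull_param;
* pull_constant_loc records where the array starts, in units of vec4s.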
3304 */
3305 if (pull_constant_loc[uniform] == -1) {
3306 const float **values = &stage_prog_data->param[uniform * 4];
3307
3308 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3309
3310 assert(uniform < uniform_array_size);
3311 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3312 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3313 = values[j];
3314 }
3315 }
3316
3317 /* Set up the annotation tracking for new generated instructions. */
3318 base_ir = inst->ir;
3319 current_annotation = inst->annotation;
3320
3321 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3322
3323 emit_pull_constant_load(inst, temp, inst->src[i],
3324 pull_constant_loc[uniform]);
3325
3326 inst->src[i].file = temp.file;
3327 inst->src[i].reg = temp.reg;
3328 inst->src[i].reg_offset = temp.reg_offset;
3329 inst->src[i].reladdr = NULL;
3330 }
3331 }
3332
3333 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3334 * no need to track them as larger-than-vec4 objects. This will be
3335 * relied on in cutting out unused uniform vectors from push
3336 * constants.
3337 */
3338 split_uniform_registers();
3339 }
3340
3341 void
3342 vec4_visitor::resolve_ud_negate(src_reg *reg)
3343 {
3344 if (reg->type != BRW_REGISTER_TYPE_UD ||
3345 !reg->negate)
3346 return;
3347
3348 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3349 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3350 *reg = temp;
3351 }
3352
3353 vec4_visitor::vec4_visitor(struct brw_context *brw,
3354 struct brw_vec4_compile *c,
3355 struct gl_program *prog,
3356 const struct brw_vec4_prog_key *key,
3357 struct brw_vec4_prog_data *prog_data,
3358 struct gl_shader_program *shader_prog,
3359 struct brw_shader *shader,
3360 void *mem_ctx,
3361 bool debug_flag,
3362 bool no_spills,
3363 shader_time_shader_type st_base,
3364 shader_time_shader_type st_written,
3365 shader_time_shader_type st_reset)
3366 : c(c),
3367 key(key),
3368 prog_data(prog_data),
3369 sanity_param_count(0),
3370 fail_msg(NULL),
3371 first_non_payload_grf(0),
3372 need_all_constants_in_pull_buffer(false),
3373 debug_flag(debug_flag),
3374 no_spills(no_spills),
3375 st_base(st_base),
3376 st_written(st_written),
3377 st_reset(st_reset)
3378 {
3379 this->brw = brw;
3380 this->ctx = &brw->ctx;
3381 this->shader_prog = shader_prog;
3382 this->shader = shader;
3383
3384 this->mem_ctx = mem_ctx;
3385 this->failed = false;
3386
3387 this->base_ir = NULL;
3388 this->current_annotation = NULL;
3389 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3390
3391 this->prog = prog;
3392 this->stage_prog_data = &prog_data->base;
3393
3394 this->variable_ht = hash_table_ctor(0,
3395 hash_table_pointer_hash,
3396 hash_table_pointer_compare);
3397
3398 this->virtual_grf_start = NULL;
3399 this->virtual_grf_end = NULL;
3400 this->virtual_grf_sizes = NULL;
3401 this->virtual_grf_count = 0;
3402 this->virtual_grf_reg_map = NULL;
3403 this->virtual_grf_reg_count = 0;
3404 this->virtual_grf_array_size = 0;
3405 this->live_intervals_valid = false;
3406
3407 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3408
3409 this->uniforms = 0;
3410
3411 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3412 * at least one. See setup_uniforms() in brw_vec4.cpp.
3413 */
3414 this->uniform_array_size = 1;
3415 if (prog_data) {
3416 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3417 }
3418
3419 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3420 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3421 }
3422
3423 vec4_visitor::~vec4_visitor()
3424 {
3425 hash_table_dtor(this->variable_ht);
3426 }
3427
3428
3429 void
3430 vec4_visitor::fail(const char *format, ...)
3431 {
3432 va_list va;
3433 char *msg;
3434
3435 if (failed)
3436 return;
3437
3438 failed = true;
3439
3440 va_start(va, format);
3441 msg = ralloc_vasprintf(mem_ctx, format, va);
3442 va_end(va);
3443 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3444
3445 this->fail_msg = msg;
3446
3447 if (debug_flag) {
3448 fprintf(stderr, "%s", msg);
3449 }
3450 }
3451
3452 } /* namespace brw */