i965: Allocate vec4_visitor's uniform_size and uniform_vector_size arrays dynamically.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->sampler = 0;
47 this->texture_offset = 0;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = v->base_ir;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_present = false;
53 this->mlen = 0;
54 this->base_mrf = 0;
55 this->offset = 0;
56 this->annotation = v->current_annotation;
57 }
58
59 vec4_instruction *
60 vec4_visitor::emit(vec4_instruction *inst)
61 {
62 this->instructions.push_tail(inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
69 {
70 new_inst->ir = inst->ir;
71 new_inst->annotation = inst->annotation;
72
73 inst->insert_before(new_inst);
74
75 return inst;
76 }
77
78 vec4_instruction *
79 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
80 src_reg src0, src_reg src1, src_reg src2)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
83 src0, src1, src2));
84 }
85
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
89 {
90 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
91 }
92
93 vec4_instruction *
94 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
95 {
96 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
101 {
102 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode)
107 {
108 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
109 }
110
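/**
 * Convenience builders for one-, two-, and three-source ALU instructions.
 * Note that these only construct the vec4_instruction; they do not add it
 * to the instruction stream, so callers pass the result to emit().
 */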
111 #define ALU1(op) \
112 vec4_instruction * \
113 vec4_visitor::op(dst_reg dst, src_reg src0) \
114 { \
115 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
116 src0); \
117 }
118
119 #define ALU2(op) \
120 vec4_instruction * \
121 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
122 { \
123 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
124 src0, src1); \
125 }
126
127 #define ALU3(op) \
128 vec4_instruction * \
129 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
130 { \
131 assert(brw->gen >= 6); \
132 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
133 src0, src1, src2); \
134 }
135
136 ALU1(NOT)
137 ALU1(MOV)
138 ALU1(FRC)
139 ALU1(RNDD)
140 ALU1(RNDE)
141 ALU1(RNDZ)
142 ALU1(F32TO16)
143 ALU1(F16TO32)
144 ALU2(ADD)
145 ALU2(MUL)
146 ALU2(MACH)
147 ALU2(AND)
148 ALU2(OR)
149 ALU2(XOR)
150 ALU2(DP3)
151 ALU2(DP4)
152 ALU2(DPH)
153 ALU2(SHL)
154 ALU2(SHR)
155 ALU2(ASR)
156 ALU3(LRP)
157 ALU1(BFREV)
158 ALU3(BFE)
159 ALU2(BFI1)
160 ALU3(BFI2)
161 ALU1(FBH)
162 ALU1(FBL)
163 ALU1(CBIT)
164 ALU3(MAD)
165 ALU2(ADDC)
166 ALU2(SUBB)
167
168 /** Gen4 predicated IF. */
169 vec4_instruction *
170 vec4_visitor::IF(uint32_t predicate)
171 {
172 vec4_instruction *inst;
173
174 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
175 inst->predicate = predicate;
176
177 return inst;
178 }
179
180 /** Gen6 IF with embedded comparison. */
181 vec4_instruction *
182 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
183 {
184 assert(brw->gen == 6);
185
186 vec4_instruction *inst;
187
188 resolve_ud_negate(&src0);
189 resolve_ud_negate(&src1);
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
192 src0, src1);
193 inst->conditional_mod = condition;
194
195 return inst;
196 }
197
198 /**
199 * CMP: Sets the low bit of the destination channels with the result
200 * of the comparison, while the upper bits are undefined, and updates
201 * the flag register with the packed 16 bits of the result.
202 */
203 vec4_instruction *
204 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
205 {
206 vec4_instruction *inst;
207
208 /* original gen4 does type conversion to the destination type
209 * before comparison, producing garbage results for floating
210 * point comparisons.
211 */
212 if (brw->gen == 4) {
213 dst.type = src0.type;
214 if (dst.file == HW_REG)
215 dst.fixed_hw_reg.type = dst.type;
216 }
217
218 resolve_ud_negate(&src0);
219 resolve_ud_negate(&src1);
220
221 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
222 inst->conditional_mod = condition;
223
224 return inst;
225 }
226
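/**
 * Builders for scratch-buffer (spill space) reads and writes. These only
 * construct the instruction; base_mrf and mlen describe the MRF payload of
 * the gen4-style scratch messages.
 */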
227 vec4_instruction *
228 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
229 {
230 vec4_instruction *inst;
231
232 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
233 dst, index);
234 inst->base_mrf = 14;
235 inst->mlen = 2;
236
237 return inst;
238 }
239
240 vec4_instruction *
241 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
242 {
243 vec4_instruction *inst;
244
245 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
246 dst, src, index);
247 inst->base_mrf = 13;
248 inst->mlen = 3;
249
250 return inst;
251 }
252
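/**
 * Emit a dot product of the requested width: DP2, DP3 or DP4 for
 * two to four elements.
 */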
253 void
254 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
255 {
256 static enum opcode dot_opcodes[] = {
257 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
258 };
259
260 emit(dot_opcodes[elements - 2], dst, src0, src1);
261 }
262
263 src_reg
264 vec4_visitor::fix_3src_operand(src_reg src)
265 {
266 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
267 * able to use vertical stride of zero to replicate the vec4 uniform, like
268 *
269 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
270 *
271 * But you can't, since vertical stride is always four in three-source
272 * instructions. Instead, insert a MOV instruction to do the replication so
273 * that the three-source instruction can consume it.
274 */
275
276 /* The MOV is only needed if the source is a uniform or immediate. */
277 if (src.file != UNIFORM && src.file != IMM)
278 return src;
279
280 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
281 expanded.type = src.type;
282 emit(MOV(expanded, src));
283 return src_reg(expanded);
284 }
285
286 src_reg
287 vec4_visitor::fix_math_operand(src_reg src)
288 {
289 /* The gen6 math instruction ignores the source modifiers --
290 * swizzle, abs, negate, and at least some parts of the register
291 * region description.
292 *
293 * Rather than trying to enumerate all these cases, *always* expand the
294 * operand to a temp GRF for gen6.
295 *
296 * For gen7, keep the operand as-is, except if immediate, which gen7 still
297 * can't use.
298 */
299
300 if (brw->gen == 7 && src.file != IMM)
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(MOV(expanded, src));
306 return src_reg(expanded);
307 }
308
309 void
310 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
311 {
312 src = fix_math_operand(src);
313
314 if (dst.writemask != WRITEMASK_XYZW) {
315 /* The gen6 math instruction must be align1, so we can't do
316 * writemasks.
317 */
318 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
319
320 emit(opcode, temp_dst, src);
321
322 emit(MOV(dst, src_reg(temp_dst)));
323 } else {
324 emit(opcode, dst, src);
325 }
326 }
327
328 void
329 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
330 {
331 vec4_instruction *inst = emit(opcode, dst, src);
332 inst->base_mrf = 1;
333 inst->mlen = 1;
334 }
335
336 void
337 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
338 {
339 switch (opcode) {
340 case SHADER_OPCODE_RCP:
341 case SHADER_OPCODE_RSQ:
342 case SHADER_OPCODE_SQRT:
343 case SHADER_OPCODE_EXP2:
344 case SHADER_OPCODE_LOG2:
345 case SHADER_OPCODE_SIN:
346 case SHADER_OPCODE_COS:
347 break;
348 default:
349 assert(!"not reached: bad math opcode");
350 return;
351 }
352
353 if (brw->gen >= 6) {
354 return emit_math1_gen6(opcode, dst, src);
355 } else {
356 return emit_math1_gen4(opcode, dst, src);
357 }
358 }
359
360 void
361 vec4_visitor::emit_math2_gen6(enum opcode opcode,
362 dst_reg dst, src_reg src0, src_reg src1)
363 {
364 src0 = fix_math_operand(src0);
365 src1 = fix_math_operand(src1);
366
367 if (dst.writemask != WRITEMASK_XYZW) {
368 /* The gen6 math instruction must be align1, so we can't do
369 * writemasks.
370 */
371 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
372 temp_dst.type = dst.type;
373
374 emit(opcode, temp_dst, src0, src1);
375
376 emit(MOV(dst, src_reg(temp_dst)));
377 } else {
378 emit(opcode, dst, src0, src1);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen4(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 vec4_instruction *inst = emit(opcode, dst, src0, src1);
387 inst->base_mrf = 1;
388 inst->mlen = 2;
389 }
390
391 void
392 vec4_visitor::emit_math(enum opcode opcode,
393 dst_reg dst, src_reg src0, src_reg src1)
394 {
395 switch (opcode) {
396 case SHADER_OPCODE_POW:
397 case SHADER_OPCODE_INT_QUOTIENT:
398 case SHADER_OPCODE_INT_REMAINDER:
399 break;
400 default:
401 assert(!"not reached: unsupported binary math opcode");
402 return;
403 }
404
405 if (brw->gen >= 6) {
406 return emit_math2_gen6(opcode, dst, src0, src1);
407 } else {
408 return emit_math2_gen4(opcode, dst, src0, src1);
409 }
410 }
411
412 void
413 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
414 {
415 if (brw->gen < 7)
416 assert(!"ir_unop_pack_half_2x16 should be lowered");
417
418 assert(dst.type == BRW_REGISTER_TYPE_UD);
419 assert(src0.type == BRW_REGISTER_TYPE_F);
420
421 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
422 *
423 * Because this instruction does not have a 16-bit floating-point type,
424 * the destination data type must be Word (W).
425 *
426 * The destination must be DWord-aligned and specify a horizontal stride
427 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
428 * each destination channel and the upper word is not modified.
429 *
430 * The above restriction implies that the f32to16 instruction must use
431 * align1 mode, because only in align1 mode is it possible to specify
432 * horizontal stride. We choose here to defy the hardware docs and emit
433 * align16 instructions.
434 *
435 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
436 * instructions. I was partially successful in that the code passed all
437 * tests. However, the code was dubiously correct and fragile, and the
438 * tests were not harsh enough to probe that frailty. Not trusting the
439 * code, I chose instead to remain in align16 mode in defiance of the hw
440 * docs).
441 *
442 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
443 * simulator, emitting a f32to16 in align16 mode with UD as destination
444 * data type is safe. The behavior differs from that specified in the PRM
445 * in that the upper word of each destination channel is cleared to 0.
446 */
447
448 dst_reg tmp_dst(this, glsl_type::uvec2_type);
449 src_reg tmp_src(tmp_dst);
450
451 #if 0
452 /* Verify the undocumented behavior on which the following instructions
453 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
454 * then the result of the bit-or instruction below will be incorrect.
455 *
456 * You should inspect the disasm output in order to verify that the MOV is
457 * not optimized away.
458 */
459 emit(MOV(tmp_dst, src_reg(0x12345678u)));
460 #endif
461
462 /* Give tmp the form below, where "." means untouched.
463 *
464 * w z y x w z y x
465 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
466 *
467 * That the upper word of each write-channel be 0 is required for the
468 * following bit-shift and bit-or instructions to work. Note that this
469 * relies on the undocumented hardware behavior mentioned above.
470 */
471 tmp_dst.writemask = WRITEMASK_XY;
472 emit(F32TO16(tmp_dst, src0));
473
474 /* Give the write-channels of dst the form:
475 * 0xhhhh0000
476 */
477 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
478 emit(SHL(dst, tmp_src, src_reg(16u)));
479
480 /* Finally, give the write-channels of dst the form of packHalf2x16's
481 * output:
482 * 0xhhhhllll
483 */
484 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
485 emit(OR(dst, src_reg(dst), tmp_src));
486 }
487
488 void
489 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
490 {
491 if (brw->gen < 7)
492 assert(!"ir_unop_unpack_half_2x16 should be lowered");
493
494 assert(dst.type == BRW_REGISTER_TYPE_F);
495 assert(src0.type == BRW_REGISTER_TYPE_UD);
496
497 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
498 *
499 * Because this instruction does not have a 16-bit floating-point type,
500 * the source data type must be Word (W). The destination type must be
501 * F (Float).
502 *
503 * To use W as the source data type, we must adjust horizontal strides,
504 * which is only possible in align1 mode. All my [chadv] attempts at
505 * emitting align1 instructions for unpackHalf2x16 failed to pass the
506 * Piglit tests, so I gave up.
507 *
508 * I've verified that, on gen7 hardware and the simulator, it is safe to
509 * emit f16to32 in align16 mode with UD as source data type.
510 */
511
512 dst_reg tmp_dst(this, glsl_type::uvec2_type);
513 src_reg tmp_src(tmp_dst);
514
515 tmp_dst.writemask = WRITEMASK_X;
516 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
517
518 tmp_dst.writemask = WRITEMASK_Y;
519 emit(SHR(tmp_dst, src0, src_reg(16u)));
520
521 dst.writemask = WRITEMASK_XY;
522 emit(F16TO32(dst, tmp_src));
523 }
524
525 void
526 vec4_visitor::visit_instructions(const exec_list *list)
527 {
528 foreach_list(node, list) {
529 ir_instruction *ir = (ir_instruction *)node;
530
531 base_ir = ir;
532 ir->accept(this);
533 }
534 }
535
536
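/**
 * Returns the number of vec4 slots a value of the given GLSL type occupies
 * in this backend (every scalar or vector is padded out to a full vec4).
 */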
537 static int
538 type_size(const struct glsl_type *type)
539 {
540 unsigned int i;
541 int size;
542
543 switch (type->base_type) {
544 case GLSL_TYPE_UINT:
545 case GLSL_TYPE_INT:
546 case GLSL_TYPE_FLOAT:
547 case GLSL_TYPE_BOOL:
548 if (type->is_matrix()) {
549 return type->matrix_columns;
550 } else {
551 /* Regardless of the size of the vector, it gets a vec4. This is bad
552 * packing for things like floats, but otherwise arrays become a
553 * mess. Hopefully a later pass over the code can pack scalars
554 * down if appropriate.
555 */
556 return 1;
557 }
558 case GLSL_TYPE_ARRAY:
559 assert(type->length > 0);
560 return type_size(type->fields.array) * type->length;
561 case GLSL_TYPE_STRUCT:
562 size = 0;
563 for (i = 0; i < type->length; i++) {
564 size += type_size(type->fields.structure[i].type);
565 }
566 return size;
567 case GLSL_TYPE_SAMPLER:
568 /* Samplers take up one slot in UNIFORMS[], but they're baked in
569 * at link time.
570 */
571 return 1;
572 case GLSL_TYPE_ATOMIC_UINT:
573 return 0;
574 case GLSL_TYPE_IMAGE:
575 case GLSL_TYPE_VOID:
576 case GLSL_TYPE_ERROR:
577 case GLSL_TYPE_INTERFACE:
578 assert(0);
579 break;
580 }
581
582 return 0;
583 }
584
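/**
 * Allocate a new virtual GRF of the given size (in vec4 registers), growing
 * the size/reg_map bookkeeping arrays as needed, and return its index.
 */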
585 int
586 vec4_visitor::virtual_grf_alloc(int size)
587 {
588 if (virtual_grf_array_size <= virtual_grf_count) {
589 if (virtual_grf_array_size == 0)
590 virtual_grf_array_size = 16;
591 else
592 virtual_grf_array_size *= 2;
593 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
594 virtual_grf_array_size);
595 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
596 virtual_grf_array_size);
597 }
598 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
599 virtual_grf_reg_count += size;
600 virtual_grf_sizes[virtual_grf_count] = size;
601 return virtual_grf_count++;
602 }
603
604 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
605 {
606 init();
607
608 this->file = GRF;
609 this->reg = v->virtual_grf_alloc(type_size(type));
610
611 if (type->is_array() || type->is_record()) {
612 this->swizzle = BRW_SWIZZLE_NOOP;
613 } else {
614 this->swizzle = swizzle_for_size(type->vector_elements);
615 }
616
617 this->type = brw_type_for_base_type(type);
618 }
619
620 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
621 {
622 init();
623
624 this->file = GRF;
625 this->reg = v->virtual_grf_alloc(type_size(type));
626
627 if (type->is_array() || type->is_record()) {
628 this->writemask = WRITEMASK_XYZW;
629 } else {
630 this->writemask = (1 << type->vector_elements) - 1;
631 }
632
633 this->type = brw_type_for_base_type(type);
634 }
635
636 /* Our support for uniforms is piggy-backed on the struct
637 * gl_program, because that's where the values actually
638 * get stored, rather than in some global gl_shader_program uniform
639 * store.
640 */
641 void
642 vec4_visitor::setup_uniform_values(ir_variable *ir)
643 {
644 int namelen = strlen(ir->name);
645
646 /* The data for our (non-builtin) uniforms is stored in a series of
647 * gl_uniform_driver_storage structs for each subcomponent that
648 * glGetUniformLocation() could name. We know it's been set up in the same
649 * order we'd walk the type, so walk the list of storage and find anything
650 * with our name, or the prefix of a component that starts with our name.
651 */
652 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
653 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
654
655 if (strncmp(ir->name, storage->name, namelen) != 0 ||
656 (storage->name[namelen] != 0 &&
657 storage->name[namelen] != '.' &&
658 storage->name[namelen] != '[')) {
659 continue;
660 }
661
662 gl_constant_value *components = storage->storage;
663 unsigned vector_count = (MAX2(storage->array_elements, 1) *
664 storage->type->matrix_columns);
665
666 for (unsigned s = 0; s < vector_count; s++) {
667 uniform_vector_size[uniforms] = storage->type->vector_elements;
668
669 int i;
670 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
671 stage_prog_data->param[uniforms * 4 + i] = &components->f;
672 components++;
673 }
674 for (; i < 4; i++) {
675 static float zero = 0;
676 stage_prog_data->param[uniforms * 4 + i] = &zero;
677 }
678
679 uniforms++;
680 }
681 }
682 }
683
684 void
685 vec4_visitor::setup_uniform_clipplane_values()
686 {
687 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
688
689 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
690 this->uniform_vector_size[this->uniforms] = 4;
691 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
692 this->userplane[i].type = BRW_REGISTER_TYPE_F;
693 for (int j = 0; j < 4; ++j) {
694 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
695 }
696 ++this->uniforms;
697 }
698 }
699
700 /* Our support for builtin uniforms is even scarier than non-builtin.
701 * It sits on top of the PROG_STATE_VAR parameters that are
702 * automatically updated from GL context state.
703 */
704 void
705 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
706 {
707 const ir_state_slot *const slots = ir->state_slots;
708 assert(ir->state_slots != NULL);
709
710 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
711 /* This state reference has already been set up by ir_to_mesa,
712 * but we'll get the same index back here. We can reference
713 * ParameterValues directly, since unlike brw_fs.cpp, we never
714 * add new state references during compile.
715 */
716 int index = _mesa_add_state_reference(this->prog->Parameters,
717 (gl_state_index *)slots[i].tokens);
718 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
719
720 this->uniform_vector_size[this->uniforms] = 0;
721 /* Add each of the unique swizzled channels of the element.
722 * This will end up matching the size of the glsl_type of this field.
723 */
724 int last_swiz = -1;
725 for (unsigned int j = 0; j < 4; j++) {
726 int swiz = GET_SWZ(slots[i].swizzle, j);
727 last_swiz = swiz;
728
729 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
730 if (swiz <= last_swiz)
731 this->uniform_vector_size[this->uniforms]++;
732 }
733 this->uniforms++;
734 }
735 }
736
737 dst_reg *
738 vec4_visitor::variable_storage(ir_variable *var)
739 {
740 return (dst_reg *)hash_table_find(this->variable_ht, var);
741 }
742
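/**
 * Emit instructions whose only purpose is to set the flag register from the
 * boolean rvalue ir, and return in *predicate the predication mode (normal
 * or ALIGN16 ANY4H/ALL4H) that a following predicated instruction should use.
 */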
743 void
744 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
745 {
746 ir_expression *expr = ir->as_expression();
747
748 *predicate = BRW_PREDICATE_NORMAL;
749
750 if (expr) {
751 src_reg op[2];
752 vec4_instruction *inst;
753
754 assert(expr->get_num_operands() <= 2);
755 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
756 expr->operands[i]->accept(this);
757 op[i] = this->result;
758
759 resolve_ud_negate(&op[i]);
760 }
761
762 switch (expr->operation) {
763 case ir_unop_logic_not:
764 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
765 inst->conditional_mod = BRW_CONDITIONAL_Z;
766 break;
767
768 case ir_binop_logic_xor:
769 inst = emit(XOR(dst_null_d(), op[0], op[1]));
770 inst->conditional_mod = BRW_CONDITIONAL_NZ;
771 break;
772
773 case ir_binop_logic_or:
774 inst = emit(OR(dst_null_d(), op[0], op[1]));
775 inst->conditional_mod = BRW_CONDITIONAL_NZ;
776 break;
777
778 case ir_binop_logic_and:
779 inst = emit(AND(dst_null_d(), op[0], op[1]));
780 inst->conditional_mod = BRW_CONDITIONAL_NZ;
781 break;
782
783 case ir_unop_f2b:
784 if (brw->gen >= 6) {
785 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
786 } else {
787 inst = emit(MOV(dst_null_f(), op[0]));
788 inst->conditional_mod = BRW_CONDITIONAL_NZ;
789 }
790 break;
791
792 case ir_unop_i2b:
793 if (brw->gen >= 6) {
794 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
795 } else {
796 inst = emit(MOV(dst_null_d(), op[0]));
797 inst->conditional_mod = BRW_CONDITIONAL_NZ;
798 }
799 break;
800
801 case ir_binop_all_equal:
802 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
803 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
804 break;
805
806 case ir_binop_any_nequal:
807 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
808 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
809 break;
810
811 case ir_unop_any:
812 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
813 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
814 break;
815
816 case ir_binop_greater:
817 case ir_binop_gequal:
818 case ir_binop_less:
819 case ir_binop_lequal:
820 case ir_binop_equal:
821 case ir_binop_nequal:
822 emit(CMP(dst_null_d(), op[0], op[1],
823 brw_conditional_for_comparison(expr->operation)));
824 break;
825
826 default:
827 assert(!"not reached");
828 break;
829 }
830 return;
831 }
832
833 ir->accept(this);
834
835 resolve_ud_negate(&this->result);
836
837 if (brw->gen >= 6) {
838 vec4_instruction *inst = emit(AND(dst_null_d(),
839 this->result, src_reg(1)));
840 inst->conditional_mod = BRW_CONDITIONAL_NZ;
841 } else {
842 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
843 inst->conditional_mod = BRW_CONDITIONAL_NZ;
844 }
845 }
846
847 /**
848 * Emit a gen6 IF statement with the comparison folded into the IF
849 * instruction.
850 */
851 void
852 vec4_visitor::emit_if_gen6(ir_if *ir)
853 {
854 ir_expression *expr = ir->condition->as_expression();
855
856 if (expr) {
857 src_reg op[2];
858 dst_reg temp;
859
860 assert(expr->get_num_operands() <= 2);
861 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
862 expr->operands[i]->accept(this);
863 op[i] = this->result;
864 }
865
866 switch (expr->operation) {
867 case ir_unop_logic_not:
868 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
869 return;
870
871 case ir_binop_logic_xor:
872 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
873 return;
874
875 case ir_binop_logic_or:
876 temp = dst_reg(this, glsl_type::bool_type);
877 emit(OR(temp, op[0], op[1]));
878 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
879 return;
880
881 case ir_binop_logic_and:
882 temp = dst_reg(this, glsl_type::bool_type);
883 emit(AND(temp, op[0], op[1]));
884 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
885 return;
886
887 case ir_unop_f2b:
888 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
889 return;
890
891 case ir_unop_i2b:
892 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
893 return;
894
895 case ir_binop_greater:
896 case ir_binop_gequal:
897 case ir_binop_less:
898 case ir_binop_lequal:
899 case ir_binop_equal:
900 case ir_binop_nequal:
901 emit(IF(op[0], op[1],
902 brw_conditional_for_comparison(expr->operation)));
903 return;
904
905 case ir_binop_all_equal:
906 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
907 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
908 return;
909
910 case ir_binop_any_nequal:
911 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
912 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
913 return;
914
915 case ir_unop_any:
916 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
917 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
918 return;
919
920 default:
921 assert(!"not reached");
922 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
923 return;
924 }
925 return;
926 }
927
928 ir->condition->accept(this);
929
930 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
931 }
932
933 void
934 vec4_visitor::visit(ir_variable *ir)
935 {
936 dst_reg *reg = NULL;
937
938 if (variable_storage(ir))
939 return;
940
941 switch (ir->data.mode) {
942 case ir_var_shader_in:
943 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
944 break;
945
946 case ir_var_shader_out:
947 reg = new(mem_ctx) dst_reg(this, ir->type);
948
949 for (int i = 0; i < type_size(ir->type); i++) {
950 output_reg[ir->data.location + i] = *reg;
951 output_reg[ir->data.location + i].reg_offset = i;
952 output_reg[ir->data.location + i].type =
953 brw_type_for_base_type(ir->type->get_scalar_type());
954 output_reg_annotation[ir->data.location + i] = ir->name;
955 }
956 break;
957
958 case ir_var_auto:
959 case ir_var_temporary:
960 reg = new(mem_ctx) dst_reg(this, ir->type);
961 break;
962
963 case ir_var_uniform:
964 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
965
966 /* Thanks to the lower_ubo_reference pass, we will see only
967 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
968 * variables, so no need for them to be in variable_ht.
969 *
970 * Atomic counters take no uniform storage, no need to do
971 * anything here.
972 */
973 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
974 return;
975
976 /* Track how big the whole uniform variable is, in case we need to put a
977 * copy of its data into pull constants for array access.
978 */
979 this->uniform_size[this->uniforms] = type_size(ir->type);
980
981 if (!strncmp(ir->name, "gl_", 3)) {
982 setup_builtin_uniform_values(ir);
983 } else {
984 setup_uniform_values(ir);
985 }
986 break;
987
988 case ir_var_system_value:
989 reg = make_reg_for_system_value(ir);
990 break;
991
992 default:
993 assert(!"not reached");
994 }
995
996 reg->type = brw_type_for_base_type(ir->type);
997 hash_table_insert(this->variable_ht, reg, ir);
998 }
999
1000 void
1001 vec4_visitor::visit(ir_loop *ir)
1002 {
1003 /* We don't want debugging output to print the whole body of the
1004 * loop as the annotation.
1005 */
1006 this->base_ir = NULL;
1007
1008 emit(BRW_OPCODE_DO);
1009
1010 visit_instructions(&ir->body_instructions);
1011
1012 emit(BRW_OPCODE_WHILE);
1013 }
1014
1015 void
1016 vec4_visitor::visit(ir_loop_jump *ir)
1017 {
1018 switch (ir->mode) {
1019 case ir_loop_jump::jump_break:
1020 emit(BRW_OPCODE_BREAK);
1021 break;
1022 case ir_loop_jump::jump_continue:
1023 emit(BRW_OPCODE_CONTINUE);
1024 break;
1025 }
1026 }
1027
1028
1029 void
1030 vec4_visitor::visit(ir_function_signature *ir)
1031 {
1032 assert(0);
1033 (void)ir;
1034 }
1035
1036 void
1037 vec4_visitor::visit(ir_function *ir)
1038 {
1039 /* Ignore function bodies other than main() -- we shouldn't see calls to
1040 * them since they should all be inlined.
1041 */
1042 if (strcmp(ir->name, "main") == 0) {
1043 const ir_function_signature *sig;
1044 exec_list empty;
1045
1046 sig = ir->matching_signature(NULL, &empty);
1047
1048 assert(sig);
1049
1050 visit_instructions(&sig->body);
1051 }
1052 }
1053
1054 bool
1055 vec4_visitor::try_emit_sat(ir_expression *ir)
1056 {
1057 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1058 if (!sat_src)
1059 return false;
1060
1061 sat_src->accept(this);
1062 src_reg src = this->result;
1063
1064 this->result = src_reg(this, ir->type);
1065 vec4_instruction *inst;
1066 inst = emit(MOV(dst_reg(this->result), src));
1067 inst->saturate = true;
1068
1069 return true;
1070 }
1071
1072 bool
1073 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1074 {
1075 /* 3-src instructions were introduced in gen6. */
1076 if (brw->gen < 6)
1077 return false;
1078
1079 /* MAD can only handle floating-point data. */
1080 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1081 return false;
1082
1083 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1084 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1085
1086 if (!mul || mul->operation != ir_binop_mul)
1087 return false;
1088
1089 nonmul->accept(this);
1090 src_reg src0 = fix_3src_operand(this->result);
1091
1092 mul->operands[0]->accept(this);
1093 src_reg src1 = fix_3src_operand(this->result);
1094
1095 mul->operands[1]->accept(this);
1096 src_reg src2 = fix_3src_operand(this->result);
1097
1098 this->result = src_reg(this, ir->type);
1099 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1100
1101 return true;
1102 }
1103
1104 void
1105 vec4_visitor::emit_bool_comparison(unsigned int op,
1106 dst_reg dst, src_reg src0, src_reg src1)
1107 {
1108 /* original gen4 does destination conversion before comparison. */
1109 if (brw->gen < 5)
1110 dst.type = src0.type;
1111
1112 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1113
1114 dst.type = BRW_REGISTER_TYPE_D;
1115 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1116 }
1117
1118 void
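/**
 * Emit a MIN or MAX (selected by conditionalmod): a single conditional SEL
 * on gen6+, or a CMP followed by a predicated SEL on earlier generations.
 */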
1119 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1120 src_reg src0, src_reg src1)
1121 {
1122 vec4_instruction *inst;
1123
1124 if (brw->gen >= 6) {
1125 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1126 inst->conditional_mod = conditionalmod;
1127 } else {
1128 emit(CMP(dst, src0, src1, conditionalmod));
1129
1130 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1131 inst->predicate = BRW_PREDICATE_NORMAL;
1132 }
1133 }
1134
1135 void
1136 vec4_visitor::emit_lrp(const dst_reg &dst,
1137 const src_reg &x, const src_reg &y, const src_reg &a)
1138 {
1139 if (brw->gen >= 6) {
1140 /* Note that the instruction's argument order is reversed from GLSL
1141 * and the IR.
1142 */
1143 emit(LRP(dst,
1144 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1145 } else {
1146 /* Earlier generations don't support three source operations, so we
1147 * need to emit x*(1-a) + y*a.
1148 *
1149 * A better way to do this would be:
1150 * ADD one_minus_a, negate(a), 1.0f
1151 * MUL null, y, a
1152 * MAC dst, x, one_minus_a
1153 * but we would need to support MAC and implicit accumulator.
1154 */
1155 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1156 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1157 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1158 y_times_a.writemask = dst.writemask;
1159 one_minus_a.writemask = dst.writemask;
1160 x_times_one_minus_a.writemask = dst.writemask;
1161
1162 emit(MUL(y_times_a, y, a));
1163 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1164 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1165 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1166 }
1167 }
1168
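/**
 * Returns true if the rvalue is a signed or unsigned integer constant whose
 * value fits in 16 bits. Used below to decide whether an integer multiply
 * can be done with a single MUL instead of the MUL/MACH/MOV sequence.
 */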
1169 static bool
1170 is_16bit_constant(ir_rvalue *rvalue)
1171 {
1172 ir_constant *constant = rvalue->as_constant();
1173 if (!constant)
1174 return false;
1175
1176 if (constant->type != glsl_type::int_type &&
1177 constant->type != glsl_type::uint_type)
1178 return false;
1179
1180 return constant->value.u[0] < (1 << 16);
1181 }
1182
1183 void
1184 vec4_visitor::visit(ir_expression *ir)
1185 {
1186 unsigned int operand;
1187 src_reg op[Elements(ir->operands)];
1188 src_reg result_src;
1189 dst_reg result_dst;
1190 vec4_instruction *inst;
1191
1192 if (try_emit_sat(ir))
1193 return;
1194
1195 if (ir->operation == ir_binop_add) {
1196 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1197 return;
1198 }
1199
1200 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1201 this->result.file = BAD_FILE;
1202 ir->operands[operand]->accept(this);
1203 if (this->result.file == BAD_FILE) {
1204 fprintf(stderr, "Failed to get tree for expression operand:\n");
1205 ir->operands[operand]->fprint(stderr);
1206 exit(1);
1207 }
1208 op[operand] = this->result;
1209
1210 /* Matrix expression operands should have been broken down to vector
1211 * operations already.
1212 */
1213 assert(!ir->operands[operand]->type->is_matrix());
1214 }
1215
1216 int vector_elements = ir->operands[0]->type->vector_elements;
1217 if (ir->operands[1]) {
1218 vector_elements = MAX2(vector_elements,
1219 ir->operands[1]->type->vector_elements);
1220 }
1221
1222 this->result.file = BAD_FILE;
1223
1224 /* Storage for our result. Ideally for an assignment we'd be using
1225 * the actual storage for the result here, instead.
1226 */
1227 result_src = src_reg(this, ir->type);
1228 /* convenience for the emit functions below. */
1229 result_dst = dst_reg(result_src);
1230 /* If nothing special happens, this is the result. */
1231 this->result = result_src;
1232 /* Limit writes to the channels that will be used by result_src later.
1233 * This does limit this temp's use as a temporary for multi-instruction
1234 * sequences.
1235 */
1236 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1237
1238 switch (ir->operation) {
1239 case ir_unop_logic_not:
1240 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1241 * the one's complement of the whole register, not just bit 0.
1242 */
1243 emit(XOR(result_dst, op[0], src_reg(1)));
1244 break;
1245 case ir_unop_neg:
1246 op[0].negate = !op[0].negate;
1247 emit(MOV(result_dst, op[0]));
1248 break;
1249 case ir_unop_abs:
1250 op[0].abs = true;
1251 op[0].negate = false;
1252 emit(MOV(result_dst, op[0]));
1253 break;
1254
1255 case ir_unop_sign:
1256 if (ir->type->is_float()) {
1257 /* AND(val, 0x80000000) gives the sign bit.
1258 *
1259 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1260 * zero.
1261 */
1262 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1263
1264 op[0].type = BRW_REGISTER_TYPE_UD;
1265 result_dst.type = BRW_REGISTER_TYPE_UD;
1266 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1267
1268 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1269 inst->predicate = BRW_PREDICATE_NORMAL;
1270
1271 this->result.type = BRW_REGISTER_TYPE_F;
1272 } else {
1273 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1274 * -> non-negative val generates 0x00000000.
1275 * Predicated OR sets 1 if val is positive.
1276 */
1277 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1278
1279 emit(ASR(result_dst, op[0], src_reg(31)));
1280
1281 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1282 inst->predicate = BRW_PREDICATE_NORMAL;
1283 }
1284 break;
1285
1286 case ir_unop_rcp:
1287 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1288 break;
1289
1290 case ir_unop_exp2:
1291 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1292 break;
1293 case ir_unop_log2:
1294 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1295 break;
1296 case ir_unop_exp:
1297 case ir_unop_log:
1298 assert(!"not reached: should be handled by ir_explog_to_explog2");
1299 break;
1300 case ir_unop_sin:
1301 case ir_unop_sin_reduced:
1302 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1303 break;
1304 case ir_unop_cos:
1305 case ir_unop_cos_reduced:
1306 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1307 break;
1308
1309 case ir_unop_dFdx:
1310 case ir_unop_dFdy:
1311 assert(!"derivatives not valid in vertex shader");
1312 break;
1313
1314 case ir_unop_bitfield_reverse:
1315 emit(BFREV(result_dst, op[0]));
1316 break;
1317 case ir_unop_bit_count:
1318 emit(CBIT(result_dst, op[0]));
1319 break;
1320 case ir_unop_find_msb: {
1321 src_reg temp = src_reg(this, glsl_type::uint_type);
1322
1323 inst = emit(FBH(dst_reg(temp), op[0]));
1324 inst->dst.writemask = WRITEMASK_XYZW;
1325
1326 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1327 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1328 * subtract the result from 31 to convert the MSB count into an LSB count.
1329 */
1330
1331 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1332 temp.swizzle = BRW_SWIZZLE_NOOP;
1333 emit(MOV(result_dst, temp));
1334
1335 src_reg src_tmp = src_reg(result_dst);
1336 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1337
1338 src_tmp.negate = true;
1339 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1340 inst->predicate = BRW_PREDICATE_NORMAL;
1341 break;
1342 }
1343 case ir_unop_find_lsb:
1344 emit(FBL(result_dst, op[0]));
1345 break;
1346
1347 case ir_unop_noise:
1348 assert(!"not reached: should be handled by lower_noise");
1349 break;
1350
1351 case ir_binop_add:
1352 emit(ADD(result_dst, op[0], op[1]));
1353 break;
1354 case ir_binop_sub:
1355 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1356 break;
1357
1358 case ir_binop_mul:
1359 if (brw->gen < 8 && ir->type->is_integer()) {
1360 /* For integer multiplication, the MUL uses the low 16 bits of one of
1361 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1362 * accumulates the contribution of the upper 16 bits of that
1363 * operand. If we can determine that one of the args is in the low
1364 * 16 bits, though, we can just emit a single MUL.
1365 */
1366 if (is_16bit_constant(ir->operands[0])) {
1367 if (brw->gen < 7)
1368 emit(MUL(result_dst, op[0], op[1]));
1369 else
1370 emit(MUL(result_dst, op[1], op[0]));
1371 } else if (is_16bit_constant(ir->operands[1])) {
1372 if (brw->gen < 7)
1373 emit(MUL(result_dst, op[1], op[0]));
1374 else
1375 emit(MUL(result_dst, op[0], op[1]));
1376 } else {
1377 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1378
1379 emit(MUL(acc, op[0], op[1]));
1380 emit(MACH(dst_null_d(), op[0], op[1]));
1381 emit(MOV(result_dst, src_reg(acc)));
1382 }
1383 } else {
1384 emit(MUL(result_dst, op[0], op[1]));
1385 }
1386 break;
1387 case ir_binop_imul_high: {
1388 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1389
1390 emit(MUL(acc, op[0], op[1]));
1391 emit(MACH(result_dst, op[0], op[1]));
1392 break;
1393 }
1394 case ir_binop_div:
1395 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1396 assert(ir->type->is_integer());
1397 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1398 break;
1399 case ir_binop_carry: {
1400 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1401
1402 emit(ADDC(dst_null_ud(), op[0], op[1]));
1403 emit(MOV(result_dst, src_reg(acc)));
1404 break;
1405 }
1406 case ir_binop_borrow: {
1407 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1408
1409 emit(SUBB(dst_null_ud(), op[0], op[1]));
1410 emit(MOV(result_dst, src_reg(acc)));
1411 break;
1412 }
1413 case ir_binop_mod:
1414 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1415 assert(ir->type->is_integer());
1416 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1417 break;
1418
1419 case ir_binop_less:
1420 case ir_binop_greater:
1421 case ir_binop_lequal:
1422 case ir_binop_gequal:
1423 case ir_binop_equal:
1424 case ir_binop_nequal: {
1425 emit(CMP(result_dst, op[0], op[1],
1426 brw_conditional_for_comparison(ir->operation)));
1427 emit(AND(result_dst, result_src, src_reg(0x1)));
1428 break;
1429 }
1430
1431 case ir_binop_all_equal:
1432 /* "==" operator producing a scalar boolean. */
1433 if (ir->operands[0]->type->is_vector() ||
1434 ir->operands[1]->type->is_vector()) {
1435 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1436 emit(MOV(result_dst, src_reg(0)));
1437 inst = emit(MOV(result_dst, src_reg(1)));
1438 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1439 } else {
1440 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1441 emit(AND(result_dst, result_src, src_reg(0x1)));
1442 }
1443 break;
1444 case ir_binop_any_nequal:
1445 /* "!=" operator producing a scalar boolean. */
1446 if (ir->operands[0]->type->is_vector() ||
1447 ir->operands[1]->type->is_vector()) {
1448 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1449
1450 emit(MOV(result_dst, src_reg(0)));
1451 inst = emit(MOV(result_dst, src_reg(1)));
1452 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1453 } else {
1454 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1455 emit(AND(result_dst, result_src, src_reg(0x1)));
1456 }
1457 break;
1458
1459 case ir_unop_any:
1460 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1461 emit(MOV(result_dst, src_reg(0)));
1462
1463 inst = emit(MOV(result_dst, src_reg(1)));
1464 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1465 break;
1466
1467 case ir_binop_logic_xor:
1468 emit(XOR(result_dst, op[0], op[1]));
1469 break;
1470
1471 case ir_binop_logic_or:
1472 emit(OR(result_dst, op[0], op[1]));
1473 break;
1474
1475 case ir_binop_logic_and:
1476 emit(AND(result_dst, op[0], op[1]));
1477 break;
1478
1479 case ir_binop_dot:
1480 assert(ir->operands[0]->type->is_vector());
1481 assert(ir->operands[0]->type == ir->operands[1]->type);
1482 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1483 break;
1484
1485 case ir_unop_sqrt:
1486 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1487 break;
1488 case ir_unop_rsq:
1489 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1490 break;
1491
1492 case ir_unop_bitcast_i2f:
1493 case ir_unop_bitcast_u2f:
1494 this->result = op[0];
1495 this->result.type = BRW_REGISTER_TYPE_F;
1496 break;
1497
1498 case ir_unop_bitcast_f2i:
1499 this->result = op[0];
1500 this->result.type = BRW_REGISTER_TYPE_D;
1501 break;
1502
1503 case ir_unop_bitcast_f2u:
1504 this->result = op[0];
1505 this->result.type = BRW_REGISTER_TYPE_UD;
1506 break;
1507
1508 case ir_unop_i2f:
1509 case ir_unop_i2u:
1510 case ir_unop_u2i:
1511 case ir_unop_u2f:
1512 case ir_unop_b2f:
1513 case ir_unop_b2i:
1514 case ir_unop_f2i:
1515 case ir_unop_f2u:
1516 emit(MOV(result_dst, op[0]));
1517 break;
1518 case ir_unop_f2b:
1519 case ir_unop_i2b: {
1520 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1521 emit(AND(result_dst, result_src, src_reg(1)));
1522 break;
1523 }
1524
1525 case ir_unop_trunc:
1526 emit(RNDZ(result_dst, op[0]));
1527 break;
1528 case ir_unop_ceil:
1529 op[0].negate = !op[0].negate;
1530 inst = emit(RNDD(result_dst, op[0]));
1531 this->result.negate = true;
1532 break;
1533 case ir_unop_floor:
1534 inst = emit(RNDD(result_dst, op[0]));
1535 break;
1536 case ir_unop_fract:
1537 inst = emit(FRC(result_dst, op[0]));
1538 break;
1539 case ir_unop_round_even:
1540 emit(RNDE(result_dst, op[0]));
1541 break;
1542
1543 case ir_binop_min:
1544 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1545 break;
1546 case ir_binop_max:
1547 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1548 break;
1549
1550 case ir_binop_pow:
1551 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1552 break;
1553
1554 case ir_unop_bit_not:
1555 inst = emit(NOT(result_dst, op[0]));
1556 break;
1557 case ir_binop_bit_and:
1558 inst = emit(AND(result_dst, op[0], op[1]));
1559 break;
1560 case ir_binop_bit_xor:
1561 inst = emit(XOR(result_dst, op[0], op[1]));
1562 break;
1563 case ir_binop_bit_or:
1564 inst = emit(OR(result_dst, op[0], op[1]));
1565 break;
1566
1567 case ir_binop_lshift:
1568 inst = emit(SHL(result_dst, op[0], op[1]));
1569 break;
1570
1571 case ir_binop_rshift:
1572 if (ir->type->base_type == GLSL_TYPE_INT)
1573 inst = emit(ASR(result_dst, op[0], op[1]));
1574 else
1575 inst = emit(SHR(result_dst, op[0], op[1]));
1576 break;
1577
1578 case ir_binop_bfm:
1579 emit(BFI1(result_dst, op[0], op[1]));
1580 break;
1581
1582 case ir_binop_ubo_load: {
1583 ir_constant *uniform_block = ir->operands[0]->as_constant();
1584 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1585 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1586 src_reg offset;
1587
1588 /* Now, load the vector from that offset. */
1589 assert(ir->type->is_vector() || ir->type->is_scalar());
1590
1591 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1592 packed_consts.type = result.type;
1593 src_reg surf_index =
1594 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1595 if (const_offset_ir) {
1596 if (brw->gen >= 8) {
1597 /* Store the offset in a GRF so we can send-from-GRF. */
1598 offset = src_reg(this, glsl_type::int_type);
1599 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1600 } else {
1601 /* Immediates are fine on older generations since they'll be moved
1602 * to a (potentially fake) MRF at the generator level.
1603 */
1604 offset = src_reg(const_offset / 16);
1605 }
1606 } else {
1607 offset = src_reg(this, glsl_type::uint_type);
1608 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1609 }
1610
1611 if (brw->gen >= 7) {
1612 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1613 grf_offset.type = offset.type;
1614
1615 emit(MOV(grf_offset, offset));
1616
1617 emit(new(mem_ctx) vec4_instruction(this,
1618 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1619 dst_reg(packed_consts),
1620 surf_index,
1621 src_reg(grf_offset)));
1622 } else {
1623 vec4_instruction *pull =
1624 emit(new(mem_ctx) vec4_instruction(this,
1625 VS_OPCODE_PULL_CONSTANT_LOAD,
1626 dst_reg(packed_consts),
1627 surf_index,
1628 offset));
1629 pull->base_mrf = 14;
1630 pull->mlen = 1;
1631 }
1632
1633 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1634 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1635 const_offset % 16 / 4,
1636 const_offset % 16 / 4,
1637 const_offset % 16 / 4);
1638
1639 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1640 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1641 emit(CMP(result_dst, packed_consts, src_reg(0u),
1642 BRW_CONDITIONAL_NZ));
1643 emit(AND(result_dst, result, src_reg(0x1)));
1644 } else {
1645 emit(MOV(result_dst, packed_consts));
1646 }
1647 break;
1648 }
1649
1650 case ir_binop_vector_extract:
1651 assert(!"should have been lowered by vec_index_to_cond_assign");
1652 break;
1653
1654 case ir_triop_fma:
1655 op[0] = fix_3src_operand(op[0]);
1656 op[1] = fix_3src_operand(op[1]);
1657 op[2] = fix_3src_operand(op[2]);
1658 /* Note that the instruction's argument order is reversed from GLSL
1659 * and the IR.
1660 */
1661 emit(MAD(result_dst, op[2], op[1], op[0]));
1662 break;
1663
1664 case ir_triop_lrp:
1665 emit_lrp(result_dst, op[0], op[1], op[2]);
1666 break;
1667
1668 case ir_triop_csel:
1669 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1670 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1671 inst->predicate = BRW_PREDICATE_NORMAL;
1672 break;
1673
1674 case ir_triop_bfi:
1675 op[0] = fix_3src_operand(op[0]);
1676 op[1] = fix_3src_operand(op[1]);
1677 op[2] = fix_3src_operand(op[2]);
1678 emit(BFI2(result_dst, op[0], op[1], op[2]));
1679 break;
1680
1681 case ir_triop_bitfield_extract:
1682 op[0] = fix_3src_operand(op[0]);
1683 op[1] = fix_3src_operand(op[1]);
1684 op[2] = fix_3src_operand(op[2]);
1685 /* Note that the instruction's argument order is reversed from GLSL
1686 * and the IR.
1687 */
1688 emit(BFE(result_dst, op[2], op[1], op[0]));
1689 break;
1690
1691 case ir_triop_vector_insert:
1692 assert(!"should have been lowered by lower_vector_insert");
1693 break;
1694
1695 case ir_quadop_bitfield_insert:
1696 assert(!"not reached: should be handled by "
1697 "bitfield_insert_to_bfm_bfi\n");
1698 break;
1699
1700 case ir_quadop_vector:
1701 assert(!"not reached: should be handled by lower_quadop_vector");
1702 break;
1703
1704 case ir_unop_pack_half_2x16:
1705 emit_pack_half_2x16(result_dst, op[0]);
1706 break;
1707 case ir_unop_unpack_half_2x16:
1708 emit_unpack_half_2x16(result_dst, op[0]);
1709 break;
1710 case ir_unop_pack_snorm_2x16:
1711 case ir_unop_pack_snorm_4x8:
1712 case ir_unop_pack_unorm_2x16:
1713 case ir_unop_pack_unorm_4x8:
1714 case ir_unop_unpack_snorm_2x16:
1715 case ir_unop_unpack_snorm_4x8:
1716 case ir_unop_unpack_unorm_2x16:
1717 case ir_unop_unpack_unorm_4x8:
1718 assert(!"not reached: should be handled by lower_packing_builtins");
1719 break;
1720 case ir_unop_unpack_half_2x16_split_x:
1721 case ir_unop_unpack_half_2x16_split_y:
1722 case ir_binop_pack_half_2x16_split:
1723 assert(!"not reached: should not occur in vertex shader");
1724 break;
1725 case ir_binop_ldexp:
1726 assert(!"not reached: should be handled by ldexp_to_arith()");
1727 break;
1728 }
1729 }
1730
1731
1732 void
1733 vec4_visitor::visit(ir_swizzle *ir)
1734 {
1735 src_reg src;
1736 int i = 0;
1737 int swizzle[4];
1738
1739 /* Note that this is only swizzles in expressions, not those on the left
1740 * hand side of an assignment, which do write masking. See ir_assignment
1741 * for that.
1742 */
1743
1744 ir->val->accept(this);
1745 src = this->result;
1746 assert(src.file != BAD_FILE);
1747
1748 for (i = 0; i < ir->type->vector_elements; i++) {
1749 switch (i) {
1750 case 0:
1751 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1752 break;
1753 case 1:
1754 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1755 break;
1756 case 2:
1757 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1758 break;
1759 case 3:
1760 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1761 break;
1762 }
1763 }
1764 for (; i < 4; i++) {
1765 /* Replicate the last channel out. */
1766 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1767 }
1768
1769 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1770
1771 this->result = src;
1772 }
1773
1774 void
1775 vec4_visitor::visit(ir_dereference_variable *ir)
1776 {
1777 const struct glsl_type *type = ir->type;
1778 dst_reg *reg = variable_storage(ir->var);
1779
1780 if (!reg) {
1781 fail("Failed to find variable storage for %s\n", ir->var->name);
1782 this->result = src_reg(brw_null_reg());
1783 return;
1784 }
1785
1786 this->result = src_reg(*reg);
1787
1788 /* System values get their swizzle from the dst_reg writemask */
1789 if (ir->var->data.mode == ir_var_system_value)
1790 return;
1791
1792 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1793 this->result.swizzle = swizzle_for_size(type->vector_elements);
1794 }
1795
1796
1797 int
1798 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1799 {
1800 /* Under normal circumstances array elements are stored consecutively, so
1801 * the stride is equal to the size of the array element.
1802 */
1803 return type_size(ir->type);
1804 }
1805
1806
1807 void
1808 vec4_visitor::visit(ir_dereference_array *ir)
1809 {
1810 ir_constant *constant_index;
1811 src_reg src;
1812 int array_stride = compute_array_stride(ir);
1813
1814 constant_index = ir->array_index->constant_expression_value();
1815
1816 ir->array->accept(this);
1817 src = this->result;
1818
1819 if (constant_index) {
1820 src.reg_offset += constant_index->value.i[0] * array_stride;
1821 } else {
1822 /* Variable index array dereference. It eats the "vec4" of the
1823 * base of the array and an index that offsets the Mesa register
1824 * index.
1825 */
1826 ir->array_index->accept(this);
1827
1828 src_reg index_reg;
1829
1830 if (array_stride == 1) {
1831 index_reg = this->result;
1832 } else {
1833 index_reg = src_reg(this, glsl_type::int_type);
1834
1835 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1836 }
1837
1838 if (src.reladdr) {
1839 src_reg temp = src_reg(this, glsl_type::int_type);
1840
1841 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1842
1843 index_reg = temp;
1844 }
1845
1846 src.reladdr = ralloc(mem_ctx, src_reg);
1847 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1848 }
1849
1850 /* If the type is smaller than a vec4, replicate the last channel out. */
1851 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1852 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1853 else
1854 src.swizzle = BRW_SWIZZLE_NOOP;
1855 src.type = brw_type_for_base_type(ir->type);
1856
1857 this->result = src;
1858 }
1859
1860 void
1861 vec4_visitor::visit(ir_dereference_record *ir)
1862 {
1863 unsigned int i;
1864 const glsl_type *struct_type = ir->record->type;
1865 int offset = 0;
1866
1867 ir->record->accept(this);
1868
1869 for (i = 0; i < struct_type->length; i++) {
1870 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1871 break;
1872 offset += type_size(struct_type->fields.structure[i].type);
1873 }
1874
1875 /* If the type is smaller than a vec4, replicate the last channel out. */
1876 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1877 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1878 else
1879 this->result.swizzle = BRW_SWIZZLE_NOOP;
1880 this->result.type = brw_type_for_base_type(ir->type);
1881
1882 this->result.reg_offset += offset;
1883 }
1884
1885 /**
1886 * We want to be careful in assignment setup to hit the actual storage
1887 * instead of potentially using a temporary like we might with the
1888 * ir_dereference handler.
1889 */
1890 static dst_reg
1891 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1892 {
1893 /* The LHS must be a dereference. If the LHS is a variable indexed array
1894 * access of a vector, it must be separated into a series of conditional moves
1895 * before reaching this point (see ir_vec_index_to_cond_assign).
1896 */
1897 assert(ir->as_dereference());
1898 ir_dereference_array *deref_array = ir->as_dereference_array();
1899 if (deref_array) {
1900 assert(!deref_array->array->type->is_vector());
1901 }
1902
1903 /* Use the rvalue deref handler for the most part. We'll ignore
1904 * swizzles in it and write swizzles using writemask, though.
1905 */
1906 ir->accept(v);
1907 return dst_reg(v->result);
1908 }
1909
1910 void
1911 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1912 const struct glsl_type *type, uint32_t predicate)
1913 {
1914 if (type->base_type == GLSL_TYPE_STRUCT) {
1915 for (unsigned int i = 0; i < type->length; i++) {
1916 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1917 }
1918 return;
1919 }
1920
1921 if (type->is_array()) {
1922 for (unsigned int i = 0; i < type->length; i++) {
1923 emit_block_move(dst, src, type->fields.array, predicate);
1924 }
1925 return;
1926 }
1927
1928 if (type->is_matrix()) {
1929 const struct glsl_type *vec_type;
1930
1931 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1932 type->vector_elements, 1);
1933
1934 for (int i = 0; i < type->matrix_columns; i++) {
1935 emit_block_move(dst, src, vec_type, predicate);
1936 }
1937 return;
1938 }
1939
1940 assert(type->is_scalar() || type->is_vector());
1941
1942 dst->type = brw_type_for_base_type(type);
1943 src->type = dst->type;
1944
1945 dst->writemask = (1 << type->vector_elements) - 1;
1946
1947 src->swizzle = swizzle_for_size(type->vector_elements);
1948
1949 vec4_instruction *inst = emit(MOV(*dst, *src));
1950 inst->predicate = predicate;
1951
1952 dst->reg_offset++;
1953 src->reg_offset++;
1954 }
1955
1956
1957 /* If the RHS processing resulted in an instruction generating a
1958 * temporary value, and it would be easy to rewrite the instruction to
1959 * generate its result right into the LHS instead, do so. This ends
1960 * up reliably removing instructions where it can be tricky to do so
1961 * later without real UD chain information.
1962 */
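/* As a rough illustration (hypothetical registers, not from the original
 * source): if the RHS produced "ADD tmp, a, b" and the assignment would
 * otherwise emit "MOV dst, tmp", this rewrites the ADD to write dst
 * directly and the copy MOV is never emitted.
 */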
1963 bool
1964 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1965 dst_reg dst,
1966 src_reg src,
1967 vec4_instruction *pre_rhs_inst,
1968 vec4_instruction *last_rhs_inst)
1969 {
1970 /* This could be supported, but it would take more smarts. */
1971 if (ir->condition)
1972 return false;
1973
1974 if (pre_rhs_inst == last_rhs_inst)
1975 return false; /* No instructions generated to work with. */
1976
1977 /* Make sure the last instruction generated our source reg. */
1978 if (src.file != GRF ||
1979 src.file != last_rhs_inst->dst.file ||
1980 src.reg != last_rhs_inst->dst.reg ||
1981 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1982 src.reladdr ||
1983 src.abs ||
1984 src.negate ||
1985 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1986 return false;
1987
1988 /* Check that the last instruction fully initialized the channels
1989 * we want to use, in the order we want to use them. We could
1990 * potentially reswizzle the operands of many instructions so that
1991 * we could handle out of order channels, but don't yet.
1992 */
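/* For example (hypothetical values): with dst.writemask = .xy, a last
 * instruction that wrote .xyzw through an identity swizzle passes, while
 * a source swizzle of .yxzw would fail the per-channel check below.
 */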
1993
1994 for (unsigned i = 0; i < 4; i++) {
1995 if (dst.writemask & (1 << i)) {
1996 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1997 return false;
1998
1999 if (BRW_GET_SWZ(src.swizzle, i) != i)
2000 return false;
2001 }
2002 }
2003
2004 /* Success! Rewrite the instruction. */
2005 last_rhs_inst->dst.file = dst.file;
2006 last_rhs_inst->dst.reg = dst.reg;
2007 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2008 last_rhs_inst->dst.reladdr = dst.reladdr;
2009 last_rhs_inst->dst.writemask &= dst.writemask;
2010
2011 return true;
2012 }
2013
2014 void
2015 vec4_visitor::visit(ir_assignment *ir)
2016 {
2017 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2018 uint32_t predicate = BRW_PREDICATE_NONE;
2019
2020 if (!ir->lhs->type->is_scalar() &&
2021 !ir->lhs->type->is_vector()) {
2022 ir->rhs->accept(this);
2023 src_reg src = this->result;
2024
2025 if (ir->condition) {
2026 emit_bool_to_cond_code(ir->condition, &predicate);
2027 }
2028
2029 /* emit_block_move doesn't account for swizzles in the source register.
2030 * This should be ok, since the source register is a structure or an
2031 * array, and those can't be swizzled. But double-check to be sure.
2032 */
2033 assert(src.swizzle ==
2034 (ir->rhs->type->is_matrix()
2035 ? swizzle_for_size(ir->rhs->type->vector_elements)
2036 : BRW_SWIZZLE_NOOP));
2037
2038 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2039 return;
2040 }
2041
2042 /* Now we're down to just a scalar/vector with writemasks. */
2043 int i;
2044
2045 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2046 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2047
2048 ir->rhs->accept(this);
2049
2050 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2051
2052 src_reg src = this->result;
2053
2054 int swizzles[4];
2055 int first_enabled_chan = 0;
2056 int src_chan = 0;
2057
2058 assert(ir->lhs->type->is_vector() ||
2059 ir->lhs->type->is_scalar());
2060 dst.writemask = ir->write_mask;
2061
2062 for (int i = 0; i < 4; i++) {
2063 if (dst.writemask & (1 << i)) {
2064 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2065 break;
2066 }
2067 }
2068
2069 /* Swizzle a small RHS vector into the channels being written.
2070 *
2071 * GLSL IR treats write_mask as dictating how many channels are
2072 * present on the RHS, while in our instructions we need to make
2073 * those channels appear in the slots of the vec4 they're written to.
2074 */
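/* For example (a hypothetical case): a vec2 RHS arrives with swizzle
 * .xyyy from swizzle_for_size(2); with write_mask .yz this loop builds
 * swizzles[] = {y, x, y, y}, so dst.y reads src.x, dst.z reads src.y,
 * and the unwritten channels just repeat the first enabled one.
 */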
2075 for (int i = 0; i < 4; i++) {
2076 if (dst.writemask & (1 << i))
2077 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2078 else
2079 swizzles[i] = first_enabled_chan;
2080 }
2081 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2082 swizzles[2], swizzles[3]);
2083
2084 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2085 return;
2086 }
2087
2088 if (ir->condition) {
2089 emit_bool_to_cond_code(ir->condition, &predicate);
2090 }
2091
2092 for (i = 0; i < type_size(ir->lhs->type); i++) {
2093 vec4_instruction *inst = emit(MOV(dst, src));
2094 inst->predicate = predicate;
2095
2096 dst.reg_offset++;
2097 src.reg_offset++;
2098 }
2099 }
2100
2101 void
2102 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2103 {
2104 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2105 foreach_list(node, &ir->components) {
2106 ir_constant *field_value = (ir_constant *)node;
2107
2108 emit_constant_values(dst, field_value);
2109 }
2110 return;
2111 }
2112
2113 if (ir->type->is_array()) {
2114 for (unsigned int i = 0; i < ir->type->length; i++) {
2115 emit_constant_values(dst, ir->array_elements[i]);
2116 }
2117 return;
2118 }
2119
2120 if (ir->type->is_matrix()) {
2121 for (int i = 0; i < ir->type->matrix_columns; i++) {
2122 float *vec = &ir->value.f[i * ir->type->vector_elements];
2123
2124 for (int j = 0; j < ir->type->vector_elements; j++) {
2125 dst->writemask = 1 << j;
2126 dst->type = BRW_REGISTER_TYPE_F;
2127
2128 emit(MOV(*dst, src_reg(vec[j])));
2129 }
2130 dst->reg_offset++;
2131 }
2132 return;
2133 }
2134
2135 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2136
2137 for (int i = 0; i < ir->type->vector_elements; i++) {
2138 if (!(remaining_writemask & (1 << i)))
2139 continue;
2140
2141 dst->writemask = 1 << i;
2142 dst->type = brw_type_for_base_type(ir->type);
2143
2144 /* Find other components that match the one we're about to
2145 * write. Emits fewer instructions for things like vec4(0.5,
2146 * 1.5, 1.5, 1.5).
2147 */
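/* For the vec4(0.5, 1.5, 1.5, 1.5) example above this ends up emitting
 * two MOVs: one writing .x with 0.5 and one writing .yzw with 1.5,
 * rather than four single-channel writes.
 */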
2148 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2149 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2150 if (ir->value.b[i] == ir->value.b[j])
2151 dst->writemask |= (1 << j);
2152 } else {
2153 /* u, i, and f storage all line up, so no need for a
2154 * switch case for comparing each type.
2155 */
2156 if (ir->value.u[i] == ir->value.u[j])
2157 dst->writemask |= (1 << j);
2158 }
2159 }
2160
2161 switch (ir->type->base_type) {
2162 case GLSL_TYPE_FLOAT:
2163 emit(MOV(*dst, src_reg(ir->value.f[i])));
2164 break;
2165 case GLSL_TYPE_INT:
2166 emit(MOV(*dst, src_reg(ir->value.i[i])));
2167 break;
2168 case GLSL_TYPE_UINT:
2169 emit(MOV(*dst, src_reg(ir->value.u[i])));
2170 break;
2171 case GLSL_TYPE_BOOL:
2172 emit(MOV(*dst, src_reg(ir->value.b[i])));
2173 break;
2174 default:
2175 assert(!"Non-float/uint/int/bool constant");
2176 break;
2177 }
2178
2179 remaining_writemask &= ~dst->writemask;
2180 }
2181 dst->reg_offset++;
2182 }
2183
2184 void
2185 vec4_visitor::visit(ir_constant *ir)
2186 {
2187 dst_reg dst = dst_reg(this, ir->type);
2188 this->result = src_reg(dst);
2189
2190 emit_constant_values(&dst, ir);
2191 }
2192
2193 void
2194 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2195 {
2196 ir_dereference *deref = static_cast<ir_dereference *>(
2197 ir->actual_parameters.get_head());
2198 ir_variable *location = deref->variable_referenced();
2199 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2200 location->data.atomic.buffer_index);
2201
2202 /* Calculate the surface offset */
2203 src_reg offset(this, glsl_type::uint_type);
2204 ir_dereference_array *deref_array = deref->as_dereference_array();
2205 if (deref_array) {
2206 deref_array->array_index->accept(this);
2207
2208 src_reg tmp(this, glsl_type::uint_type);
2209 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2210 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2211 } else {
2212 offset = location->data.atomic.offset;
2213 }
2214
2215 /* Emit the appropriate machine instruction */
2216 const char *callee = ir->callee->function_name();
2217 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2218
2219 if (!strcmp("__intrinsic_atomic_read", callee)) {
2220 emit_untyped_surface_read(surf_index, dst, offset);
2221
2222 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2223 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2224 src_reg(), src_reg());
2225
2226 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2227 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2228 src_reg(), src_reg());
2229 }
2230 }
2231
2232 void
2233 vec4_visitor::visit(ir_call *ir)
2234 {
2235 const char *callee = ir->callee->function_name();
2236
2237 if (!strcmp("__intrinsic_atomic_read", callee) ||
2238 !strcmp("__intrinsic_atomic_increment", callee) ||
2239 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2240 visit_atomic_counter_intrinsic(ir);
2241 } else {
2242 assert(!"Unsupported intrinsic.");
2243 }
2244 }
2245
2246 src_reg
2247 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2248 {
2249 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2250 inst->base_mrf = 2;
2251 inst->mlen = 1;
2252 inst->sampler = sampler;
2253 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2254 inst->dst.writemask = WRITEMASK_XYZW;
2255
2256 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2257 int param_base = inst->base_mrf;
2258 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2259 int zero_mask = 0xf & ~coord_mask;
2260
2261 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2262 coordinate));
2263
2264 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2265 src_reg(0)));
2266
2267 emit(inst);
2268 return src_reg(inst->dst);
2269 }
2270
2271 void
2272 vec4_visitor::visit(ir_texture *ir)
2273 {
2274 int sampler =
2275 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2276
2277 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2278 * emitting anything other than setting up the constant result.
2279 */
2280 if (ir->op == ir_tg4) {
2281 ir_constant *chan = ir->lod_info.component->as_constant();
2282 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2283 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2284 dst_reg result(this, ir->type);
2285 this->result = src_reg(result);
2286 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2287 return;
2288 }
2289 }
2290
2291 /* Should be lowered by do_lower_texture_projection */
2292 assert(!ir->projector);
2293
2294 /* Should be lowered */
2295 assert(!ir->offset || !ir->offset->type->is_array());
2296
2297 /* Generate code to compute all the subexpression trees. This has to be
2298 * done before loading any values into MRFs for the sampler message since
2299 * generating these values may involve SEND messages that need the MRFs.
2300 */
2301 src_reg coordinate;
2302 if (ir->coordinate) {
2303 ir->coordinate->accept(this);
2304 coordinate = this->result;
2305 }
2306
2307 src_reg shadow_comparitor;
2308 if (ir->shadow_comparitor) {
2309 ir->shadow_comparitor->accept(this);
2310 shadow_comparitor = this->result;
2311 }
2312
2313 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2314 src_reg offset_value;
2315 if (has_nonconstant_offset) {
2316 ir->offset->accept(this);
2317 offset_value = src_reg(this->result);
2318 }
2319
2320 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2321 src_reg lod, dPdx, dPdy, sample_index, mcs;
2322 switch (ir->op) {
2323 case ir_tex:
2324 lod = src_reg(0.0f);
2325 lod_type = glsl_type::float_type;
2326 break;
2327 case ir_txf:
2328 case ir_txl:
2329 case ir_txs:
2330 ir->lod_info.lod->accept(this);
2331 lod = this->result;
2332 lod_type = ir->lod_info.lod->type;
2333 break;
2334 case ir_query_levels:
2335 lod = src_reg(0);
2336 lod_type = glsl_type::int_type;
2337 break;
2338 case ir_txf_ms:
2339 ir->lod_info.sample_index->accept(this);
2340 sample_index = this->result;
2341 sample_index_type = ir->lod_info.sample_index->type;
2342
2343 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2344 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2345 else
2346 mcs = src_reg(0u);
2347 break;
2348 case ir_txd:
2349 ir->lod_info.grad.dPdx->accept(this);
2350 dPdx = this->result;
2351
2352 ir->lod_info.grad.dPdy->accept(this);
2353 dPdy = this->result;
2354
2355 lod_type = ir->lod_info.grad.dPdx->type;
2356 break;
2357 case ir_txb:
2358 case ir_lod:
2359 case ir_tg4:
2360 break;
2361 }
2362
2363 vec4_instruction *inst = NULL;
2364 switch (ir->op) {
2365 case ir_tex:
2366 case ir_txl:
2367 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2368 break;
2369 case ir_txd:
2370 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2371 break;
2372 case ir_txf:
2373 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2374 break;
2375 case ir_txf_ms:
2376 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2377 break;
2378 case ir_txs:
2379 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2380 break;
2381 case ir_tg4:
2382 if (has_nonconstant_offset)
2383 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2384 else
2385 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2386 break;
2387 case ir_query_levels:
2388 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2389 break;
2390 case ir_txb:
2391 assert(!"TXB is not valid for vertex shaders.");
2392 break;
2393 case ir_lod:
2394 assert(!"LOD is not valid for vertex shaders.");
2395 break;
2396 default:
2397 assert(!"Unrecognized tex op");
2398 }
2399
2400 if (ir->offset != NULL && ir->op != ir_txf)
2401 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2402
2403 /* Stuff the channel select bits in the top of the texture offset */
2404 if (ir->op == ir_tg4)
2405 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2406
2407 /* The message header is necessary for:
2408 * - Gen4 (always)
2409 * - Texel offsets
2410 * - Gather channel selection
2411 * - Sampler indices too large to fit in a 4-bit value.
2412 */
2413 inst->header_present =
2414 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2415 sampler >= 16;
2416 inst->base_mrf = 2;
2417 inst->mlen = inst->header_present + 1; /* always at least one */
2418 inst->sampler = sampler;
2419 inst->dst = dst_reg(this, ir->type);
2420 inst->dst.writemask = WRITEMASK_XYZW;
2421 inst->shadow_compare = ir->shadow_comparitor != NULL;
2422
2423 /* MRF for the first parameter */
2424 int param_base = inst->base_mrf + inst->header_present;
2425
2426 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2427 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2428 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2429 } else {
2430 /* Load the coordinate */
2431 /* FINISHME: gl_clamp_mask and saturate */
2432 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2433 int zero_mask = 0xf & ~coord_mask;
2434
2435 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2436 coordinate));
2437
2438 if (zero_mask != 0) {
2439 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2440 src_reg(0)));
2441 }
2442 /* Load the shadow comparitor */
2443 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2444 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2445 WRITEMASK_X),
2446 shadow_comparitor));
2447 inst->mlen++;
2448 }
2449
2450 /* Load the LOD info */
2451 if (ir->op == ir_tex || ir->op == ir_txl) {
2452 int mrf, writemask;
2453 if (brw->gen >= 5) {
2454 mrf = param_base + 1;
2455 if (ir->shadow_comparitor) {
2456 writemask = WRITEMASK_Y;
2457 /* mlen already incremented */
2458 } else {
2459 writemask = WRITEMASK_X;
2460 inst->mlen++;
2461 }
2462 } else /* brw->gen == 4 */ {
2463 mrf = param_base;
2464 writemask = WRITEMASK_W;
2465 }
2466 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2467 } else if (ir->op == ir_txf) {
2468 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2469 } else if (ir->op == ir_txf_ms) {
2470 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2471 sample_index));
2472 if (brw->gen >= 7)
2473 /* MCS data is in the first channel of `mcs`, but we need to get it into
2474 * the .y channel of the second vec4 of params, so replicate .x across
2475 * the whole vec4 and then mask off everything except .y
2476 */
2477 mcs.swizzle = BRW_SWIZZLE_XXXX;
2478 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2479 mcs));
2480 inst->mlen++;
2481 } else if (ir->op == ir_txd) {
2482 const glsl_type *type = lod_type;
2483
2484 if (brw->gen >= 5) {
2485 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2486 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2487 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2488 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2489 inst->mlen++;
2490
2491 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2492 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2493 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2494 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2495 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2496 inst->mlen++;
2497
2498 if (ir->shadow_comparitor) {
2499 emit(MOV(dst_reg(MRF, param_base + 2,
2500 ir->shadow_comparitor->type, WRITEMASK_Z),
2501 shadow_comparitor));
2502 }
2503 }
2504 } else /* brw->gen == 4 */ {
2505 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2506 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2507 inst->mlen += 2;
2508 }
2509 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2510 if (ir->shadow_comparitor) {
2511 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2512 shadow_comparitor));
2513 }
2514
2515 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2516 offset_value));
2517 inst->mlen++;
2518 }
2519 }
2520
2521 emit(inst);
2522
2523 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2524 * faces * layers, but the spec requires just layers.
2525 */
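/* Hypothetical example: a 6-layer cube map array reports 36 in .z
 * (faces * layers); the INT_QUOTIENT by 6 below brings that back to
 * the 6 layers the spec requires.
 */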
2526 if (ir->op == ir_txs) {
2527 glsl_type const *type = ir->sampler->type;
2528 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2529 type->sampler_array) {
2530 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2531 writemask(inst->dst, WRITEMASK_Z),
2532 src_reg(inst->dst), src_reg(6));
2533 }
2534 }
2535
2536 if (brw->gen == 6 && ir->op == ir_tg4) {
2537 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2538 }
2539
2540 swizzle_result(ir, src_reg(inst->dst), sampler);
2541 }
2542
2543 /**
2544 * Apply workarounds for Gen6 gather with UINT/SINT
2545 */
2546 void
2547 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2548 {
2549 if (!wa)
2550 return;
2551
2552 int width = (wa & WA_8BIT) ? 8 : 16;
2553 dst_reg dst_f = dst;
2554 dst_f.type = BRW_REGISTER_TYPE_F;
2555
2556 /* Convert from UNORM to UINT */
2557 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2558 emit(MOV(dst, src_reg(dst_f)));
2559
2560 if (wa & WA_SIGN) {
2561 /* Reinterpret the UINT value as a signed INT value by
2562 * shifting the sign bit into place, then shifting back
2563 * preserving sign.
2564 */
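/* E.g. for 8-bit gather data (width == 8) this is a SHL by 24 followed
 * by an ASR by 24, i.e. a sign extension from 8 bits to 32.
 */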
2565 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2566 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2567 }
2568 }
2569
2570 /**
2571 * Set up the gather channel based on the swizzle, for gather4.
2572 */
2573 uint32_t
2574 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2575 {
2576 ir_constant *chan = ir->lod_info.component->as_constant();
2577 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2578 switch (swiz) {
2579 case SWIZZLE_X: return 0;
2580 case SWIZZLE_Y:
2581 /* gather4 sampler is broken for green channel on RG32F --
2582 * we must ask for blue instead.
2583 */
2584 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2585 return 2;
2586 return 1;
2587 case SWIZZLE_Z: return 2;
2588 case SWIZZLE_W: return 3;
2589 default:
2590 assert(!"Not reached"); /* zero, one swizzles handled already */
2591 return 0;
2592 }
2593 }
2594
2595 void
2596 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2597 {
2598 int s = key->tex.swizzles[sampler];
2599
2600 this->result = src_reg(this, ir->type);
2601 dst_reg swizzled_result(this->result);
2602
2603 if (ir->op == ir_query_levels) {
2604 /* # levels is in .w */
2605 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2606 emit(MOV(swizzled_result, orig_val));
2607 return;
2608 }
2609
2610 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2611 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2612 emit(MOV(swizzled_result, orig_val));
2613 return;
2614 }
2615
2616
2617 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2618 int swizzle[4] = {0};
2619
2620 for (int i = 0; i < 4; i++) {
2621 switch (GET_SWZ(s, i)) {
2622 case SWIZZLE_ZERO:
2623 zero_mask |= (1 << i);
2624 break;
2625 case SWIZZLE_ONE:
2626 one_mask |= (1 << i);
2627 break;
2628 default:
2629 copy_mask |= (1 << i);
2630 swizzle[i] = GET_SWZ(s, i);
2631 break;
2632 }
2633 }
2634
2635 if (copy_mask) {
2636 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2637 swizzled_result.writemask = copy_mask;
2638 emit(MOV(swizzled_result, orig_val));
2639 }
2640
2641 if (zero_mask) {
2642 swizzled_result.writemask = zero_mask;
2643 emit(MOV(swizzled_result, src_reg(0.0f)));
2644 }
2645
2646 if (one_mask) {
2647 swizzled_result.writemask = one_mask;
2648 emit(MOV(swizzled_result, src_reg(1.0f)));
2649 }
2650 }
2651
2652 void
2653 vec4_visitor::visit(ir_return *ir)
2654 {
2655 assert(!"not reached");
2656 }
2657
2658 void
2659 vec4_visitor::visit(ir_discard *ir)
2660 {
2661 assert(!"not reached");
2662 }
2663
2664 void
2665 vec4_visitor::visit(ir_if *ir)
2666 {
2667 /* Don't point the annotation at the if statement, because then it plus
2668 * the then and else blocks get printed.
2669 */
2670 this->base_ir = ir->condition;
2671
2672 if (brw->gen == 6) {
2673 emit_if_gen6(ir);
2674 } else {
2675 uint32_t predicate;
2676 emit_bool_to_cond_code(ir->condition, &predicate);
2677 emit(IF(predicate));
2678 }
2679
2680 visit_instructions(&ir->then_instructions);
2681
2682 if (!ir->else_instructions.is_empty()) {
2683 this->base_ir = ir->condition;
2684 emit(BRW_OPCODE_ELSE);
2685
2686 visit_instructions(&ir->else_instructions);
2687 }
2688
2689 this->base_ir = ir->condition;
2690 emit(BRW_OPCODE_ENDIF);
2691 }
2692
2693 void
2694 vec4_visitor::visit(ir_emit_vertex *)
2695 {
2696 assert(!"not reached");
2697 }
2698
2699 void
2700 vec4_visitor::visit(ir_end_primitive *)
2701 {
2702 assert(!"not reached");
2703 }
2704
2705 void
2706 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2707 dst_reg dst, src_reg offset,
2708 src_reg src0, src_reg src1)
2709 {
2710 unsigned mlen = 0;
2711
2712 /* Set the atomic operation offset. */
2713 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2714 mlen++;
2715
2716 /* Set the atomic operation arguments. */
2717 if (src0.file != BAD_FILE) {
2718 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2719 mlen++;
2720 }
2721
2722 if (src1.file != BAD_FILE) {
2723 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2724 mlen++;
2725 }
2726
2727 /* Emit the instruction. Note that this maps to the normal SIMD8
2728 * untyped atomic message on Ivy Bridge, but that's OK because
2729 * unused channels will be masked out.
2730 */
2731 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2732 src_reg(atomic_op), src_reg(surf_index));
2733 inst->base_mrf = 0;
2734 inst->mlen = mlen;
2735 }
2736
2737 void
2738 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2739 src_reg offset)
2740 {
2741 /* Set the surface read offset. */
2742 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2743
2744 /* Emit the instruction. Note that this maps to the normal SIMD8
2745 * untyped surface read message, but that's OK because unused
2746 * channels will be masked out.
2747 */
2748 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2749 dst, src_reg(surf_index));
2750 inst->base_mrf = 0;
2751 inst->mlen = 1;
2752 }
2753
2754 void
2755 vec4_visitor::emit_ndc_computation()
2756 {
2757 /* Get the position */
2758 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2759
2760 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2761 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2762 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2763
2764 current_annotation = "NDC";
2765 dst_reg ndc_w = ndc;
2766 ndc_w.writemask = WRITEMASK_W;
2767 src_reg pos_w = pos;
2768 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2769 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2770
2771 dst_reg ndc_xyz = ndc;
2772 ndc_xyz.writemask = WRITEMASK_XYZ;
2773
2774 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2775 }
2776
2777 void
2778 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2779 {
2780 if (brw->gen < 6 &&
2781 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2782 key->userclip_active || brw->has_negative_rhw_bug)) {
2783 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2784 dst_reg header1_w = header1;
2785 header1_w.writemask = WRITEMASK_W;
2786
2787 emit(MOV(header1, 0u));
2788
2789 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2790 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2791
2792 current_annotation = "Point size";
2793 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2794 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2795 }
2796
2797 if (key->userclip_active) {
2798 current_annotation = "Clipping flags";
2799 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2800 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2801
2802 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2803 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2804 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2805
2806 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2807 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2808 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2809 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2810 }
2811
2812 /* i965 clipping workaround:
2813 * 1) Test for negative RHW
2814 * 2) If set,
2815 * set ndc = (0,0,0,0)
2816 * set ucp[6] = 1
2817 *
2818 * Later, clipping will detect ucp[6] and ensure the primitive is
2819 * clipped against all fixed planes.
2820 */
2821 if (brw->has_negative_rhw_bug) {
2822 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2823 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2824 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2825 vec4_instruction *inst;
2826 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2827 inst->predicate = BRW_PREDICATE_NORMAL;
2828 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2829 inst->predicate = BRW_PREDICATE_NORMAL;
2830 }
2831
2832 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2833 } else if (brw->gen < 6) {
2834 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2835 } else {
2836 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2837 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2838 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2839 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2840 }
2841 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2842 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2843 src_reg(output_reg[VARYING_SLOT_LAYER])));
2844 }
2845 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2846 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2847 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2848 }
2849 }
2850 }
2851
2852 void
2853 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2854 {
2855 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2856 *
2857 * "If a linked set of shaders forming the vertex stage contains no
2858 * static write to gl_ClipVertex or gl_ClipDistance, but the
2859 * application has requested clipping against user clip planes through
2860 * the API, then the coordinate written to gl_Position is used for
2861 * comparison against the user clip planes."
2862 *
2863 * This function is only called if the shader didn't write to
2864 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2865 * if the user wrote to it; otherwise we use gl_Position.
2866 */
2867 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2868 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2869 clip_vertex = VARYING_SLOT_POS;
2870 }
2871
2872 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2873 ++i) {
2874 reg.writemask = 1 << i;
2875 emit(DP4(reg,
2876 src_reg(output_reg[clip_vertex]),
2877 src_reg(this->userplane[i + offset])));
2878 }
2879 }
2880
2881 void
2882 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2883 {
2884 assert (varying < VARYING_SLOT_MAX);
2885 reg.type = output_reg[varying].type;
2886 current_annotation = output_reg_annotation[varying];
2887 /* Copy the register, saturating if necessary */
2888 vec4_instruction *inst = emit(MOV(reg,
2889 src_reg(output_reg[varying])));
2890 if ((varying == VARYING_SLOT_COL0 ||
2891 varying == VARYING_SLOT_COL1 ||
2892 varying == VARYING_SLOT_BFC0 ||
2893 varying == VARYING_SLOT_BFC1) &&
2894 key->clamp_vertex_color) {
2895 inst->saturate = true;
2896 }
2897 }
2898
2899 void
2900 vec4_visitor::emit_urb_slot(int mrf, int varying)
2901 {
2902 struct brw_reg hw_reg = brw_message_reg(mrf);
2903 dst_reg reg = dst_reg(MRF, mrf);
2904 reg.type = BRW_REGISTER_TYPE_F;
2905
2906 switch (varying) {
2907 case VARYING_SLOT_PSIZ:
2908 /* PSIZ is always in slot 0, and is coupled with other flags. */
2909 current_annotation = "indices, point width, clip flags";
2910 emit_psiz_and_flags(hw_reg);
2911 break;
2912 case BRW_VARYING_SLOT_NDC:
2913 current_annotation = "NDC";
2914 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2915 break;
2916 case VARYING_SLOT_POS:
2917 current_annotation = "gl_Position";
2918 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2919 break;
2920 case VARYING_SLOT_EDGE:
2921 /* This is present when doing unfilled polygons. We're supposed to copy
2922 * the edge flag from the user-provided vertex array
2923 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2924 * of that attribute (starts as 1.0f). This is then used in clipping to
2925 * determine which edges should be drawn as wireframe.
2926 */
2927 current_annotation = "edge flag";
2928 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2929 glsl_type::float_type, WRITEMASK_XYZW))));
2930 break;
2931 case BRW_VARYING_SLOT_PAD:
2932 /* No need to write to this slot */
2933 break;
2934 default:
2935 emit_generic_urb_slot(reg, varying);
2936 break;
2937 }
2938 }
2939
2940 static int
2941 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2942 {
2943 if (brw->gen >= 6) {
2944 /* URB data written (does not include the message header reg) must
2945 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2946 * section 5.4.3.2.2: URB_INTERLEAVED.
2947 *
2948 * URB entries are allocated on a multiple of 1024 bits, so an
2949 * extra 128 bits written here to make the end align to 256 is
2950 * no problem.
2951 */
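/* The mlen passed in counts the message header register too, so forcing
 * it to be odd leaves an even number of data registers after the header
 * (e.g. a hypothetical mlen of 4 becomes 5: header plus 4 data regs).
 */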
2952 if ((mlen % 2) != 1)
2953 mlen++;
2954 }
2955
2956 return mlen;
2957 }
2958
2959
2960 /**
2961 * Generates the VUE payload plus the necessary URB write instructions to
2962 * output it.
2963 *
2964 * The VUE layout is documented in Volume 2a.
2965 */
2966 void
2967 vec4_visitor::emit_vertex()
2968 {
2969 /* MRF 0 is reserved for the debugger, so start with message header
2970 * in MRF 1.
2971 */
2972 int base_mrf = 1;
2973 int mrf = base_mrf;
2974 /* In the process of generating our URB write message contents, we
2975 * may need to unspill a register or load from an array. Those
2976 * reads would use MRFs 14-15.
2977 */
2978 int max_usable_mrf = 13;
2979
2980 /* The following assertion verifies that max_usable_mrf causes an
2981 * even-numbered amount of URB write data, which will meet gen6's
2982 * requirements for length alignment.
2983 */
2984 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2985
2986 /* First mrf is the g0-based message header containing URB handles and
2987 * such.
2988 */
2989 emit_urb_write_header(mrf++);
2990
2991 if (brw->gen < 6) {
2992 emit_ndc_computation();
2993 }
2994
2995 /* Lower legacy fixed-function and ClipVertex clipping to clip distances */
2996 if (key->userclip_active && !prog->UsesClipDistanceOut) {
2997 current_annotation = "user clip distances";
2998
2999 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3000 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3001
3002 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3003 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3004 }
3005
3006 /* We may need to split this up into several URB writes, so do them in a
3007 * loop.
3008 */
3009 int slot = 0;
3010 bool complete = false;
3011 do {
3012 /* URB offset is in URB row increments, and each of our MRFs is half of
3013 * one of those, since we're doing interleaved writes.
3014 */
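/* E.g. a hypothetical second URB write resuming at slot 6 would start
 * at URB row offset 3.
 */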
3015 int offset = slot / 2;
3016
3017 mrf = base_mrf + 1;
3018 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3019 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3020
3021 /* If this was max_usable_mrf, we can't fit anything more into this
3022 * URB WRITE.
3023 */
3024 if (mrf > max_usable_mrf) {
3025 slot++;
3026 break;
3027 }
3028 }
3029
3030 complete = slot >= prog_data->vue_map.num_slots;
3031 current_annotation = "URB write";
3032 vec4_instruction *inst = emit_urb_write_opcode(complete);
3033 inst->base_mrf = base_mrf;
3034 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3035 inst->offset += offset;
3036 } while(!complete);
3037 }
3038
3039
3040 src_reg
3041 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3042 src_reg *reladdr, int reg_offset)
3043 {
3044 /* Because we store the values to scratch interleaved like our
3045 * vertex data, we need to scale the vec4 index by 2.
3046 */
3047 int message_header_scale = 2;
3048
3049 /* Pre-gen6, the message header uses byte offsets instead of vec4
3050 * (16-byte) offset units.
3051 */
3052 if (brw->gen < 6)
3053 message_header_scale *= 16;
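/* For example, a hypothetical reg_offset of 3 becomes scratch offset 6
 * (interleaved vec4 rows) on Gen6+, or 96 bytes on older generations.
 */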
3054
3055 if (reladdr) {
3056 src_reg index = src_reg(this, glsl_type::int_type);
3057
3058 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3059 emit_before(inst, MUL(dst_reg(index),
3060 index, src_reg(message_header_scale)));
3061
3062 return index;
3063 } else {
3064 return src_reg(reg_offset * message_header_scale);
3065 }
3066 }
3067
3068 src_reg
3069 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3070 src_reg *reladdr, int reg_offset)
3071 {
3072 if (reladdr) {
3073 src_reg index = src_reg(this, glsl_type::int_type);
3074
3075 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3076
3077 /* Pre-gen6, the message header uses byte offsets instead of vec4
3078 * (16-byte) offset units.
3079 */
3080 if (brw->gen < 6) {
3081 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3082 }
3083
3084 return index;
3085 } else if (brw->gen >= 8) {
3086 /* Store the offset in a GRF so we can send-from-GRF. */
3087 src_reg offset = src_reg(this, glsl_type::int_type);
3088 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3089 return offset;
3090 } else {
3091 int message_header_scale = brw->gen < 6 ? 16 : 1;
3092 return src_reg(reg_offset * message_header_scale);
3093 }
3094 }
3095
3096 /**
3097 * Emits an instruction before @inst to load the value named by @orig_src
3098 * from scratch space at @base_offset to @temp.
3099 *
3100 * @base_offset is measured in 32-byte units (the size of a register).
3101 */
3102 void
3103 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3104 dst_reg temp, src_reg orig_src,
3105 int base_offset)
3106 {
3107 int reg_offset = base_offset + orig_src.reg_offset;
3108 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3109
3110 emit_before(inst, SCRATCH_READ(temp, index));
3111 }
3112
3113 /**
3114 * Emits an instruction after @inst to store the value to be written
3115 * to @orig_dst to scratch space at @base_offset, from @temp.
3116 *
3117 * @base_offset is measured in 32-byte units (the size of a register).
3118 */
3119 void
3120 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3121 {
3122 int reg_offset = base_offset + inst->dst.reg_offset;
3123 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3124
3125 /* Create a temporary register to store *inst's result in.
3126 *
3127 * We have to be careful in MOVing from our temporary result register in
3128 * the scratch write. If we swizzle from channels of the temporary that
3129 * weren't initialized, it will confuse live interval analysis, which will
3130 * make spilling fail to make progress.
3131 */
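/* E.g. if the instruction only writes .y, the temporary is read back
 * with a .yyyy swizzle below, so no never-written channel is sourced.
 */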
3132 src_reg temp = src_reg(this, glsl_type::vec4_type);
3133 temp.type = inst->dst.type;
3134 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3135 int swizzles[4];
3136 for (int i = 0; i < 4; i++)
3137 if (inst->dst.writemask & (1 << i))
3138 swizzles[i] = i;
3139 else
3140 swizzles[i] = first_writemask_chan;
3141 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3142 swizzles[2], swizzles[3]);
3143
3144 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3145 inst->dst.writemask));
3146 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3147 write->predicate = inst->predicate;
3148 write->ir = inst->ir;
3149 write->annotation = inst->annotation;
3150 inst->insert_after(write);
3151
3152 inst->dst.file = temp.file;
3153 inst->dst.reg = temp.reg;
3154 inst->dst.reg_offset = temp.reg_offset;
3155 inst->dst.reladdr = NULL;
3156 }
3157
3158 /**
3159 * We can't generally support array access in GRF space, because a
3160 * single instruction's destination can only span 2 contiguous
3161 * registers. So, we send all GRF arrays that get variable index
3162 * access to scratch space.
3163 */
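/* A hypothetical shader temporary such as "vec4 a[4]; ... a[i] = v;" is
 * the kind of variably-indexed GRF array this pass relocates: every
 * access through a reladdr becomes a scratch read or write below.
 */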
3164 void
3165 vec4_visitor::move_grf_array_access_to_scratch()
3166 {
3167 int scratch_loc[this->virtual_grf_count];
3168
3169 for (int i = 0; i < this->virtual_grf_count; i++) {
3170 scratch_loc[i] = -1;
3171 }
3172
3173 /* First, calculate the set of virtual GRFs that need to be punted
3174 * to scratch due to having any array access on them, and where in
3175 * scratch.
3176 */
3177 foreach_list(node, &this->instructions) {
3178 vec4_instruction *inst = (vec4_instruction *)node;
3179
3180 if (inst->dst.file == GRF && inst->dst.reladdr &&
3181 scratch_loc[inst->dst.reg] == -1) {
3182 scratch_loc[inst->dst.reg] = c->last_scratch;
3183 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3184 }
3185
3186 for (int i = 0 ; i < 3; i++) {
3187 src_reg *src = &inst->src[i];
3188
3189 if (src->file == GRF && src->reladdr &&
3190 scratch_loc[src->reg] == -1) {
3191 scratch_loc[src->reg] = c->last_scratch;
3192 c->last_scratch += this->virtual_grf_sizes[src->reg];
3193 }
3194 }
3195 }
3196
3197 /* Now, for anything that will be accessed through scratch, rewrite
3198 * it to load/store. Note that this is a _safe list walk, because
3199 * we may generate a new scratch_write instruction after the one
3200 * we're processing.
3201 */
3202 foreach_list_safe(node, &this->instructions) {
3203 vec4_instruction *inst = (vec4_instruction *)node;
3204
3205 /* Set up the annotation tracking for newly generated instructions. */
3206 base_ir = inst->ir;
3207 current_annotation = inst->annotation;
3208
3209 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3210 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3211 }
3212
3213 for (int i = 0 ; i < 3; i++) {
3214 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3215 continue;
3216
3217 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3218
3219 emit_scratch_read(inst, temp, inst->src[i],
3220 scratch_loc[inst->src[i].reg]);
3221
3222 inst->src[i].file = temp.file;
3223 inst->src[i].reg = temp.reg;
3224 inst->src[i].reg_offset = temp.reg_offset;
3225 inst->src[i].reladdr = NULL;
3226 }
3227 }
3228 }
3229
3230 /**
3231 * Emits an instruction before @inst to load the value named by @orig_src
3232 * from the pull constant buffer (surface) at @base_offset to @temp.
3233 */
3234 void
3235 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3236 dst_reg temp, src_reg orig_src,
3237 int base_offset)
3238 {
3239 int reg_offset = base_offset + orig_src.reg_offset;
3240 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3241 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3242 vec4_instruction *load;
3243
3244 if (brw->gen >= 7) {
3245 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3246 grf_offset.type = offset.type;
3247 emit_before(inst, MOV(grf_offset, offset));
3248
3249 load = new(mem_ctx) vec4_instruction(this,
3250 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3251 temp, index, src_reg(grf_offset));
3252 } else {
3253 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3254 temp, index, offset);
3255 load->base_mrf = 14;
3256 load->mlen = 1;
3257 }
3258 emit_before(inst, load);
3259 }
3260
3261 /**
3262 * Implements array access of uniforms by inserting a
3263 * PULL_CONSTANT_LOAD instruction.
3264 *
3265 * Unlike temporary GRF array access (where we don't support it due to
3266 * the difficulty of doing relative addressing on instruction
3267 * destinations), we could potentially do array access of uniforms
3268 * that were loaded in GRF space as push constants. In real-world
3269 * usage we've seen, though, the arrays being used are always larger
3270 * than we could load as push constants, so just always move all
3271 * uniform array access out to a pull constant buffer.
3272 */
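/* E.g. a hypothetical "uniform vec4 big[128]; ... big[i]" access gets its
 * values copied into pull_param[] here and is reloaded at run time via
 * emit_pull_constant_load() instead of occupying push constant space.
 */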
3273 void
3274 vec4_visitor::move_uniform_array_access_to_pull_constants()
3275 {
3276 int pull_constant_loc[this->uniforms];
3277
3278 for (int i = 0; i < this->uniforms; i++) {
3279 pull_constant_loc[i] = -1;
3280 }
3281
3282 /* Walk through and find array access of uniforms. Put a copy of that
3283 * uniform in the pull constant buffer.
3284 *
3285 * Note that we don't move constant-indexed accesses to arrays. No
3286 * testing has been done of the performance impact of this choice.
3287 */
3288 foreach_list_safe(node, &this->instructions) {
3289 vec4_instruction *inst = (vec4_instruction *)node;
3290
3291 for (int i = 0 ; i < 3; i++) {
3292 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3293 continue;
3294
3295 int uniform = inst->src[i].reg;
3296
3297 /* If this array isn't already present in the pull constant buffer,
3298 * add it.
3299 */
3300 if (pull_constant_loc[uniform] == -1) {
3301 const float **values = &stage_prog_data->param[uniform * 4];
3302
3303 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3304
3305 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3306 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3307 = values[j];
3308 }
3309 }
3310
3311 /* Set up the annotation tracking for newly generated instructions. */
3312 base_ir = inst->ir;
3313 current_annotation = inst->annotation;
3314
3315 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3316
3317 emit_pull_constant_load(inst, temp, inst->src[i],
3318 pull_constant_loc[uniform]);
3319
3320 inst->src[i].file = temp.file;
3321 inst->src[i].reg = temp.reg;
3322 inst->src[i].reg_offset = temp.reg_offset;
3323 inst->src[i].reladdr = NULL;
3324 }
3325 }
3326
3327 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3328 * no need to track them as larger-than-vec4 objects. This will be
3329 * relied on in cutting out unused uniform vectors from push
3330 * constants.
3331 */
3332 split_uniform_registers();
3333 }
3334
3335 void
3336 vec4_visitor::resolve_ud_negate(src_reg *reg)
3337 {
3338 if (reg->type != BRW_REGISTER_TYPE_UD ||
3339 !reg->negate)
3340 return;
3341
3342 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3343 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3344 *reg = temp;
3345 }
3346
3347 vec4_visitor::vec4_visitor(struct brw_context *brw,
3348 struct brw_vec4_compile *c,
3349 struct gl_program *prog,
3350 const struct brw_vec4_prog_key *key,
3351 struct brw_vec4_prog_data *prog_data,
3352 struct gl_shader_program *shader_prog,
3353 struct brw_shader *shader,
3354 void *mem_ctx,
3355 bool debug_flag,
3356 bool no_spills,
3357 shader_time_shader_type st_base,
3358 shader_time_shader_type st_written,
3359 shader_time_shader_type st_reset)
3360 : sanity_param_count(0),
3361 fail_msg(NULL),
3362 first_non_payload_grf(0),
3363 need_all_constants_in_pull_buffer(false),
3364 debug_flag(debug_flag),
3365 no_spills(no_spills),
3366 st_base(st_base),
3367 st_written(st_written),
3368 st_reset(st_reset)
3369 {
3370 this->brw = brw;
3371 this->ctx = &brw->ctx;
3372 this->shader_prog = shader_prog;
3373 this->shader = shader;
3374
3375 this->mem_ctx = mem_ctx;
3376 this->failed = false;
3377
3378 this->base_ir = NULL;
3379 this->current_annotation = NULL;
3380 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3381
3382 this->c = c;
3383 this->prog = prog;
3384 this->key = key;
3385 this->prog_data = prog_data;
3386 this->stage_prog_data = &prog_data->base;
3387
3388 this->variable_ht = hash_table_ctor(0,
3389 hash_table_pointer_hash,
3390 hash_table_pointer_compare);
3391
3392 this->virtual_grf_start = NULL;
3393 this->virtual_grf_end = NULL;
3394 this->virtual_grf_sizes = NULL;
3395 this->virtual_grf_count = 0;
3396 this->virtual_grf_reg_map = NULL;
3397 this->virtual_grf_reg_count = 0;
3398 this->virtual_grf_array_size = 0;
3399 this->live_intervals_valid = false;
3400
3401 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3402
3403 this->uniforms = 0;
3404
3405 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3406 * at least one. See setup_uniforms() in brw_vec4.cpp.
3407 */
3408 this->uniform_array_size = 1;
3409 if (prog_data) {
3410 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3411 }
3412
3413 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3414 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3415 }
3416
3417 vec4_visitor::~vec4_visitor()
3418 {
3419 hash_table_dtor(this->variable_ht);
3420 }
3421
3422
3423 void
3424 vec4_visitor::fail(const char *format, ...)
3425 {
3426 va_list va;
3427 char *msg;
3428
3429 if (failed)
3430 return;
3431
3432 failed = true;
3433
3434 va_start(va, format);
3435 msg = ralloc_vasprintf(mem_ctx, format, va);
3436 va_end(va);
3437 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3438
3439 this->fail_msg = msg;
3440
3441 if (debug_flag) {
3442 fprintf(stderr, "%s", msg);
3443 }
3444 }
3445
3446 } /* namespace brw */