i965/vec4: Add a brw->gen >= 6 assertion in three-source emitters.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->sampler = 0;
47 this->texture_offset = 0;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = v->base_ir;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_present = false;
53 this->mlen = 0;
54 this->base_mrf = 0;
55 this->offset = 0;
56 this->annotation = v->current_annotation;
57 }
58
59 vec4_instruction *
60 vec4_visitor::emit(vec4_instruction *inst)
61 {
62 this->instructions.push_tail(inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
69 {
70 new_inst->ir = inst->ir;
71 new_inst->annotation = inst->annotation;
72
73 inst->insert_before(new_inst);
74
75 return inst;
76 }
77
78 vec4_instruction *
79 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
80 src_reg src0, src_reg src1, src_reg src2)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
83 src0, src1, src2));
84 }
85
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
89 {
90 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
91 }
92
93 vec4_instruction *
94 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
95 {
96 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
101 {
102 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode)
107 {
108 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
109 }
110
111 #define ALU1(op) \
112 vec4_instruction * \
113 vec4_visitor::op(dst_reg dst, src_reg src0) \
114 { \
115 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
116 src0); \
117 }
118
119 #define ALU2(op) \
120 vec4_instruction * \
121 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
122 { \
123 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
124 src0, src1); \
125 }
126
127 #define ALU3(op) \
128 vec4_instruction * \
129 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
130 { \
131 assert(brw->gen >= 6); \
132 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
133 src0, src1, src2); \
134 }
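/* Illustrative only (not part of the upstream file): with the new assertion
 * in place, ALU3(MAD) expands to roughly
 *
 *    vec4_instruction *
 *    vec4_visitor::MAD(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)
 *    {
 *       assert(brw->gen >= 6);
 *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_MAD, dst,
 *                                            src0, src1, src2);
 *    }
 *
 * Three-source instructions were introduced in gen6, so the assertion catches
 * any attempt to emit MAD, LRP, BFE or BFI2 on older hardware.
 */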
135
136 ALU1(NOT)
137 ALU1(MOV)
138 ALU1(FRC)
139 ALU1(RNDD)
140 ALU1(RNDE)
141 ALU1(RNDZ)
142 ALU1(F32TO16)
143 ALU1(F16TO32)
144 ALU2(ADD)
145 ALU2(MUL)
146 ALU2(MACH)
147 ALU2(AND)
148 ALU2(OR)
149 ALU2(XOR)
150 ALU2(DP3)
151 ALU2(DP4)
152 ALU2(DPH)
153 ALU2(SHL)
154 ALU2(SHR)
155 ALU2(ASR)
156 ALU3(LRP)
157 ALU1(BFREV)
158 ALU3(BFE)
159 ALU2(BFI1)
160 ALU3(BFI2)
161 ALU1(FBH)
162 ALU1(FBL)
163 ALU1(CBIT)
164 ALU3(MAD)
165 ALU2(ADDC)
166 ALU2(SUBB)
167
168 /** Gen4 predicated IF. */
169 vec4_instruction *
170 vec4_visitor::IF(uint32_t predicate)
171 {
172 vec4_instruction *inst;
173
174 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
175 inst->predicate = predicate;
176
177 return inst;
178 }
179
180 /** Gen6 IF with embedded comparison. */
181 vec4_instruction *
182 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
183 {
184 assert(brw->gen == 6);
185
186 vec4_instruction *inst;
187
188 resolve_ud_negate(&src0);
189 resolve_ud_negate(&src1);
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
192 src0, src1);
193 inst->conditional_mod = condition;
194
195 return inst;
196 }
197
198 /**
199 * CMP: Sets the low bit of the destination channels with the result
200 * of the comparison, while the upper bits are undefined, and updates
201 * the flag register with the packed 16 bits of the result.
202 */
203 vec4_instruction *
204 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
205 {
206 vec4_instruction *inst;
207
208 /* The original gen4 does type conversion to the destination type
209 * before comparison, producing garbage results for floating
210 * point comparisons.
211 */
212 if (brw->gen == 4) {
213 dst.type = src0.type;
214 if (dst.file == HW_REG)
215 dst.fixed_hw_reg.type = dst.type;
216 }
217
218 resolve_ud_negate(&src0);
219 resolve_ud_negate(&src1);
220
221 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
222 inst->conditional_mod = condition;
223
224 return inst;
225 }
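/* Illustrative sketch of the usual follow-up elsewhere in this file: because
 * only the low bit of each channel is defined, boolean results produced with
 * CMP are normally masked afterwards, e.g.
 *
 *    emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
 *    emit(AND(result_dst, result_src, src_reg(0x1)));
 */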
226
227 vec4_instruction *
228 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
229 {
230 vec4_instruction *inst;
231
232 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
233 dst, index);
234 inst->base_mrf = 14;
235 inst->mlen = 2;
236
237 return inst;
238 }
239
240 vec4_instruction *
241 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
242 {
243 vec4_instruction *inst;
244
245 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
246 dst, src, index);
247 inst->base_mrf = 13;
248 inst->mlen = 3;
249
250 return inst;
251 }
252
253 void
254 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
255 {
256 static enum opcode dot_opcodes[] = {
257 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
258 };
259
260 emit(dot_opcodes[elements - 2], dst, src0, src1);
261 }
262
263 src_reg
264 vec4_visitor::fix_3src_operand(src_reg src)
265 {
266 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
267 * able to use vertical stride of zero to replicate the vec4 uniform, like
268 *
269 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
270 *
271 * But you can't, since vertical stride is always four in three-source
272 * instructions. Instead, insert a MOV instruction to do the replication so
273 * that the three-source instruction can consume it.
274 */
275
276 /* The MOV is only needed if the source is a uniform or immediate. */
277 if (src.file != UNIFORM && src.file != IMM)
278 return src;
279
280 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
281 expanded.type = src.type;
282 emit(MOV(expanded, src));
283 return src_reg(expanded);
284 }
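/* Usage sketch (illustrative; "u" is a hypothetical UNIFORM operand):
 *
 *    src_reg a = fix_3src_operand(u);   // emits MOV tmp, u and returns tmp
 *    emit(MAD(dst, a, src1, src2));     // tmp is a plain GRF, so the fixed
 *                                       // vertical stride is no problem
 *
 * GRF sources are returned unchanged, so the extra MOV is only paid for
 * UNIFORM and IMM operands.
 */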
285
286 src_reg
287 vec4_visitor::fix_math_operand(src_reg src)
288 {
289 /* The gen6 math instruction ignores the source modifiers --
290 * swizzle, abs, negate, and at least some parts of the register
291 * region description.
292 *
293 * Rather than trying to enumerate all these cases, *always* expand the
294 * operand to a temp GRF for gen6.
295 *
296 * For gen7, keep the operand as-is, except if immediate, which gen7 still
297 * can't use.
298 */
299
300 if (brw->gen == 7 && src.file != IMM)
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(MOV(expanded, src));
306 return src_reg(expanded);
307 }
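/* Illustrative consequence of the rules above: on gen6 even a simple negated
 * operand such as -x would have its negate ignored by the math instruction,
 * so it is first materialized with a MOV into a fresh GRF; on gen7 only
 * immediates (e.g. src_reg(2.0f)) need that copy.
 */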
308
309 void
310 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
311 {
312 src = fix_math_operand(src);
313
314 if (dst.writemask != WRITEMASK_XYZW) {
315 /* The gen6 math instruction must be align1, so we can't do
316 * writemasks.
317 */
318 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
319
320 emit(opcode, temp_dst, src);
321
322 emit(MOV(dst, src_reg(temp_dst)));
323 } else {
324 emit(opcode, dst, src);
325 }
326 }
327
328 void
329 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
330 {
331 vec4_instruction *inst = emit(opcode, dst, src);
332 inst->base_mrf = 1;
333 inst->mlen = 1;
334 }
335
336 void
337 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
338 {
339 switch (opcode) {
340 case SHADER_OPCODE_RCP:
341 case SHADER_OPCODE_RSQ:
342 case SHADER_OPCODE_SQRT:
343 case SHADER_OPCODE_EXP2:
344 case SHADER_OPCODE_LOG2:
345 case SHADER_OPCODE_SIN:
346 case SHADER_OPCODE_COS:
347 break;
348 default:
349 assert(!"not reached: bad math opcode");
350 return;
351 }
352
353 if (brw->gen >= 6) {
354 return emit_math1_gen6(opcode, dst, src);
355 } else {
356 return emit_math1_gen4(opcode, dst, src);
357 }
358 }
359
360 void
361 vec4_visitor::emit_math2_gen6(enum opcode opcode,
362 dst_reg dst, src_reg src0, src_reg src1)
363 {
364 src0 = fix_math_operand(src0);
365 src1 = fix_math_operand(src1);
366
367 if (dst.writemask != WRITEMASK_XYZW) {
368 /* The gen6 math instruction must be align1, so we can't do
369 * writemasks.
370 */
371 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
372 temp_dst.type = dst.type;
373
374 emit(opcode, temp_dst, src0, src1);
375
376 emit(MOV(dst, src_reg(temp_dst)));
377 } else {
378 emit(opcode, dst, src0, src1);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen4(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 vec4_instruction *inst = emit(opcode, dst, src0, src1);
387 inst->base_mrf = 1;
388 inst->mlen = 2;
389 }
390
391 void
392 vec4_visitor::emit_math(enum opcode opcode,
393 dst_reg dst, src_reg src0, src_reg src1)
394 {
395 switch (opcode) {
396 case SHADER_OPCODE_POW:
397 case SHADER_OPCODE_INT_QUOTIENT:
398 case SHADER_OPCODE_INT_REMAINDER:
399 break;
400 default:
401 assert(!"not reached: unsupported binary math opcode");
402 return;
403 }
404
405 if (brw->gen >= 6) {
406 return emit_math2_gen6(opcode, dst, src0, src1);
407 } else {
408 return emit_math2_gen4(opcode, dst, src0, src1);
409 }
410 }
411
412 void
413 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
414 {
415 if (brw->gen < 7)
416 assert(!"ir_unop_pack_half_2x16 should be lowered");
417
418 assert(dst.type == BRW_REGISTER_TYPE_UD);
419 assert(src0.type == BRW_REGISTER_TYPE_F);
420
421 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
422 *
423 * Because this instruction does not have a 16-bit floating-point type,
424 * the destination data type must be Word (W).
425 *
426 * The destination must be DWord-aligned and specify a horizontal stride
427 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
428 * each destination channel and the upper word is not modified.
429 *
430 * The above restriction implies that the f32to16 instruction must use
431 * align1 mode, because only in align1 mode is it possible to specify
432 * horizontal stride. We choose here to defy the hardware docs and emit
433 * align16 instructions.
434 *
435 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
436 * instructions. I was partially successful in that the code passed all
437 * tests. However, the code was dubiously correct and fragile, and the
438 * tests were not harsh enough to probe that frailty. Not trusting the
439 * code, I chose instead to remain in align16 mode in defiance of the hw
440 * docs).
441 *
442 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
443 * simulator, emitting a f32to16 in align16 mode with UD as destination
444 * data type is safe. The behavior differs from that specified in the PRM
445 * in that the upper word of each destination channel is cleared to 0.
446 */
447
448 dst_reg tmp_dst(this, glsl_type::uvec2_type);
449 src_reg tmp_src(tmp_dst);
450
451 #if 0
452 /* Verify the undocumented behavior on which the following instructions
453 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
454 * then the result of the bit-or instruction below will be incorrect.
455 *
456 * You should inspect the disasm output in order to verify that the MOV is
457 * not optimized away.
458 */
459 emit(MOV(tmp_dst, src_reg(0x12345678u)));
460 #endif
461
462 /* Give tmp the form below, where "." means untouched.
463 *
464 * w z y x w z y x
465 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
466 *
467 * The upper word of each write-channel must be 0 for the
468 * following bit-shift and bit-or instructions to work. Note that this
469 * relies on the undocumented hardware behavior mentioned above.
470 */
471 tmp_dst.writemask = WRITEMASK_XY;
472 emit(F32TO16(tmp_dst, src0));
473
474 /* Give the write-channels of dst the form:
475 * 0xhhhh0000
476 */
477 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
478 emit(SHL(dst, tmp_src, src_reg(16u)));
479
480 /* Finally, give the write-channels of dst the form of packHalf2x16's
481 * output:
482 * 0xhhhhllll
483 */
484 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
485 emit(OR(dst, src_reg(dst), tmp_src));
486 }
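/* Worked example (illustrative): packHalf2x16(vec2(1.0, 2.0)).
 * F32TO16 writes 0x3C00 (1.0h) into tmp.x and 0x4000 (2.0h) into tmp.y, each
 * with a zero upper word. The SHL of the Y swizzle gives 0x40000000, and the
 * final OR with the X swizzle produces 0x40003C00, matching the GLSL rule
 * that the first component lands in the least significant bits.
 */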
487
488 void
489 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
490 {
491 if (brw->gen < 7)
492 assert(!"ir_unop_unpack_half_2x16 should be lowered");
493
494 assert(dst.type == BRW_REGISTER_TYPE_F);
495 assert(src0.type == BRW_REGISTER_TYPE_UD);
496
497 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
498 *
499 * Because this instruction does not have a 16-bit floating-point type,
500 * the source data type must be Word (W). The destination type must be
501 * F (Float).
502 *
503 * To use W as the source data type, we must adjust horizontal strides,
504 * which is only possible in align1 mode. All my [chadv] attempts at
505 * emitting align1 instructions for unpackHalf2x16 failed to pass the
506 * Piglit tests, so I gave up.
507 *
508 * I've verified that, on gen7 hardware and the simulator, it is safe to
509 * emit f16to32 in align16 mode with UD as source data type.
510 */
511
512 dst_reg tmp_dst(this, glsl_type::uvec2_type);
513 src_reg tmp_src(tmp_dst);
514
515 tmp_dst.writemask = WRITEMASK_X;
516 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
517
518 tmp_dst.writemask = WRITEMASK_Y;
519 emit(SHR(tmp_dst, src0, src_reg(16u)));
520
521 dst.writemask = WRITEMASK_XY;
522 emit(F16TO32(dst, tmp_src));
523 }
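/* Worked example (illustrative): unpackHalf2x16(0x40003C00).
 * The AND leaves 0x3C00 in tmp.x, the SHR leaves 0x4000 in tmp.y, and
 * F16TO32 converts those halves to the floats (1.0, 2.0) in dst.xy.
 */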
524
525 void
526 vec4_visitor::visit_instructions(const exec_list *list)
527 {
528 foreach_list(node, list) {
529 ir_instruction *ir = (ir_instruction *)node;
530
531 base_ir = ir;
532 ir->accept(this);
533 }
534 }
535
536
537 static int
538 type_size(const struct glsl_type *type)
539 {
540 unsigned int i;
541 int size;
542
543 switch (type->base_type) {
544 case GLSL_TYPE_UINT:
545 case GLSL_TYPE_INT:
546 case GLSL_TYPE_FLOAT:
547 case GLSL_TYPE_BOOL:
548 if (type->is_matrix()) {
549 return type->matrix_columns;
550 } else {
551 /* Regardless of size of vector, it gets a vec4. This is bad
552 * packing for things like floats, but otherwise arrays become a
553 * mess. Hopefully a later pass over the code can pack scalars
554 * down if appropriate.
555 */
556 return 1;
557 }
558 case GLSL_TYPE_ARRAY:
559 assert(type->length > 0);
560 return type_size(type->fields.array) * type->length;
561 case GLSL_TYPE_STRUCT:
562 size = 0;
563 for (i = 0; i < type->length; i++) {
564 size += type_size(type->fields.structure[i].type);
565 }
566 return size;
567 case GLSL_TYPE_SAMPLER:
568 /* Samplers take up one slot in UNIFORMS[], but they're baked in
569 * at link time.
570 */
571 return 1;
572 case GLSL_TYPE_ATOMIC_UINT:
573 return 0;
574 case GLSL_TYPE_IMAGE:
575 case GLSL_TYPE_VOID:
576 case GLSL_TYPE_ERROR:
577 case GLSL_TYPE_INTERFACE:
578 assert(0);
579 break;
580 }
581
582 return 0;
583 }
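/* Worked examples of the slot counting above (illustrative):
 *
 *    float / vec3               -> 1 (every scalar or vector takes a full vec4)
 *    mat4                       -> 4 (one slot per column)
 *    float[3]                   -> 3 (element size times array length)
 *    struct { vec2 a; mat3 b; } -> 1 + 3 = 4 (sum over the fields)
 */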
584
585 int
586 vec4_visitor::virtual_grf_alloc(int size)
587 {
588 if (virtual_grf_array_size <= virtual_grf_count) {
589 if (virtual_grf_array_size == 0)
590 virtual_grf_array_size = 16;
591 else
592 virtual_grf_array_size *= 2;
593 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
594 virtual_grf_array_size);
595 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
596 virtual_grf_array_size);
597 }
598 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
599 virtual_grf_reg_count += size;
600 virtual_grf_sizes[virtual_grf_count] = size;
601 return virtual_grf_count++;
602 }
603
604 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
605 {
606 init();
607
608 this->file = GRF;
609 this->reg = v->virtual_grf_alloc(type_size(type));
610
611 if (type->is_array() || type->is_record()) {
612 this->swizzle = BRW_SWIZZLE_NOOP;
613 } else {
614 this->swizzle = swizzle_for_size(type->vector_elements);
615 }
616
617 this->type = brw_type_for_base_type(type);
618 }
619
620 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
621 {
622 init();
623
624 this->file = GRF;
625 this->reg = v->virtual_grf_alloc(type_size(type));
626
627 if (type->is_array() || type->is_record()) {
628 this->writemask = WRITEMASK_XYZW;
629 } else {
630 this->writemask = (1 << type->vector_elements) - 1;
631 }
632
633 this->type = brw_type_for_base_type(type);
634 }
635
636 /* Our support for uniforms is piggy-backed on the struct
637 * gl_fragment_program, because that's where the values actually
638 * get stored, rather than in some global gl_shader_program uniform
639 * store.
640 */
641 void
642 vec4_visitor::setup_uniform_values(ir_variable *ir)
643 {
644 int namelen = strlen(ir->name);
645
646 /* The data for our (non-builtin) uniforms is stored in a series of
647 * gl_uniform_driver_storage structs for each subcomponent that
648 * glGetUniformLocation() could name. We know it's been set up in the same
649 * order we'd walk the type, so walk the list of storage and find anything
650 * with our name, or the prefix of a component that starts with our name.
651 */
652 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
653 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
654
655 if (strncmp(ir->name, storage->name, namelen) != 0 ||
656 (storage->name[namelen] != 0 &&
657 storage->name[namelen] != '.' &&
658 storage->name[namelen] != '[')) {
659 continue;
660 }
661
662 gl_constant_value *components = storage->storage;
663 unsigned vector_count = (MAX2(storage->array_elements, 1) *
664 storage->type->matrix_columns);
665
666 for (unsigned s = 0; s < vector_count; s++) {
667 uniform_vector_size[uniforms] = storage->type->vector_elements;
668
669 int i;
670 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
671 stage_prog_data->param[uniforms * 4 + i] = &components->f;
672 components++;
673 }
674 for (; i < 4; i++) {
675 static float zero = 0;
676 stage_prog_data->param[uniforms * 4 + i] = &zero;
677 }
678
679 uniforms++;
680 }
681 }
682 }
683
684 void
685 vec4_visitor::setup_uniform_clipplane_values()
686 {
687 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
688
689 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
690 this->uniform_vector_size[this->uniforms] = 4;
691 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
692 this->userplane[i].type = BRW_REGISTER_TYPE_F;
693 for (int j = 0; j < 4; ++j) {
694 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
695 }
696 ++this->uniforms;
697 }
698 }
699
700 /* Our support for builtin uniforms is even scarier than non-builtin.
701 * It sits on top of the PROG_STATE_VAR parameters that are
702 * automatically updated from GL context state.
703 */
704 void
705 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
706 {
707 const ir_state_slot *const slots = ir->state_slots;
708 assert(ir->state_slots != NULL);
709
710 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
711 /* This state reference has already been set up by ir_to_mesa,
712 * but we'll get the same index back here. We can reference
713 * ParameterValues directly, since unlike brw_fs.cpp, we never
714 * add new state references during compile.
715 */
716 int index = _mesa_add_state_reference(this->prog->Parameters,
717 (gl_state_index *)slots[i].tokens);
718 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
719
720 this->uniform_vector_size[this->uniforms] = 0;
721 /* Add each of the unique swizzled channels of the element.
722 * This will end up matching the size of the glsl_type of this field.
723 */
724 int last_swiz = -1;
725 for (unsigned int j = 0; j < 4; j++) {
726 int swiz = GET_SWZ(slots[i].swizzle, j);
727 last_swiz = swiz;
728
729 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
730 if (swiz <= last_swiz)
731 this->uniform_vector_size[this->uniforms]++;
732 }
733 this->uniforms++;
734 }
735 }
736
737 dst_reg *
738 vec4_visitor::variable_storage(ir_variable *var)
739 {
740 return (dst_reg *)hash_table_find(this->variable_ht, var);
741 }
742
743 void
744 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
745 {
746 ir_expression *expr = ir->as_expression();
747
748 *predicate = BRW_PREDICATE_NORMAL;
749
750 if (expr) {
751 src_reg op[2];
752 vec4_instruction *inst;
753
754 assert(expr->get_num_operands() <= 2);
755 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
756 expr->operands[i]->accept(this);
757 op[i] = this->result;
758
759 resolve_ud_negate(&op[i]);
760 }
761
762 switch (expr->operation) {
763 case ir_unop_logic_not:
764 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
765 inst->conditional_mod = BRW_CONDITIONAL_Z;
766 break;
767
768 case ir_binop_logic_xor:
769 inst = emit(XOR(dst_null_d(), op[0], op[1]));
770 inst->conditional_mod = BRW_CONDITIONAL_NZ;
771 break;
772
773 case ir_binop_logic_or:
774 inst = emit(OR(dst_null_d(), op[0], op[1]));
775 inst->conditional_mod = BRW_CONDITIONAL_NZ;
776 break;
777
778 case ir_binop_logic_and:
779 inst = emit(AND(dst_null_d(), op[0], op[1]));
780 inst->conditional_mod = BRW_CONDITIONAL_NZ;
781 break;
782
783 case ir_unop_f2b:
784 if (brw->gen >= 6) {
785 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
786 } else {
787 inst = emit(MOV(dst_null_f(), op[0]));
788 inst->conditional_mod = BRW_CONDITIONAL_NZ;
789 }
790 break;
791
792 case ir_unop_i2b:
793 if (brw->gen >= 6) {
794 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
795 } else {
796 inst = emit(MOV(dst_null_d(), op[0]));
797 inst->conditional_mod = BRW_CONDITIONAL_NZ;
798 }
799 break;
800
801 case ir_binop_all_equal:
802 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
803 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
804 break;
805
806 case ir_binop_any_nequal:
807 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
808 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
809 break;
810
811 case ir_unop_any:
812 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
813 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
814 break;
815
816 case ir_binop_greater:
817 case ir_binop_gequal:
818 case ir_binop_less:
819 case ir_binop_lequal:
820 case ir_binop_equal:
821 case ir_binop_nequal:
822 emit(CMP(dst_null_d(), op[0], op[1],
823 brw_conditional_for_comparison(expr->operation)));
824 break;
825
826 default:
827 assert(!"not reached");
828 break;
829 }
830 return;
831 }
832
833 ir->accept(this);
834
835 resolve_ud_negate(&this->result);
836
837 if (brw->gen >= 6) {
838 vec4_instruction *inst = emit(AND(dst_null_d(),
839 this->result, src_reg(1)));
840 inst->conditional_mod = BRW_CONDITIONAL_NZ;
841 } else {
842 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
843 inst->conditional_mod = BRW_CONDITIONAL_NZ;
844 }
845 }
846
847 /**
848 * Emit a gen6 IF statement with the comparison folded into the IF
849 * instruction.
850 */
851 void
852 vec4_visitor::emit_if_gen6(ir_if *ir)
853 {
854 ir_expression *expr = ir->condition->as_expression();
855
856 if (expr) {
857 src_reg op[2];
858 dst_reg temp;
859
860 assert(expr->get_num_operands() <= 2);
861 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
862 expr->operands[i]->accept(this);
863 op[i] = this->result;
864 }
865
866 switch (expr->operation) {
867 case ir_unop_logic_not:
868 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
869 return;
870
871 case ir_binop_logic_xor:
872 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
873 return;
874
875 case ir_binop_logic_or:
876 temp = dst_reg(this, glsl_type::bool_type);
877 emit(OR(temp, op[0], op[1]));
878 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
879 return;
880
881 case ir_binop_logic_and:
882 temp = dst_reg(this, glsl_type::bool_type);
883 emit(AND(temp, op[0], op[1]));
884 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
885 return;
886
887 case ir_unop_f2b:
888 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
889 return;
890
891 case ir_unop_i2b:
892 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
893 return;
894
895 case ir_binop_greater:
896 case ir_binop_gequal:
897 case ir_binop_less:
898 case ir_binop_lequal:
899 case ir_binop_equal:
900 case ir_binop_nequal:
901 emit(IF(op[0], op[1],
902 brw_conditional_for_comparison(expr->operation)));
903 return;
904
905 case ir_binop_all_equal:
906 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
907 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
908 return;
909
910 case ir_binop_any_nequal:
911 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
912 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
913 return;
914
915 case ir_unop_any:
916 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
917 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
918 return;
919
920 default:
921 assert(!"not reached");
922 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
923 return;
924 }
925 return;
926 }
927
928 ir->condition->accept(this);
929
930 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
931 }
932
933 void
934 vec4_visitor::visit(ir_variable *ir)
935 {
936 dst_reg *reg = NULL;
937
938 if (variable_storage(ir))
939 return;
940
941 switch (ir->data.mode) {
942 case ir_var_shader_in:
943 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
944 break;
945
946 case ir_var_shader_out:
947 reg = new(mem_ctx) dst_reg(this, ir->type);
948
949 for (int i = 0; i < type_size(ir->type); i++) {
950 output_reg[ir->data.location + i] = *reg;
951 output_reg[ir->data.location + i].reg_offset = i;
952 output_reg[ir->data.location + i].type =
953 brw_type_for_base_type(ir->type->get_scalar_type());
954 output_reg_annotation[ir->data.location + i] = ir->name;
955 }
956 break;
957
958 case ir_var_auto:
959 case ir_var_temporary:
960 reg = new(mem_ctx) dst_reg(this, ir->type);
961 break;
962
963 case ir_var_uniform:
964 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
965
966 /* Thanks to the lower_ubo_reference pass, we will see only
967 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
968 * variables, so no need for them to be in variable_ht.
969 *
970 * Atomic counters take no uniform storage, no need to do
971 * anything here.
972 */
973 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
974 return;
975
976 /* Track how big the whole uniform variable is, in case we need to put a
977 * copy of its data into pull constants for array access.
978 */
979 this->uniform_size[this->uniforms] = type_size(ir->type);
980
981 if (!strncmp(ir->name, "gl_", 3)) {
982 setup_builtin_uniform_values(ir);
983 } else {
984 setup_uniform_values(ir);
985 }
986 break;
987
988 case ir_var_system_value:
989 reg = make_reg_for_system_value(ir);
990 break;
991
992 default:
993 assert(!"not reached");
994 }
995
996 reg->type = brw_type_for_base_type(ir->type);
997 hash_table_insert(this->variable_ht, reg, ir);
998 }
999
1000 void
1001 vec4_visitor::visit(ir_loop *ir)
1002 {
1003 /* We don't want debugging output to print the whole body of the
1004 * loop as the annotation.
1005 */
1006 this->base_ir = NULL;
1007
1008 emit(BRW_OPCODE_DO);
1009
1010 visit_instructions(&ir->body_instructions);
1011
1012 emit(BRW_OPCODE_WHILE);
1013 }
1014
1015 void
1016 vec4_visitor::visit(ir_loop_jump *ir)
1017 {
1018 switch (ir->mode) {
1019 case ir_loop_jump::jump_break:
1020 emit(BRW_OPCODE_BREAK);
1021 break;
1022 case ir_loop_jump::jump_continue:
1023 emit(BRW_OPCODE_CONTINUE);
1024 break;
1025 }
1026 }
1027
1028
1029 void
1030 vec4_visitor::visit(ir_function_signature *ir)
1031 {
1032 assert(0);
1033 (void)ir;
1034 }
1035
1036 void
1037 vec4_visitor::visit(ir_function *ir)
1038 {
1039 /* Ignore function bodies other than main() -- we shouldn't see calls to
1040 * them since they should all be inlined.
1041 */
1042 if (strcmp(ir->name, "main") == 0) {
1043 const ir_function_signature *sig;
1044 exec_list empty;
1045
1046 sig = ir->matching_signature(NULL, &empty);
1047
1048 assert(sig);
1049
1050 visit_instructions(&sig->body);
1051 }
1052 }
1053
1054 bool
1055 vec4_visitor::try_emit_sat(ir_expression *ir)
1056 {
1057 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1058 if (!sat_src)
1059 return false;
1060
1061 sat_src->accept(this);
1062 src_reg src = this->result;
1063
1064 this->result = src_reg(this, ir->type);
1065 vec4_instruction *inst;
1066 inst = emit(MOV(dst_reg(this->result), src));
1067 inst->saturate = true;
1068
1069 return true;
1070 }
1071
1072 bool
1073 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1074 {
1075 /* 3-src instructions were introduced in gen6. */
1076 if (brw->gen < 6)
1077 return false;
1078
1079 /* MAD can only handle floating-point data. */
1080 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1081 return false;
1082
1083 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1084 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1085
1086 if (!mul || mul->operation != ir_binop_mul)
1087 return false;
1088
1089 nonmul->accept(this);
1090 src_reg src0 = fix_3src_operand(this->result);
1091
1092 mul->operands[0]->accept(this);
1093 src_reg src1 = fix_3src_operand(this->result);
1094
1095 mul->operands[1]->accept(this);
1096 src_reg src2 = fix_3src_operand(this->result);
1097
1098 this->result = src_reg(this, ir->type);
1099 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1100
1101 return true;
1102 }
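/* Example (illustrative): for the IR expression add(mul(a, b), c) the code
 * above emits MAD(dst, c, a, b), i.e. dst = a * b + c -- the addend goes in
 * src0 while the two factors occupy src1 and src2.
 */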
1103
1104 void
1105 vec4_visitor::emit_bool_comparison(unsigned int op,
1106 dst_reg dst, src_reg src0, src_reg src1)
1107 {
1108 /* original gen4 does destination conversion before comparison. */
1109 if (brw->gen < 5)
1110 dst.type = src0.type;
1111
1112 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1113
1114 dst.type = BRW_REGISTER_TYPE_D;
1115 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1116 }
1117
1118 void
1119 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1120 src_reg src0, src_reg src1)
1121 {
1122 vec4_instruction *inst;
1123
1124 if (brw->gen >= 6) {
1125 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1126 inst->conditional_mod = conditionalmod;
1127 } else {
1128 emit(CMP(dst, src0, src1, conditionalmod));
1129
1130 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1131 inst->predicate = BRW_PREDICATE_NORMAL;
1132 }
1133 }
1134
1135 static bool
1136 is_16bit_constant(ir_rvalue *rvalue)
1137 {
1138 ir_constant *constant = rvalue->as_constant();
1139 if (!constant)
1140 return false;
1141
1142 if (constant->type != glsl_type::int_type &&
1143 constant->type != glsl_type::uint_type)
1144 return false;
1145
1146 return constant->value.u[0] < (1 << 16);
1147 }
1148
1149 void
1150 vec4_visitor::visit(ir_expression *ir)
1151 {
1152 unsigned int operand;
1153 src_reg op[Elements(ir->operands)];
1154 src_reg result_src;
1155 dst_reg result_dst;
1156 vec4_instruction *inst;
1157
1158 if (try_emit_sat(ir))
1159 return;
1160
1161 if (ir->operation == ir_binop_add) {
1162 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1163 return;
1164 }
1165
1166 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1167 this->result.file = BAD_FILE;
1168 ir->operands[operand]->accept(this);
1169 if (this->result.file == BAD_FILE) {
1170 fprintf(stderr, "Failed to get tree for expression operand:\n");
1171 ir->operands[operand]->fprint(stderr);
1172 exit(1);
1173 }
1174 op[operand] = this->result;
1175
1176 /* Matrix expression operands should have been broken down to vector
1177 * operations already.
1178 */
1179 assert(!ir->operands[operand]->type->is_matrix());
1180 }
1181
1182 int vector_elements = ir->operands[0]->type->vector_elements;
1183 if (ir->operands[1]) {
1184 vector_elements = MAX2(vector_elements,
1185 ir->operands[1]->type->vector_elements);
1186 }
1187
1188 this->result.file = BAD_FILE;
1189
1190 /* Storage for our result. Ideally for an assignment we'd be using
1191 * the actual storage for the result here, instead.
1192 */
1193 result_src = src_reg(this, ir->type);
1194 /* convenience for the emit functions below. */
1195 result_dst = dst_reg(result_src);
1196 /* If nothing special happens, this is the result. */
1197 this->result = result_src;
1198 /* Limit writes to the channels that will be used by result_src later.
1199 * This does limit this temp's use as a temporary for multi-instruction
1200 * sequences.
1201 */
1202 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1203
1204 switch (ir->operation) {
1205 case ir_unop_logic_not:
1206 /* Note that BRW_OPCODE_NOT is not appropriate here, since it computes
1207 * the one's complement of the whole register, not just bit 0.
1208 */
1209 emit(XOR(result_dst, op[0], src_reg(1)));
1210 break;
1211 case ir_unop_neg:
1212 op[0].negate = !op[0].negate;
1213 emit(MOV(result_dst, op[0]));
1214 break;
1215 case ir_unop_abs:
1216 op[0].abs = true;
1217 op[0].negate = false;
1218 emit(MOV(result_dst, op[0]));
1219 break;
1220
1221 case ir_unop_sign:
1222 if (ir->type->is_float()) {
1223 /* AND(val, 0x80000000) gives the sign bit.
1224 *
1225 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1226 * zero.
1227 */
1228 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1229
1230 op[0].type = BRW_REGISTER_TYPE_UD;
1231 result_dst.type = BRW_REGISTER_TYPE_UD;
1232 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1233
1234 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1235 inst->predicate = BRW_PREDICATE_NORMAL;
1236
1237 this->result.type = BRW_REGISTER_TYPE_F;
1238 } else {
1239 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1240 * -> non-negative val generates 0x00000000.
1241 * Predicated OR sets 1 if val is positive.
1242 */
1243 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1244
1245 emit(ASR(result_dst, op[0], src_reg(31)));
1246
1247 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1248 inst->predicate = BRW_PREDICATE_NORMAL;
1249 }
1250 break;
1251
1252 case ir_unop_rcp:
1253 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1254 break;
1255
1256 case ir_unop_exp2:
1257 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1258 break;
1259 case ir_unop_log2:
1260 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1261 break;
1262 case ir_unop_exp:
1263 case ir_unop_log:
1264 assert(!"not reached: should be handled by ir_explog_to_explog2");
1265 break;
1266 case ir_unop_sin:
1267 case ir_unop_sin_reduced:
1268 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1269 break;
1270 case ir_unop_cos:
1271 case ir_unop_cos_reduced:
1272 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1273 break;
1274
1275 case ir_unop_dFdx:
1276 case ir_unop_dFdy:
1277 assert(!"derivatives not valid in vertex shader");
1278 break;
1279
1280 case ir_unop_bitfield_reverse:
1281 emit(BFREV(result_dst, op[0]));
1282 break;
1283 case ir_unop_bit_count:
1284 emit(CBIT(result_dst, op[0]));
1285 break;
1286 case ir_unop_find_msb: {
1287 src_reg temp = src_reg(this, glsl_type::uint_type);
1288
1289 inst = emit(FBH(dst_reg(temp), op[0]));
1290 inst->dst.writemask = WRITEMASK_XYZW;
1291
1292 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1293 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1294 * subtract the result from 31 to convert the MSB count into an LSB count.
1295 */
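/* Worked example (illustrative): findMSB(0x00000010) should return 4.
 * FBH, counting from the MSB, yields 27, and 31 - 27 = 4. For an input of 0,
 * FBH returns 0xFFFFFFFF, the CMP below leaves the predicate unset, and the
 * result stays at -1 as GLSL requires.
 */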
1296
1297 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1298 temp.swizzle = BRW_SWIZZLE_NOOP;
1299 emit(MOV(result_dst, temp));
1300
1301 src_reg src_tmp = src_reg(result_dst);
1302 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1303
1304 src_tmp.negate = true;
1305 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1306 inst->predicate = BRW_PREDICATE_NORMAL;
1307 break;
1308 }
1309 case ir_unop_find_lsb:
1310 emit(FBL(result_dst, op[0]));
1311 break;
1312
1313 case ir_unop_noise:
1314 assert(!"not reached: should be handled by lower_noise");
1315 break;
1316
1317 case ir_binop_add:
1318 emit(ADD(result_dst, op[0], op[1]));
1319 break;
1320 case ir_binop_sub:
1321 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1322 break;
1323
1324 case ir_binop_mul:
1325 if (brw->gen < 8 && ir->type->is_integer()) {
1326 /* For integer multiplication, the MUL uses the low 16 bits of one of
1327 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1328 * accumulates the contribution of the upper 16 bits of that
1329 * operand. If we can determine that one of the args is in the low
1330 * 16 bits, though, we can just emit a single MUL.
1331 */
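/* Example of the shortcut (illustrative): for "x * 3" the constant fits in
 * 16 bits, so a single MUL is enough; the only subtlety is which source must
 * carry the constant -- src0 on gen < 7, src1 on gen >= 7 -- which is why
 * the operand order differs between the two emits below.
 */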
1332 if (is_16bit_constant(ir->operands[0])) {
1333 if (brw->gen < 7)
1334 emit(MUL(result_dst, op[0], op[1]));
1335 else
1336 emit(MUL(result_dst, op[1], op[0]));
1337 } else if (is_16bit_constant(ir->operands[1])) {
1338 if (brw->gen < 7)
1339 emit(MUL(result_dst, op[1], op[0]));
1340 else
1341 emit(MUL(result_dst, op[0], op[1]));
1342 } else {
1343 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1344
1345 emit(MUL(acc, op[0], op[1]));
1346 emit(MACH(dst_null_d(), op[0], op[1]));
1347 emit(MOV(result_dst, src_reg(acc)));
1348 }
1349 } else {
1350 emit(MUL(result_dst, op[0], op[1]));
1351 }
1352 break;
1353 case ir_binop_imul_high: {
1354 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1355
1356 emit(MUL(acc, op[0], op[1]));
1357 emit(MACH(result_dst, op[0], op[1]));
1358 break;
1359 }
1360 case ir_binop_div:
1361 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1362 assert(ir->type->is_integer());
1363 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1364 break;
1365 case ir_binop_carry: {
1366 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1367
1368 emit(ADDC(dst_null_ud(), op[0], op[1]));
1369 emit(MOV(result_dst, src_reg(acc)));
1370 break;
1371 }
1372 case ir_binop_borrow: {
1373 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1374
1375 emit(SUBB(dst_null_ud(), op[0], op[1]));
1376 emit(MOV(result_dst, src_reg(acc)));
1377 break;
1378 }
1379 case ir_binop_mod:
1380 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1381 assert(ir->type->is_integer());
1382 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1383 break;
1384
1385 case ir_binop_less:
1386 case ir_binop_greater:
1387 case ir_binop_lequal:
1388 case ir_binop_gequal:
1389 case ir_binop_equal:
1390 case ir_binop_nequal: {
1391 emit(CMP(result_dst, op[0], op[1],
1392 brw_conditional_for_comparison(ir->operation)));
1393 emit(AND(result_dst, result_src, src_reg(0x1)));
1394 break;
1395 }
1396
1397 case ir_binop_all_equal:
1398 /* "==" operator producing a scalar boolean. */
1399 if (ir->operands[0]->type->is_vector() ||
1400 ir->operands[1]->type->is_vector()) {
1401 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1402 emit(MOV(result_dst, src_reg(0)));
1403 inst = emit(MOV(result_dst, src_reg(1)));
1404 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1405 } else {
1406 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1407 emit(AND(result_dst, result_src, src_reg(0x1)));
1408 }
1409 break;
1410 case ir_binop_any_nequal:
1411 /* "!=" operator producing a scalar boolean. */
1412 if (ir->operands[0]->type->is_vector() ||
1413 ir->operands[1]->type->is_vector()) {
1414 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1415
1416 emit(MOV(result_dst, src_reg(0)));
1417 inst = emit(MOV(result_dst, src_reg(1)));
1418 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1419 } else {
1420 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1421 emit(AND(result_dst, result_src, src_reg(0x1)));
1422 }
1423 break;
1424
1425 case ir_unop_any:
1426 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1427 emit(MOV(result_dst, src_reg(0)));
1428
1429 inst = emit(MOV(result_dst, src_reg(1)));
1430 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1431 break;
1432
1433 case ir_binop_logic_xor:
1434 emit(XOR(result_dst, op[0], op[1]));
1435 break;
1436
1437 case ir_binop_logic_or:
1438 emit(OR(result_dst, op[0], op[1]));
1439 break;
1440
1441 case ir_binop_logic_and:
1442 emit(AND(result_dst, op[0], op[1]));
1443 break;
1444
1445 case ir_binop_dot:
1446 assert(ir->operands[0]->type->is_vector());
1447 assert(ir->operands[0]->type == ir->operands[1]->type);
1448 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1449 break;
1450
1451 case ir_unop_sqrt:
1452 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1453 break;
1454 case ir_unop_rsq:
1455 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1456 break;
1457
1458 case ir_unop_bitcast_i2f:
1459 case ir_unop_bitcast_u2f:
1460 this->result = op[0];
1461 this->result.type = BRW_REGISTER_TYPE_F;
1462 break;
1463
1464 case ir_unop_bitcast_f2i:
1465 this->result = op[0];
1466 this->result.type = BRW_REGISTER_TYPE_D;
1467 break;
1468
1469 case ir_unop_bitcast_f2u:
1470 this->result = op[0];
1471 this->result.type = BRW_REGISTER_TYPE_UD;
1472 break;
1473
1474 case ir_unop_i2f:
1475 case ir_unop_i2u:
1476 case ir_unop_u2i:
1477 case ir_unop_u2f:
1478 case ir_unop_b2f:
1479 case ir_unop_b2i:
1480 case ir_unop_f2i:
1481 case ir_unop_f2u:
1482 emit(MOV(result_dst, op[0]));
1483 break;
1484 case ir_unop_f2b:
1485 case ir_unop_i2b: {
1486 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1487 emit(AND(result_dst, result_src, src_reg(1)));
1488 break;
1489 }
1490
1491 case ir_unop_trunc:
1492 emit(RNDZ(result_dst, op[0]));
1493 break;
1494 case ir_unop_ceil:
1495 op[0].negate = !op[0].negate;
1496 inst = emit(RNDD(result_dst, op[0]));
1497 this->result.negate = true;
1498 break;
1499 case ir_unop_floor:
1500 inst = emit(RNDD(result_dst, op[0]));
1501 break;
1502 case ir_unop_fract:
1503 inst = emit(FRC(result_dst, op[0]));
1504 break;
1505 case ir_unop_round_even:
1506 emit(RNDE(result_dst, op[0]));
1507 break;
1508
1509 case ir_binop_min:
1510 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1511 break;
1512 case ir_binop_max:
1513 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1514 break;
1515
1516 case ir_binop_pow:
1517 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1518 break;
1519
1520 case ir_unop_bit_not:
1521 inst = emit(NOT(result_dst, op[0]));
1522 break;
1523 case ir_binop_bit_and:
1524 inst = emit(AND(result_dst, op[0], op[1]));
1525 break;
1526 case ir_binop_bit_xor:
1527 inst = emit(XOR(result_dst, op[0], op[1]));
1528 break;
1529 case ir_binop_bit_or:
1530 inst = emit(OR(result_dst, op[0], op[1]));
1531 break;
1532
1533 case ir_binop_lshift:
1534 inst = emit(SHL(result_dst, op[0], op[1]));
1535 break;
1536
1537 case ir_binop_rshift:
1538 if (ir->type->base_type == GLSL_TYPE_INT)
1539 inst = emit(ASR(result_dst, op[0], op[1]));
1540 else
1541 inst = emit(SHR(result_dst, op[0], op[1]));
1542 break;
1543
1544 case ir_binop_bfm:
1545 emit(BFI1(result_dst, op[0], op[1]));
1546 break;
1547
1548 case ir_binop_ubo_load: {
1549 ir_constant *uniform_block = ir->operands[0]->as_constant();
1550 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1551 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1552 src_reg offset;
1553
1554 /* Now, load the vector from that offset. */
1555 assert(ir->type->is_vector() || ir->type->is_scalar());
1556
1557 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1558 packed_consts.type = result.type;
1559 src_reg surf_index =
1560 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1561 if (const_offset_ir) {
1562 if (brw->gen >= 8) {
1563 /* Store the offset in a GRF so we can send-from-GRF. */
1564 offset = src_reg(this, glsl_type::int_type);
1565 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1566 } else {
1567 /* Immediates are fine on older generations since they'll be moved
1568 * to a (potentially fake) MRF at the generator level.
1569 */
1570 offset = src_reg(const_offset / 16);
1571 }
1572 } else {
1573 offset = src_reg(this, glsl_type::uint_type);
1574 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1575 }
1576
1577 if (brw->gen >= 7) {
1578 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1579 grf_offset.type = offset.type;
1580
1581 emit(MOV(grf_offset, offset));
1582
1583 emit(new(mem_ctx) vec4_instruction(this,
1584 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1585 dst_reg(packed_consts),
1586 surf_index,
1587 src_reg(grf_offset)));
1588 } else {
1589 vec4_instruction *pull =
1590 emit(new(mem_ctx) vec4_instruction(this,
1591 VS_OPCODE_PULL_CONSTANT_LOAD,
1592 dst_reg(packed_consts),
1593 surf_index,
1594 offset));
1595 pull->base_mrf = 14;
1596 pull->mlen = 1;
1597 }
1598
1599 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1600 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1601 const_offset % 16 / 4,
1602 const_offset % 16 / 4,
1603 const_offset % 16 / 4);
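/* Example (illustrative): a float at UBO byte offset 24 is fetched as the
 * vec4 at offset 16 (const_offset / 16 above), and 24 % 16 / 4 == 2 selects
 * its z component for every channel of the result.
 */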
1604
1605 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1606 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1607 emit(CMP(result_dst, packed_consts, src_reg(0u),
1608 BRW_CONDITIONAL_NZ));
1609 emit(AND(result_dst, result, src_reg(0x1)));
1610 } else {
1611 emit(MOV(result_dst, packed_consts));
1612 }
1613 break;
1614 }
1615
1616 case ir_binop_vector_extract:
1617 assert(!"should have been lowered by vec_index_to_cond_assign");
1618 break;
1619
1620 case ir_triop_fma:
1621 op[0] = fix_3src_operand(op[0]);
1622 op[1] = fix_3src_operand(op[1]);
1623 op[2] = fix_3src_operand(op[2]);
1624 /* Note that the instruction's argument order is reversed from GLSL
1625 * and the IR.
1626 */
1627 emit(MAD(result_dst, op[2], op[1], op[0]));
1628 break;
1629
1630 case ir_triop_lrp:
1631 op[0] = fix_3src_operand(op[0]);
1632 op[1] = fix_3src_operand(op[1]);
1633 op[2] = fix_3src_operand(op[2]);
1634 /* Note that the instruction's argument order is reversed from GLSL
1635 * and the IR.
1636 */
1637 emit(LRP(result_dst, op[2], op[1], op[0]));
1638 break;
1639
1640 case ir_triop_csel:
1641 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1642 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1643 inst->predicate = BRW_PREDICATE_NORMAL;
1644 break;
1645
1646 case ir_triop_bfi:
1647 op[0] = fix_3src_operand(op[0]);
1648 op[1] = fix_3src_operand(op[1]);
1649 op[2] = fix_3src_operand(op[2]);
1650 emit(BFI2(result_dst, op[0], op[1], op[2]));
1651 break;
1652
1653 case ir_triop_bitfield_extract:
1654 op[0] = fix_3src_operand(op[0]);
1655 op[1] = fix_3src_operand(op[1]);
1656 op[2] = fix_3src_operand(op[2]);
1657 /* Note that the instruction's argument order is reversed from GLSL
1658 * and the IR.
1659 */
1660 emit(BFE(result_dst, op[2], op[1], op[0]));
1661 break;
1662
1663 case ir_triop_vector_insert:
1664 assert(!"should have been lowered by lower_vector_insert");
1665 break;
1666
1667 case ir_quadop_bitfield_insert:
1668 assert(!"not reached: should be handled by "
1669 "bitfield_insert_to_bfm_bfi\n");
1670 break;
1671
1672 case ir_quadop_vector:
1673 assert(!"not reached: should be handled by lower_quadop_vector");
1674 break;
1675
1676 case ir_unop_pack_half_2x16:
1677 emit_pack_half_2x16(result_dst, op[0]);
1678 break;
1679 case ir_unop_unpack_half_2x16:
1680 emit_unpack_half_2x16(result_dst, op[0]);
1681 break;
1682 case ir_unop_pack_snorm_2x16:
1683 case ir_unop_pack_snorm_4x8:
1684 case ir_unop_pack_unorm_2x16:
1685 case ir_unop_pack_unorm_4x8:
1686 case ir_unop_unpack_snorm_2x16:
1687 case ir_unop_unpack_snorm_4x8:
1688 case ir_unop_unpack_unorm_2x16:
1689 case ir_unop_unpack_unorm_4x8:
1690 assert(!"not reached: should be handled by lower_packing_builtins");
1691 break;
1692 case ir_unop_unpack_half_2x16_split_x:
1693 case ir_unop_unpack_half_2x16_split_y:
1694 case ir_binop_pack_half_2x16_split:
1695 assert(!"not reached: should not occur in vertex shader");
1696 break;
1697 case ir_binop_ldexp:
1698 assert(!"not reached: should be handled by ldexp_to_arith()");
1699 break;
1700 }
1701 }
1702
1703
1704 void
1705 vec4_visitor::visit(ir_swizzle *ir)
1706 {
1707 src_reg src;
1708 int i = 0;
1709 int swizzle[4];
1710
1711 /* Note that this is only swizzles in expressions, not those on the left
1712 * hand side of an assignment, which do write masking. See ir_assignment
1713 * for that.
1714 */
1715
1716 ir->val->accept(this);
1717 src = this->result;
1718 assert(src.file != BAD_FILE);
1719
1720 for (i = 0; i < ir->type->vector_elements; i++) {
1721 switch (i) {
1722 case 0:
1723 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1724 break;
1725 case 1:
1726 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1727 break;
1728 case 2:
1729 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1730 break;
1731 case 3:
1732 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1733 break;
1734 }
1735 }
1736 for (; i < 4; i++) {
1737 /* Replicate the last channel out. */
1738 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1739 }
1740
1741 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1742
1743 this->result = src;
1744 }
1745
1746 void
1747 vec4_visitor::visit(ir_dereference_variable *ir)
1748 {
1749 const struct glsl_type *type = ir->type;
1750 dst_reg *reg = variable_storage(ir->var);
1751
1752 if (!reg) {
1753 fail("Failed to find variable storage for %s\n", ir->var->name);
1754 this->result = src_reg(brw_null_reg());
1755 return;
1756 }
1757
1758 this->result = src_reg(*reg);
1759
1760 /* System values get their swizzle from the dst_reg writemask */
1761 if (ir->var->data.mode == ir_var_system_value)
1762 return;
1763
1764 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1765 this->result.swizzle = swizzle_for_size(type->vector_elements);
1766 }
1767
1768
1769 int
1770 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1771 {
1772 /* Under normal circumstances array elements are stored consecutively, so
1773 * the stride is equal to the size of the array element.
1774 */
1775 return type_size(ir->type);
1776 }
1777
1778
1779 void
1780 vec4_visitor::visit(ir_dereference_array *ir)
1781 {
1782 ir_constant *constant_index;
1783 src_reg src;
1784 int array_stride = compute_array_stride(ir);
1785
1786 constant_index = ir->array_index->constant_expression_value();
1787
1788 ir->array->accept(this);
1789 src = this->result;
1790
1791 if (constant_index) {
1792 src.reg_offset += constant_index->value.i[0] * array_stride;
1793 } else {
1794 /* Variable index array dereference. It eats the "vec4" of the
1795 * base of the array and an index that offsets the Mesa register
1796 * index.
1797 */
1798 ir->array_index->accept(this);
1799
1800 src_reg index_reg;
1801
1802 if (array_stride == 1) {
1803 index_reg = this->result;
1804 } else {
1805 index_reg = src_reg(this, glsl_type::int_type);
1806
1807 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1808 }
1809
1810 if (src.reladdr) {
1811 src_reg temp = src_reg(this, glsl_type::int_type);
1812
1813 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1814
1815 index_reg = temp;
1816 }
1817
1818 src.reladdr = ralloc(mem_ctx, src_reg);
1819 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1820 }
1821
1822 /* If the type is smaller than a vec4, replicate the last channel out. */
1823 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1824 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1825 else
1826 src.swizzle = BRW_SWIZZLE_NOOP;
1827 src.type = brw_type_for_base_type(ir->type);
1828
1829 this->result = src;
1830 }
1831
1832 void
1833 vec4_visitor::visit(ir_dereference_record *ir)
1834 {
1835 unsigned int i;
1836 const glsl_type *struct_type = ir->record->type;
1837 int offset = 0;
1838
1839 ir->record->accept(this);
1840
1841 for (i = 0; i < struct_type->length; i++) {
1842 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1843 break;
1844 offset += type_size(struct_type->fields.structure[i].type);
1845 }
1846
1847 /* If the type is smaller than a vec4, replicate the last channel out. */
1848 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1849 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1850 else
1851 this->result.swizzle = BRW_SWIZZLE_NOOP;
1852 this->result.type = brw_type_for_base_type(ir->type);
1853
1854 this->result.reg_offset += offset;
1855 }
1856
1857 /**
1858 * We want to be careful in assignment setup to hit the actual storage
1859 * instead of potentially using a temporary like we might with the
1860 * ir_dereference handler.
1861 */
1862 static dst_reg
1863 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1864 {
1865 /* The LHS must be a dereference. If the LHS is a variable indexed array
1866 * access of a vector, it must be separated into a series conditional moves
1867 * before reaching this point (see ir_vec_index_to_cond_assign).
1868 */
1869 assert(ir->as_dereference());
1870 ir_dereference_array *deref_array = ir->as_dereference_array();
1871 if (deref_array) {
1872 assert(!deref_array->array->type->is_vector());
1873 }
1874
1875 /* Use the rvalue deref handler for the most part. We'll ignore
1876 * swizzles in it and write swizzles using writemask, though.
1877 */
1878 ir->accept(v);
1879 return dst_reg(v->result);
1880 }
1881
1882 void
1883 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1884 const struct glsl_type *type, uint32_t predicate)
1885 {
1886 if (type->base_type == GLSL_TYPE_STRUCT) {
1887 for (unsigned int i = 0; i < type->length; i++) {
1888 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1889 }
1890 return;
1891 }
1892
1893 if (type->is_array()) {
1894 for (unsigned int i = 0; i < type->length; i++) {
1895 emit_block_move(dst, src, type->fields.array, predicate);
1896 }
1897 return;
1898 }
1899
1900 if (type->is_matrix()) {
1901 const struct glsl_type *vec_type;
1902
1903 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1904 type->vector_elements, 1);
1905
1906 for (int i = 0; i < type->matrix_columns; i++) {
1907 emit_block_move(dst, src, vec_type, predicate);
1908 }
1909 return;
1910 }
1911
1912 assert(type->is_scalar() || type->is_vector());
1913
1914 dst->type = brw_type_for_base_type(type);
1915 src->type = dst->type;
1916
1917 dst->writemask = (1 << type->vector_elements) - 1;
1918
1919 src->swizzle = swizzle_for_size(type->vector_elements);
1920
1921 vec4_instruction *inst = emit(MOV(*dst, *src));
1922 inst->predicate = predicate;
1923
1924 dst->reg_offset++;
1925 src->reg_offset++;
1926 }
1927
1928
1929 /* If the RHS processing resulted in an instruction generating a
1930 * temporary value, and it would be easy to rewrite the instruction to
1931 * generate its result right into the LHS instead, do so. This ends
1932 * up reliably removing instructions where it can be tricky to do so
1933 * later without real UD chain information.
1934 */
1935 bool
1936 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1937 dst_reg dst,
1938 src_reg src,
1939 vec4_instruction *pre_rhs_inst,
1940 vec4_instruction *last_rhs_inst)
1941 {
1942 /* This could be supported, but it would take more smarts. */
1943 if (ir->condition)
1944 return false;
1945
1946 if (pre_rhs_inst == last_rhs_inst)
1947 return false; /* No instructions generated to work with. */
1948
1949 /* Make sure the last instruction generated our source reg. */
1950 if (src.file != GRF ||
1951 src.file != last_rhs_inst->dst.file ||
1952 src.reg != last_rhs_inst->dst.reg ||
1953 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1954 src.reladdr ||
1955 src.abs ||
1956 src.negate ||
1957 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1958 return false;
1959
1960    /* Check that the last instruction fully initialized the channels
1961    * we want to use, in the order we want to use them.  We could
1962    * potentially reswizzle the operands of many instructions so that
1963    * we could handle out-of-order channels, but we don't yet.
1964    */
1965
1966 for (unsigned i = 0; i < 4; i++) {
1967 if (dst.writemask & (1 << i)) {
1968 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1969 return false;
1970
1971 if (BRW_GET_SWZ(src.swizzle, i) != i)
1972 return false;
1973 }
1974 }
1975
1976 /* Success! Rewrite the instruction. */
1977 last_rhs_inst->dst.file = dst.file;
1978 last_rhs_inst->dst.reg = dst.reg;
1979 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1980 last_rhs_inst->dst.reladdr = dst.reladdr;
1981 last_rhs_inst->dst.writemask &= dst.writemask;
1982
1983 return true;
1984 }
1985
1986 void
1987 vec4_visitor::visit(ir_assignment *ir)
1988 {
1989 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1990 uint32_t predicate = BRW_PREDICATE_NONE;
1991
1992 if (!ir->lhs->type->is_scalar() &&
1993 !ir->lhs->type->is_vector()) {
1994 ir->rhs->accept(this);
1995 src_reg src = this->result;
1996
1997 if (ir->condition) {
1998 emit_bool_to_cond_code(ir->condition, &predicate);
1999 }
2000
2001 /* emit_block_move doesn't account for swizzles in the source register.
2002 * This should be ok, since the source register is a structure or an
2003 * array, and those can't be swizzled. But double-check to be sure.
2004 */
2005 assert(src.swizzle ==
2006 (ir->rhs->type->is_matrix()
2007 ? swizzle_for_size(ir->rhs->type->vector_elements)
2008 : BRW_SWIZZLE_NOOP));
2009
2010 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2011 return;
2012 }
2013
2014 /* Now we're down to just a scalar/vector with writemasks. */
2015 int i;
2016
2017 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2018 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2019
2020 ir->rhs->accept(this);
2021
2022 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2023
2024 src_reg src = this->result;
2025
2026 int swizzles[4];
2027 int first_enabled_chan = 0;
2028 int src_chan = 0;
2029
2030 assert(ir->lhs->type->is_vector() ||
2031 ir->lhs->type->is_scalar());
2032 dst.writemask = ir->write_mask;
2033
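   /* Find the swizzle component feeding the first written channel; channels
    * that aren't written will replicate it so every swizzle slot stays valid.
    */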
2034 for (int i = 0; i < 4; i++) {
2035 if (dst.writemask & (1 << i)) {
2036 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2037 break;
2038 }
2039 }
2040
2041    /* Swizzle a small RHS vector into the channels being written.
2042    *
2043    * GLSL IR treats write_mask as dictating how many channels are
2044    * present on the RHS, while in our instructions we need those
2045    * channels to appear in the slots of the vec4 they're written to.
2046    */
2047 for (int i = 0; i < 4; i++) {
2048 if (dst.writemask & (1 << i))
2049 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2050 else
2051 swizzles[i] = first_enabled_chan;
2052 }
2053 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2054 swizzles[2], swizzles[3]);
2055
2056 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2057 return;
2058 }
2059
2060 if (ir->condition) {
2061 emit_bool_to_cond_code(ir->condition, &predicate);
2062 }
2063
2064 for (i = 0; i < type_size(ir->lhs->type); i++) {
2065 vec4_instruction *inst = emit(MOV(dst, src));
2066 inst->predicate = predicate;
2067
2068 dst.reg_offset++;
2069 src.reg_offset++;
2070 }
2071 }
2072
2073 void
2074 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2075 {
2076 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2077 foreach_list(node, &ir->components) {
2078 ir_constant *field_value = (ir_constant *)node;
2079
2080 emit_constant_values(dst, field_value);
2081 }
2082 return;
2083 }
2084
2085 if (ir->type->is_array()) {
2086 for (unsigned int i = 0; i < ir->type->length; i++) {
2087 emit_constant_values(dst, ir->array_elements[i]);
2088 }
2089 return;
2090 }
2091
2092 if (ir->type->is_matrix()) {
2093 for (int i = 0; i < ir->type->matrix_columns; i++) {
2094 float *vec = &ir->value.f[i * ir->type->vector_elements];
2095
2096 for (int j = 0; j < ir->type->vector_elements; j++) {
2097 dst->writemask = 1 << j;
2098 dst->type = BRW_REGISTER_TYPE_F;
2099
2100 emit(MOV(*dst, src_reg(vec[j])));
2101 }
2102 dst->reg_offset++;
2103 }
2104 return;
2105 }
2106
2107 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2108
2109 for (int i = 0; i < ir->type->vector_elements; i++) {
2110 if (!(remaining_writemask & (1 << i)))
2111 continue;
2112
2113 dst->writemask = 1 << i;
2114 dst->type = brw_type_for_base_type(ir->type);
2115
2116 /* Find other components that match the one we're about to
2117 * write. Emits fewer instructions for things like vec4(0.5,
2118 * 1.5, 1.5, 1.5).
2119 */
2120 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2121 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2122 if (ir->value.b[i] == ir->value.b[j])
2123 dst->writemask |= (1 << j);
2124 } else {
2125 /* u, i, and f storage all line up, so no need for a
2126 * switch case for comparing each type.
2127 */
2128 if (ir->value.u[i] == ir->value.u[j])
2129 dst->writemask |= (1 << j);
2130 }
2131 }
2132
2133 switch (ir->type->base_type) {
2134 case GLSL_TYPE_FLOAT:
2135 emit(MOV(*dst, src_reg(ir->value.f[i])));
2136 break;
2137 case GLSL_TYPE_INT:
2138 emit(MOV(*dst, src_reg(ir->value.i[i])));
2139 break;
2140 case GLSL_TYPE_UINT:
2141 emit(MOV(*dst, src_reg(ir->value.u[i])));
2142 break;
2143 case GLSL_TYPE_BOOL:
2144 emit(MOV(*dst, src_reg(ir->value.b[i])));
2145 break;
2146 default:
2147 assert(!"Non-float/uint/int/bool constant");
2148 break;
2149 }
2150
2151 remaining_writemask &= ~dst->writemask;
2152 }
2153 dst->reg_offset++;
2154 }
2155
2156 void
2157 vec4_visitor::visit(ir_constant *ir)
2158 {
2159 dst_reg dst = dst_reg(this, ir->type);
2160 this->result = src_reg(dst);
2161
2162 emit_constant_values(&dst, ir);
2163 }
2164
2165 void
2166 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2167 {
2168 ir_dereference *deref = static_cast<ir_dereference *>(
2169 ir->actual_parameters.get_head());
2170 ir_variable *location = deref->variable_referenced();
2171 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2172 location->data.atomic.buffer_index);
2173
2174 /* Calculate the surface offset */
2175 src_reg offset(this, glsl_type::uint_type);
2176 ir_dereference_array *deref_array = deref->as_dereference_array();
2177 if (deref_array) {
2178 deref_array->array_index->accept(this);
2179
2180 src_reg tmp(this, glsl_type::uint_type);
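      /* offset = array_index * ATOMIC_COUNTER_SIZE + the counter's own
       * offset within the buffer.
       */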
2181 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2182 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2183 } else {
2184 offset = location->data.atomic.offset;
2185 }
2186
2187 /* Emit the appropriate machine instruction */
2188 const char *callee = ir->callee->function_name();
2189 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2190
2191 if (!strcmp("__intrinsic_atomic_read", callee)) {
2192 emit_untyped_surface_read(surf_index, dst, offset);
2193
2194 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2195 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2196 src_reg(), src_reg());
2197
2198 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2199 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2200 src_reg(), src_reg());
2201 }
2202 }
2203
2204 void
2205 vec4_visitor::visit(ir_call *ir)
2206 {
2207 const char *callee = ir->callee->function_name();
2208
2209 if (!strcmp("__intrinsic_atomic_read", callee) ||
2210 !strcmp("__intrinsic_atomic_increment", callee) ||
2211 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2212 visit_atomic_counter_intrinsic(ir);
2213 } else {
2214 assert(!"Unsupported intrinsic.");
2215 }
2216 }
2217
2218 src_reg
2219 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2220 {
2221 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2222 inst->base_mrf = 2;
2223 inst->mlen = 1;
2224 inst->sampler = sampler;
2225 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2226 inst->dst.writemask = WRITEMASK_XYZW;
2227
2228    /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2229 int param_base = inst->base_mrf;
2230 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2231 int zero_mask = 0xf & ~coord_mask;
2232
2233 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2234 coordinate));
2235
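   /* Zero out any coordinate channels the coordinate type doesn't cover so
    * the payload register is fully defined.
    */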
2236 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2237 src_reg(0)));
2238
2239 emit(inst);
2240 return src_reg(inst->dst);
2241 }
2242
2243 void
2244 vec4_visitor::visit(ir_texture *ir)
2245 {
2246 int sampler =
2247 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2248
2249 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2250 * emitting anything other than setting up the constant result.
2251 */
2252 if (ir->op == ir_tg4) {
2253 ir_constant *chan = ir->lod_info.component->as_constant();
2254 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2255 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2256 dst_reg result(this, ir->type);
2257 this->result = src_reg(result);
2258 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2259 return;
2260 }
2261 }
2262
2263 /* Should be lowered by do_lower_texture_projection */
2264 assert(!ir->projector);
2265
2266    /* Array offsets should already be lowered. */
2267 assert(!ir->offset || !ir->offset->type->is_array());
2268
2269 /* Generate code to compute all the subexpression trees. This has to be
2270 * done before loading any values into MRFs for the sampler message since
2271 * generating these values may involve SEND messages that need the MRFs.
2272 */
2273 src_reg coordinate;
2274 if (ir->coordinate) {
2275 ir->coordinate->accept(this);
2276 coordinate = this->result;
2277 }
2278
2279 src_reg shadow_comparitor;
2280 if (ir->shadow_comparitor) {
2281 ir->shadow_comparitor->accept(this);
2282 shadow_comparitor = this->result;
2283 }
2284
2285 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2286 src_reg offset_value;
2287 if (has_nonconstant_offset) {
2288 ir->offset->accept(this);
2289 offset_value = src_reg(this->result);
2290 }
2291
2292 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2293 src_reg lod, dPdx, dPdy, sample_index, mcs;
2294 switch (ir->op) {
2295 case ir_tex:
2296 lod = src_reg(0.0f);
2297 lod_type = glsl_type::float_type;
2298 break;
2299 case ir_txf:
2300 case ir_txl:
2301 case ir_txs:
2302 ir->lod_info.lod->accept(this);
2303 lod = this->result;
2304 lod_type = ir->lod_info.lod->type;
2305 break;
2306 case ir_query_levels:
2307 lod = src_reg(0);
2308 lod_type = glsl_type::int_type;
2309 break;
2310 case ir_txf_ms:
2311 ir->lod_info.sample_index->accept(this);
2312 sample_index = this->result;
2313 sample_index_type = ir->lod_info.sample_index->type;
2314
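      /* On Gen7+ with a compressed multisample surface, fetch the MCS value
       * for this texel up front; otherwise the MCS argument is just zero.
       */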
2315 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2316 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2317 else
2318 mcs = src_reg(0u);
2319 break;
2320 case ir_txd:
2321 ir->lod_info.grad.dPdx->accept(this);
2322 dPdx = this->result;
2323
2324 ir->lod_info.grad.dPdy->accept(this);
2325 dPdy = this->result;
2326
2327 lod_type = ir->lod_info.grad.dPdx->type;
2328 break;
2329 case ir_txb:
2330 case ir_lod:
2331 case ir_tg4:
2332 break;
2333 }
2334
2335 vec4_instruction *inst = NULL;
2336 switch (ir->op) {
2337 case ir_tex:
2338 case ir_txl:
2339 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2340 break;
2341 case ir_txd:
2342 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2343 break;
2344 case ir_txf:
2345 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2346 break;
2347 case ir_txf_ms:
2348 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2349 break;
2350 case ir_txs:
2351 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2352 break;
2353 case ir_tg4:
2354 if (has_nonconstant_offset)
2355 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2356 else
2357 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2358 break;
2359 case ir_query_levels:
2360 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2361 break;
2362 case ir_txb:
2363 assert(!"TXB is not valid for vertex shaders.");
2364 break;
2365 case ir_lod:
2366 assert(!"LOD is not valid for vertex shaders.");
2367 break;
2368 default:
2369 assert(!"Unrecognized tex op");
2370 }
2371
2372 if (ir->offset != NULL && ir->op != ir_txf)
2373 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2374
2375 /* Stuff the channel select bits in the top of the texture offset */
2376 if (ir->op == ir_tg4)
2377 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2378
2379 /* The message header is necessary for:
2380 * - Gen4 (always)
2381 * - Texel offsets
2382 * - Gather channel selection
2383 * - Sampler indices too large to fit in a 4-bit value.
2384 */
2385 inst->header_present =
2386 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2387 sampler >= 16;
2388 inst->base_mrf = 2;
2389 inst->mlen = inst->header_present + 1; /* always at least one */
2390 inst->sampler = sampler;
2391 inst->dst = dst_reg(this, ir->type);
2392 inst->dst.writemask = WRITEMASK_XYZW;
2393 inst->shadow_compare = ir->shadow_comparitor != NULL;
2394
2395 /* MRF for the first parameter */
2396 int param_base = inst->base_mrf + inst->header_present;
2397
2398 if (ir->op == ir_txs || ir->op == ir_query_levels) {
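      /* txs and query_levels take only an LOD; it goes in .w on Gen4 and
       * .x on Gen5+.
       */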
2399 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2400 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2401 } else {
2402 /* Load the coordinate */
2403 /* FINISHME: gl_clamp_mask and saturate */
2404 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2405 int zero_mask = 0xf & ~coord_mask;
2406
2407 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2408 coordinate));
2409
2410 if (zero_mask != 0) {
2411 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2412 src_reg(0)));
2413 }
2414       /* Load the shadow comparator */
2415 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2416 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2417 WRITEMASK_X),
2418 shadow_comparitor));
2419 inst->mlen++;
2420 }
2421
2422 /* Load the LOD info */
2423 if (ir->op == ir_tex || ir->op == ir_txl) {
2424 int mrf, writemask;
2425 if (brw->gen >= 5) {
2426 mrf = param_base + 1;
2427 if (ir->shadow_comparitor) {
2428 writemask = WRITEMASK_Y;
2429 /* mlen already incremented */
2430 } else {
2431 writemask = WRITEMASK_X;
2432 inst->mlen++;
2433 }
2434 } else /* brw->gen == 4 */ {
2435 mrf = param_base;
2436 writemask = WRITEMASK_W;
2437 }
2438 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2439 } else if (ir->op == ir_txf) {
2440 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2441 } else if (ir->op == ir_txf_ms) {
2442 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2443 sample_index));
2444          /* MCS data is in the first channel of `mcs`, but we need to get it into
2445           * the .y channel of the second vec4 of params, so replicate .x across
2446           * the whole vec4 and then mask off everything except .y.
2447           */
2448          if (brw->gen >= 7)
2449             mcs.swizzle = BRW_SWIZZLE_XXXX;
2450 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2451 mcs));
2452 inst->mlen++;
2453 } else if (ir->op == ir_txd) {
2454 const glsl_type *type = lod_type;
2455
2456 if (brw->gen >= 5) {
2457 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2458 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2459 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2460 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2461 inst->mlen++;
2462
2463 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2464 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2465 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2466 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2467 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2468 inst->mlen++;
2469
2470 if (ir->shadow_comparitor) {
2471 emit(MOV(dst_reg(MRF, param_base + 2,
2472 ir->shadow_comparitor->type, WRITEMASK_Z),
2473 shadow_comparitor));
2474 }
2475 }
2476 } else /* brw->gen == 4 */ {
2477 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2478 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2479 inst->mlen += 2;
2480 }
2481 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2482 if (ir->shadow_comparitor) {
2483 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2484 shadow_comparitor));
2485 }
2486
2487 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2488 offset_value));
2489 inst->mlen++;
2490 }
2491 }
2492
2493 emit(inst);
2494
2495    /* Fix up the number of layers (z) for cube arrays: the hardware returns
2496    * faces * layers, but the spec requires just layers.
2497    */
2498 if (ir->op == ir_txs) {
2499 glsl_type const *type = ir->sampler->type;
2500 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2501 type->sampler_array) {
2502 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2503 writemask(inst->dst, WRITEMASK_Z),
2504 src_reg(inst->dst), src_reg(6));
2505 }
2506 }
2507
2508 if (brw->gen == 6 && ir->op == ir_tg4) {
2509 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2510 }
2511
2512 swizzle_result(ir, src_reg(inst->dst), sampler);
2513 }
2514
2515 /**
2516 * Apply workarounds for Gen6 gather with UINT/SINT
2517 */
2518 void
2519 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2520 {
2521 if (!wa)
2522 return;
2523
2524 int width = (wa & WA_8BIT) ? 8 : 16;
2525 dst_reg dst_f = dst;
2526 dst_f.type = BRW_REGISTER_TYPE_F;
2527
2528 /* Convert from UNORM to UINT */
2529 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2530 emit(MOV(dst, src_reg(dst_f)));
2531
2532 if (wa & WA_SIGN) {
2533 /* Reinterpret the UINT value as a signed INT value by
2534 * shifting the sign bit into place, then shifting back
2535 * preserving sign.
2536 */
2537 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2538 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2539 }
2540 }
2541
2542 /**
2543 * Set up the gather channel based on the swizzle, for gather4.
2544 */
2545 uint32_t
2546 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2547 {
2548 ir_constant *chan = ir->lod_info.component->as_constant();
2549 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2550 switch (swiz) {
2551 case SWIZZLE_X: return 0;
2552 case SWIZZLE_Y:
2553 /* gather4 sampler is broken for green channel on RG32F --
2554 * we must ask for blue instead.
2555 */
2556 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2557 return 2;
2558 return 1;
2559 case SWIZZLE_Z: return 2;
2560 case SWIZZLE_W: return 3;
2561 default:
2562 assert(!"Not reached"); /* zero, one swizzles handled already */
2563 return 0;
2564 }
2565 }
2566
2567 void
2568 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2569 {
2570 int s = key->tex.swizzles[sampler];
2571
2572 this->result = src_reg(this, ir->type);
2573 dst_reg swizzled_result(this->result);
2574
2575 if (ir->op == ir_query_levels) {
2576 /* # levels is in .w */
2577 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2578 emit(MOV(swizzled_result, orig_val));
2579 return;
2580 }
2581
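   /* txs results, single-float results, identity swizzles, and gather (tg4)
    * results, whose channel select was applied earlier, are copied through
    * without swizzling.
    */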
2582 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2583 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2584 emit(MOV(swizzled_result, orig_val));
2585 return;
2586 }
2587
2588
2589 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2590 int swizzle[4] = {0};
2591
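   /* Classify each destination channel: copied from the texture result,
    * forced to zero, or forced to one.
    */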
2592 for (int i = 0; i < 4; i++) {
2593 switch (GET_SWZ(s, i)) {
2594 case SWIZZLE_ZERO:
2595 zero_mask |= (1 << i);
2596 break;
2597 case SWIZZLE_ONE:
2598 one_mask |= (1 << i);
2599 break;
2600 default:
2601 copy_mask |= (1 << i);
2602 swizzle[i] = GET_SWZ(s, i);
2603 break;
2604 }
2605 }
2606
2607 if (copy_mask) {
2608 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2609 swizzled_result.writemask = copy_mask;
2610 emit(MOV(swizzled_result, orig_val));
2611 }
2612
2613 if (zero_mask) {
2614 swizzled_result.writemask = zero_mask;
2615 emit(MOV(swizzled_result, src_reg(0.0f)));
2616 }
2617
2618 if (one_mask) {
2619 swizzled_result.writemask = one_mask;
2620 emit(MOV(swizzled_result, src_reg(1.0f)));
2621 }
2622 }
2623
2624 void
2625 vec4_visitor::visit(ir_return *ir)
2626 {
2627 assert(!"not reached");
2628 }
2629
2630 void
2631 vec4_visitor::visit(ir_discard *ir)
2632 {
2633 assert(!"not reached");
2634 }
2635
2636 void
2637 vec4_visitor::visit(ir_if *ir)
2638 {
2639    /* Don't point the annotation at the if statement, because then it, plus
2640    * the then and else blocks, all get printed.
2641    */
2642 this->base_ir = ir->condition;
2643
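   /* Gen6 has an IF instruction with an embedded comparison (emit_if_gen6);
    * other generations evaluate the condition into a predicate first and
    * emit a predicated IF.
    */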
2644 if (brw->gen == 6) {
2645 emit_if_gen6(ir);
2646 } else {
2647 uint32_t predicate;
2648 emit_bool_to_cond_code(ir->condition, &predicate);
2649 emit(IF(predicate));
2650 }
2651
2652 visit_instructions(&ir->then_instructions);
2653
2654 if (!ir->else_instructions.is_empty()) {
2655 this->base_ir = ir->condition;
2656 emit(BRW_OPCODE_ELSE);
2657
2658 visit_instructions(&ir->else_instructions);
2659 }
2660
2661 this->base_ir = ir->condition;
2662 emit(BRW_OPCODE_ENDIF);
2663 }
2664
2665 void
2666 vec4_visitor::visit(ir_emit_vertex *)
2667 {
2668 assert(!"not reached");
2669 }
2670
2671 void
2672 vec4_visitor::visit(ir_end_primitive *)
2673 {
2674 assert(!"not reached");
2675 }
2676
2677 void
2678 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2679 dst_reg dst, src_reg offset,
2680 src_reg src0, src_reg src1)
2681 {
2682 unsigned mlen = 0;
2683
2684 /* Set the atomic operation offset. */
2685 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2686 mlen++;
2687
2688 /* Set the atomic operation arguments. */
2689 if (src0.file != BAD_FILE) {
2690 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2691 mlen++;
2692 }
2693
2694 if (src1.file != BAD_FILE) {
2695 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2696 mlen++;
2697 }
2698
2699 /* Emit the instruction. Note that this maps to the normal SIMD8
2700 * untyped atomic message on Ivy Bridge, but that's OK because
2701 * unused channels will be masked out.
2702 */
2703 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2704 src_reg(atomic_op), src_reg(surf_index));
2705 inst->base_mrf = 0;
2706 inst->mlen = mlen;
2707 }
2708
2709 void
2710 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2711 src_reg offset)
2712 {
2713 /* Set the surface read offset. */
2714 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2715
2716 /* Emit the instruction. Note that this maps to the normal SIMD8
2717 * untyped surface read message, but that's OK because unused
2718 * channels will be masked out.
2719 */
2720 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2721 dst, src_reg(surf_index));
2722 inst->base_mrf = 0;
2723 inst->mlen = 1;
2724 }
2725
2726 void
2727 vec4_visitor::emit_ndc_computation()
2728 {
2729 /* Get the position */
2730 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2731
2732 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2733 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2734 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2735
2736 current_annotation = "NDC";
2737 dst_reg ndc_w = ndc;
2738 ndc_w.writemask = WRITEMASK_W;
2739 src_reg pos_w = pos;
2740 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2741 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2742
2743 dst_reg ndc_xyz = ndc;
2744 ndc_xyz.writemask = WRITEMASK_XYZ;
2745
2746 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2747 }
2748
2749 void
2750 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2751 {
2752 if (brw->gen < 6 &&
2753 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2754 key->userclip_active || brw->has_negative_rhw_bug)) {
2755 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2756 dst_reg header1_w = header1;
2757 header1_w.writemask = WRITEMASK_W;
2758
2759 emit(MOV(header1, 0u));
2760
2761 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2762 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2763
2764 current_annotation = "Point size";
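         /* Pack the point size into the header's 11-bit fixed-point
          * point-width field starting at bit 8.
          */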
2765 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2766 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2767 }
2768
2769 if (key->userclip_active) {
2770 current_annotation = "Clipping flags";
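         /* Compare each set of four clip distances against zero and OR the
          * resulting flag bits into the header; the second set is shifted up
          * by four bits.
          */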
2771 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2772 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2773
2774 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2775 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2776 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2777
2778 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2779 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2780 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2781 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2782 }
2783
2784 /* i965 clipping workaround:
2785 * 1) Test for -ve rhw
2786 * 2) If set,
2787 * set ndc = (0,0,0,0)
2788 * set ucp[6] = 1
2789 *
2790 * Later, clipping will detect ucp[6] and ensure the primitive is
2791 * clipped against all fixed planes.
2792 */
2793 if (brw->has_negative_rhw_bug) {
2794 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2795 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2796 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2797 vec4_instruction *inst;
2798 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2799 inst->predicate = BRW_PREDICATE_NORMAL;
2800 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2801 inst->predicate = BRW_PREDICATE_NORMAL;
2802 }
2803
2804 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2805 } else if (brw->gen < 6) {
2806 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2807 } else {
2808 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2809 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2810 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2811 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2812 }
2813 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2814 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2815 src_reg(output_reg[VARYING_SLOT_LAYER])));
2816 }
2817 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2818 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2819 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2820 }
2821 }
2822 }
2823
2824 void
2825 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2826 {
2827 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2828 *
2829 * "If a linked set of shaders forming the vertex stage contains no
2830 * static write to gl_ClipVertex or gl_ClipDistance, but the
2831 * application has requested clipping against user clip planes through
2832 * the API, then the coordinate written to gl_Position is used for
2833 * comparison against the user clip planes."
2834 *
2835 * This function is only called if the shader didn't write to
2836 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2837 * if the user wrote to it; otherwise we use gl_Position.
2838 */
2839 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2840 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2841 clip_vertex = VARYING_SLOT_POS;
2842 }
2843
2844 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2845 ++i) {
2846 reg.writemask = 1 << i;
2847 emit(DP4(reg,
2848 src_reg(output_reg[clip_vertex]),
2849 src_reg(this->userplane[i + offset])));
2850 }
2851 }
2852
2853 void
2854 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2855 {
2856 assert (varying < VARYING_SLOT_MAX);
2857 reg.type = output_reg[varying].type;
2858 current_annotation = output_reg_annotation[varying];
2859 /* Copy the register, saturating if necessary */
2860 vec4_instruction *inst = emit(MOV(reg,
2861 src_reg(output_reg[varying])));
2862 if ((varying == VARYING_SLOT_COL0 ||
2863 varying == VARYING_SLOT_COL1 ||
2864 varying == VARYING_SLOT_BFC0 ||
2865 varying == VARYING_SLOT_BFC1) &&
2866 key->clamp_vertex_color) {
2867 inst->saturate = true;
2868 }
2869 }
2870
2871 void
2872 vec4_visitor::emit_urb_slot(int mrf, int varying)
2873 {
2874 struct brw_reg hw_reg = brw_message_reg(mrf);
2875 dst_reg reg = dst_reg(MRF, mrf);
2876 reg.type = BRW_REGISTER_TYPE_F;
2877
2878 switch (varying) {
2879 case VARYING_SLOT_PSIZ:
2880 /* PSIZ is always in slot 0, and is coupled with other flags. */
2881 current_annotation = "indices, point width, clip flags";
2882 emit_psiz_and_flags(hw_reg);
2883 break;
2884 case BRW_VARYING_SLOT_NDC:
2885 current_annotation = "NDC";
2886 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2887 break;
2888 case VARYING_SLOT_POS:
2889 current_annotation = "gl_Position";
2890 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2891 break;
2892 case VARYING_SLOT_EDGE:
2893 /* This is present when doing unfilled polygons. We're supposed to copy
2894 * the edge flag from the user-provided vertex array
2895 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2896 * of that attribute (starts as 1.0f). This is then used in clipping to
2897 * determine which edges should be drawn as wireframe.
2898 */
2899 current_annotation = "edge flag";
2900 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2901 glsl_type::float_type, WRITEMASK_XYZW))));
2902 break;
2903 case BRW_VARYING_SLOT_PAD:
2904 /* No need to write to this slot */
2905 break;
2906 default:
2907 emit_generic_urb_slot(reg, varying);
2908 break;
2909 }
2910 }
2911
2912 static int
2913 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2914 {
2915 if (brw->gen >= 6) {
2916 /* URB data written (does not include the message header reg) must
2917 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2918 * section 5.4.3.2.2: URB_INTERLEAVED.
2919 *
2920 * URB entries are allocated on a multiple of 1024 bits, so an
2921 * extra 128 bits written here to make the end align to 256 is
2922 * no problem.
2923 */
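      /* mlen includes the message header register, so the data portion is
       * mlen - 1; requiring an even amount of data means mlen must be odd.
       */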
2924 if ((mlen % 2) != 1)
2925 mlen++;
2926 }
2927
2928 return mlen;
2929 }
2930
2931
2932 /**
2933 * Generates the VUE payload plus the necessary URB write instructions to
2934 * output it.
2935 *
2936 * The VUE layout is documented in Volume 2a.
2937 */
2938 void
2939 vec4_visitor::emit_vertex()
2940 {
2941 /* MRF 0 is reserved for the debugger, so start with message header
2942 * in MRF 1.
2943 */
2944 int base_mrf = 1;
2945 int mrf = base_mrf;
2946 /* In the process of generating our URB write message contents, we
2947 * may need to unspill a register or load from an array. Those
2948 * reads would use MRFs 14-15.
2949 */
2950 int max_usable_mrf = 13;
2951
2952 /* The following assertion verifies that max_usable_mrf causes an
2953 * even-numbered amount of URB write data, which will meet gen6's
2954 * requirements for length alignment.
2955 */
2956 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2957
2958 /* First mrf is the g0-based message header containing URB handles and
2959 * such.
2960 */
2961 emit_urb_write_header(mrf++);
2962
2963 if (brw->gen < 6) {
2964 emit_ndc_computation();
2965 }
2966
2967    /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
2968 if (key->userclip_active && !prog->UsesClipDistanceOut) {
2969 current_annotation = "user clip distances";
2970
2971 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2972 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2973
2974 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2975 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2976 }
2977
2978 /* We may need to split this up into several URB writes, so do them in a
2979 * loop.
2980 */
2981 int slot = 0;
2982 bool complete = false;
2983 do {
2984 /* URB offset is in URB row increments, and each of our MRFs is half of
2985 * one of those, since we're doing interleaved writes.
2986 */
2987 int offset = slot / 2;
2988
2989 mrf = base_mrf + 1;
2990 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2991 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2992
2993 /* If this was max_usable_mrf, we can't fit anything more into this
2994 * URB WRITE.
2995 */
2996 if (mrf > max_usable_mrf) {
2997 slot++;
2998 break;
2999 }
3000 }
3001
3002 complete = slot >= prog_data->vue_map.num_slots;
3003 current_annotation = "URB write";
3004 vec4_instruction *inst = emit_urb_write_opcode(complete);
3005 inst->base_mrf = base_mrf;
3006 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3007 inst->offset += offset;
3008 } while(!complete);
3009 }
3010
3011
3012 src_reg
3013 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3014 src_reg *reladdr, int reg_offset)
3015 {
3016 /* Because we store the values to scratch interleaved like our
3017 * vertex data, we need to scale the vec4 index by 2.
3018 */
3019 int message_header_scale = 2;
3020
3021 /* Pre-gen6, the message header uses byte offsets instead of vec4
3022 * (16-byte) offset units.
3023 */
3024 if (brw->gen < 6)
3025 message_header_scale *= 16;
3026
3027 if (reladdr) {
3028 src_reg index = src_reg(this, glsl_type::int_type);
3029
3030 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3031 emit_before(inst, MUL(dst_reg(index),
3032 index, src_reg(message_header_scale)));
3033
3034 return index;
3035 } else {
3036 return src_reg(reg_offset * message_header_scale);
3037 }
3038 }
3039
3040 src_reg
3041 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3042 src_reg *reladdr, int reg_offset)
3043 {
3044 if (reladdr) {
3045 src_reg index = src_reg(this, glsl_type::int_type);
3046
3047 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3048
3049 /* Pre-gen6, the message header uses byte offsets instead of vec4
3050 * (16-byte) offset units.
3051 */
3052 if (brw->gen < 6) {
3053 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3054 }
3055
3056 return index;
3057 } else if (brw->gen >= 8) {
3058 /* Store the offset in a GRF so we can send-from-GRF. */
3059 src_reg offset = src_reg(this, glsl_type::int_type);
3060 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3061 return offset;
3062 } else {
3063 int message_header_scale = brw->gen < 6 ? 16 : 1;
3064 return src_reg(reg_offset * message_header_scale);
3065 }
3066 }
3067
3068 /**
3069 * Emits an instruction before @inst to load the value named by @orig_src
3070 * from scratch space at @base_offset to @temp.
3071 *
3072 * @base_offset is measured in 32-byte units (the size of a register).
3073 */
3074 void
3075 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3076 dst_reg temp, src_reg orig_src,
3077 int base_offset)
3078 {
3079 int reg_offset = base_offset + orig_src.reg_offset;
3080 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3081
3082 emit_before(inst, SCRATCH_READ(temp, index));
3083 }
3084
3085 /**
3086 * Emits an instruction after @inst to store the value to be written
3087 * to @orig_dst to scratch space at @base_offset, from @temp.
3088 *
3089 * @base_offset is measured in 32-byte units (the size of a register).
3090 */
3091 void
3092 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3093 {
3094 int reg_offset = base_offset + inst->dst.reg_offset;
3095 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3096
3097 /* Create a temporary register to store *inst's result in.
3098 *
3099 * We have to be careful in MOVing from our temporary result register in
3100 * the scratch write. If we swizzle from channels of the temporary that
3101 * weren't initialized, it will confuse live interval analysis, which will
3102 * make spilling fail to make progress.
3103 */
3104 src_reg temp = src_reg(this, glsl_type::vec4_type);
3105 temp.type = inst->dst.type;
3106 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3107 int swizzles[4];
3108 for (int i = 0; i < 4; i++)
3109 if (inst->dst.writemask & (1 << i))
3110 swizzles[i] = i;
3111 else
3112 swizzles[i] = first_writemask_chan;
3113 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3114 swizzles[2], swizzles[3]);
3115
3116 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3117 inst->dst.writemask));
3118 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3119 write->predicate = inst->predicate;
3120 write->ir = inst->ir;
3121 write->annotation = inst->annotation;
3122 inst->insert_after(write);
3123
3124 inst->dst.file = temp.file;
3125 inst->dst.reg = temp.reg;
3126 inst->dst.reg_offset = temp.reg_offset;
3127 inst->dst.reladdr = NULL;
3128 }
3129
3130 /**
3131 * We can't generally support array access in GRF space, because a
3132 * single instruction's destination can only span 2 contiguous
3133 * registers. So, we send all GRF arrays that get variable index
3134 * access to scratch space.
3135 */
3136 void
3137 vec4_visitor::move_grf_array_access_to_scratch()
3138 {
3139 int scratch_loc[this->virtual_grf_count];
3140
3141 for (int i = 0; i < this->virtual_grf_count; i++) {
3142 scratch_loc[i] = -1;
3143 }
3144
3145 /* First, calculate the set of virtual GRFs that need to be punted
3146 * to scratch due to having any array access on them, and where in
3147 * scratch.
3148 */
3149 foreach_list(node, &this->instructions) {
3150 vec4_instruction *inst = (vec4_instruction *)node;
3151
3152 if (inst->dst.file == GRF && inst->dst.reladdr &&
3153 scratch_loc[inst->dst.reg] == -1) {
3154 scratch_loc[inst->dst.reg] = c->last_scratch;
3155 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3156 }
3157
3158 for (int i = 0 ; i < 3; i++) {
3159 src_reg *src = &inst->src[i];
3160
3161 if (src->file == GRF && src->reladdr &&
3162 scratch_loc[src->reg] == -1) {
3163 scratch_loc[src->reg] = c->last_scratch;
3164 c->last_scratch += this->virtual_grf_sizes[src->reg];
3165 }
3166 }
3167 }
3168
3169 /* Now, for anything that will be accessed through scratch, rewrite
3170 * it to load/store. Note that this is a _safe list walk, because
3171 * we may generate a new scratch_write instruction after the one
3172 * we're processing.
3173 */
3174 foreach_list_safe(node, &this->instructions) {
3175 vec4_instruction *inst = (vec4_instruction *)node;
3176
3177       /* Set up the annotation tracking for newly generated instructions. */
3178 base_ir = inst->ir;
3179 current_annotation = inst->annotation;
3180
3181 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3182 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3183 }
3184
3185 for (int i = 0 ; i < 3; i++) {
3186 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3187 continue;
3188
3189 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3190
3191 emit_scratch_read(inst, temp, inst->src[i],
3192 scratch_loc[inst->src[i].reg]);
3193
3194 inst->src[i].file = temp.file;
3195 inst->src[i].reg = temp.reg;
3196 inst->src[i].reg_offset = temp.reg_offset;
3197 inst->src[i].reladdr = NULL;
3198 }
3199 }
3200 }
3201
3202 /**
3203 * Emits an instruction before @inst to load the value named by @orig_src
3204 * from the pull constant buffer (surface) at @base_offset to @temp.
3205 */
3206 void
3207 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3208 dst_reg temp, src_reg orig_src,
3209 int base_offset)
3210 {
3211 int reg_offset = base_offset + orig_src.reg_offset;
3212 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3213 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3214 vec4_instruction *load;
3215
3216 if (brw->gen >= 7) {
3217 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3218 grf_offset.type = offset.type;
3219 emit_before(inst, MOV(grf_offset, offset));
3220
3221 load = new(mem_ctx) vec4_instruction(this,
3222 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3223 temp, index, src_reg(grf_offset));
3224 } else {
3225 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3226 temp, index, offset);
3227 load->base_mrf = 14;
3228 load->mlen = 1;
3229 }
3230 emit_before(inst, load);
3231 }
3232
3233 /**
3234 * Implements array access of uniforms by inserting a
3235 * PULL_CONSTANT_LOAD instruction.
3236 *
3237 * Unlike temporary GRF array access (where we don't support it due to
3238 * the difficulty of doing relative addressing on instruction
3239 * destinations), we could potentially do array access of uniforms
3240 * that were loaded in GRF space as push constants. In real-world
3241 * usage we've seen, though, the arrays being used are always larger
3242 * than we could load as push constants, so just always move all
3243 * uniform array access out to a pull constant buffer.
3244 */
3245 void
3246 vec4_visitor::move_uniform_array_access_to_pull_constants()
3247 {
3248 int pull_constant_loc[this->uniforms];
3249
3250 for (int i = 0; i < this->uniforms; i++) {
3251 pull_constant_loc[i] = -1;
3252 }
3253
3254 /* Walk through and find array access of uniforms. Put a copy of that
3255 * uniform in the pull constant buffer.
3256 *
3257 * Note that we don't move constant-indexed accesses to arrays. No
3258 * testing has been done of the performance impact of this choice.
3259 */
3260 foreach_list_safe(node, &this->instructions) {
3261 vec4_instruction *inst = (vec4_instruction *)node;
3262
3263 for (int i = 0 ; i < 3; i++) {
3264 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3265 continue;
3266
3267 int uniform = inst->src[i].reg;
3268
3269 /* If this array isn't already present in the pull constant buffer,
3270 * add it.
3271 */
3272 if (pull_constant_loc[uniform] == -1) {
3273 const float **values = &stage_prog_data->param[uniform * 4];
3274
3275 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3276
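            /* Append every component of the array (uniform_size[uniform]
             * vec4s) to the pull parameter list.
             */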
3277 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3278 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3279 = values[j];
3280 }
3281 }
3282
3283          /* Set up the annotation tracking for newly generated instructions. */
3284 base_ir = inst->ir;
3285 current_annotation = inst->annotation;
3286
3287 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3288
3289 emit_pull_constant_load(inst, temp, inst->src[i],
3290 pull_constant_loc[uniform]);
3291
3292 inst->src[i].file = temp.file;
3293 inst->src[i].reg = temp.reg;
3294 inst->src[i].reg_offset = temp.reg_offset;
3295 inst->src[i].reladdr = NULL;
3296 }
3297 }
3298
3299 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3300 * no need to track them as larger-than-vec4 objects. This will be
3301 * relied on in cutting out unused uniform vectors from push
3302 * constants.
3303 */
3304 split_uniform_registers();
3305 }
3306
3307 void
3308 vec4_visitor::resolve_ud_negate(src_reg *reg)
3309 {
3310 if (reg->type != BRW_REGISTER_TYPE_UD ||
3311 !reg->negate)
3312 return;
3313
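   /* Apply the negation with an explicit MOV into an unsigned temporary and
    * then use the temporary, with no source modifier, in its place.
    */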
3314 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3315 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3316 *reg = temp;
3317 }
3318
3319 vec4_visitor::vec4_visitor(struct brw_context *brw,
3320 struct brw_vec4_compile *c,
3321 struct gl_program *prog,
3322 const struct brw_vec4_prog_key *key,
3323 struct brw_vec4_prog_data *prog_data,
3324 struct gl_shader_program *shader_prog,
3325 struct brw_shader *shader,
3326 void *mem_ctx,
3327 bool debug_flag,
3328 bool no_spills,
3329 shader_time_shader_type st_base,
3330 shader_time_shader_type st_written,
3331 shader_time_shader_type st_reset)
3332 : sanity_param_count(0),
3333 fail_msg(NULL),
3334 first_non_payload_grf(0),
3335 need_all_constants_in_pull_buffer(false),
3336 debug_flag(debug_flag),
3337 no_spills(no_spills),
3338 st_base(st_base),
3339 st_written(st_written),
3340 st_reset(st_reset)
3341 {
3342 this->brw = brw;
3343 this->ctx = &brw->ctx;
3344 this->shader_prog = shader_prog;
3345 this->shader = shader;
3346
3347 this->mem_ctx = mem_ctx;
3348 this->failed = false;
3349
3350 this->base_ir = NULL;
3351 this->current_annotation = NULL;
3352 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3353
3354 this->c = c;
3355 this->prog = prog;
3356 this->key = key;
3357 this->prog_data = prog_data;
3358 this->stage_prog_data = &prog_data->base;
3359
3360 this->variable_ht = hash_table_ctor(0,
3361 hash_table_pointer_hash,
3362 hash_table_pointer_compare);
3363
3364 this->virtual_grf_start = NULL;
3365 this->virtual_grf_end = NULL;
3366 this->virtual_grf_sizes = NULL;
3367 this->virtual_grf_count = 0;
3368 this->virtual_grf_reg_map = NULL;
3369 this->virtual_grf_reg_count = 0;
3370 this->virtual_grf_array_size = 0;
3371 this->live_intervals_valid = false;
3372
3373 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3374
3375 this->uniforms = 0;
3376 }
3377
3378 vec4_visitor::~vec4_visitor()
3379 {
3380 hash_table_dtor(this->variable_ht);
3381 }
3382
3383
3384 void
3385 vec4_visitor::fail(const char *format, ...)
3386 {
3387 va_list va;
3388 char *msg;
3389
3390 if (failed)
3391 return;
3392
3393 failed = true;
3394
3395 va_start(va, format);
3396 msg = ralloc_vasprintf(mem_ctx, format, va);
3397 va_end(va);
3398 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3399
3400 this->fail_msg = msg;
3401
3402 if (debug_flag) {
3403 fprintf(stderr, "%s", msg);
3404 }
3405 }
3406
3407 } /* namespace brw */