glsl: Replace most default cases in switches on GLSL type
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "main/context.h"
28 #include "main/macros.h"
29 #include "program/prog_parameter.h"
30 #include "program/sampler.h"
31 }
32
33 namespace brw {
34
35 vec4_instruction::vec4_instruction(vec4_visitor *v,
36 enum opcode opcode, dst_reg dst,
37 src_reg src0, src_reg src1, src_reg src2)
38 {
39 this->opcode = opcode;
40 this->dst = dst;
41 this->src[0] = src0;
42 this->src[1] = src1;
43 this->src[2] = src2;
44 this->ir = v->base_ir;
45 this->annotation = v->current_annotation;
46 }
47
48 vec4_instruction *
49 vec4_visitor::emit(vec4_instruction *inst)
50 {
51 this->instructions.push_tail(inst);
52
53 return inst;
54 }
55
56 vec4_instruction *
57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
58 {
59 new_inst->ir = inst->ir;
60 new_inst->annotation = inst->annotation;
61
62 inst->insert_before(new_inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
69 src_reg src0, src_reg src1, src_reg src2)
70 {
71 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
72 src0, src1, src2));
73 }
74
75
76 vec4_instruction *
77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
78 {
79 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
80 }
81
82 vec4_instruction *
83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
92 }
93
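/* Convenience builders: each ALU1/ALU2 expansion below defines a method that
 * constructs a single vec4 instruction of the named opcode without emitting
 * it, so callers can tweak fields (predicate, conditional_mod, saturate)
 * before passing it to emit().
 */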
94 #define ALU1(op) \
95 vec4_instruction * \
96 vec4_visitor::op(dst_reg dst, src_reg src0) \
97 { \
98 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
99 src0); \
100 }
101
102 #define ALU2(op) \
103 vec4_instruction * \
104 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
105 { \
106 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
107 src0, src1); \
108 }
109
110 ALU1(NOT)
111 ALU1(MOV)
112 ALU1(FRC)
113 ALU1(RNDD)
114 ALU1(RNDE)
115 ALU1(RNDZ)
116 ALU1(F32TO16)
117 ALU1(F16TO32)
118 ALU2(ADD)
119 ALU2(MUL)
120 ALU2(MACH)
121 ALU2(AND)
122 ALU2(OR)
123 ALU2(XOR)
124 ALU2(DP3)
125 ALU2(DP4)
126 ALU2(DPH)
127 ALU2(SHL)
128 ALU2(SHR)
129 ALU2(ASR)
130
131 /** Gen4 predicated IF. */
132 vec4_instruction *
133 vec4_visitor::IF(uint32_t predicate)
134 {
135 vec4_instruction *inst;
136
137 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
138 inst->predicate = predicate;
139
140 return inst;
141 }
142
143 /** Gen6+ IF with embedded comparison. */
144 vec4_instruction *
145 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
146 {
147 assert(intel->gen >= 6);
148
149 vec4_instruction *inst;
150
151 resolve_ud_negate(&src0);
152 resolve_ud_negate(&src1);
153
154 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
155 src0, src1);
156 inst->conditional_mod = condition;
157
158 return inst;
159 }
160
161 /**
162 * CMP: Sets the low bit of the destination channels with the result
163 * of the comparison, while the upper bits are undefined, and updates
164 * the flag register with the packed 16 bits of the result.
165 */
166 vec4_instruction *
167 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
168 {
169 vec4_instruction *inst;
170
171 /* original gen4 does type conversion to the destination type
172 * before the comparison, producing garbage results for floating
173 * point comparisons.
174 */
175 if (intel->gen == 4) {
176 dst.type = src0.type;
177 if (dst.file == HW_REG)
178 dst.fixed_hw_reg.type = dst.type;
179 }
180
181 resolve_ud_negate(&src0);
182 resolve_ud_negate(&src1);
183
184 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
185 inst->conditional_mod = condition;
186
187 return inst;
188 }
189
190 vec4_instruction *
191 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
192 {
193 vec4_instruction *inst;
194
195 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
196 dst, index);
197 inst->base_mrf = 14;
198 inst->mlen = 2;
199
200 return inst;
201 }
202
203 vec4_instruction *
204 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
205 {
206 vec4_instruction *inst;
207
208 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
209 dst, src, index);
210 inst->base_mrf = 13;
211 inst->mlen = 3;
212
213 return inst;
214 }
215
216 void
217 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
218 {
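   /* Map a 2-, 3- or 4-component dot product onto DP2/DP3/DP4; callers pass
    * the operand's vector_elements, so (elements - 2) indexes this table.
    */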
219 static enum opcode dot_opcodes[] = {
220 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
221 };
222
223 emit(dot_opcodes[elements - 2], dst, src0, src1);
224 }
225
226 src_reg
227 vec4_visitor::fix_math_operand(src_reg src)
228 {
229 /* The gen6 math instruction ignores the source modifiers --
230 * swizzle, abs, negate, and at least some parts of the register
231 * region description.
232 *
233 * Rather than trying to enumerate all these cases, *always* expand the
234 * operand to a temp GRF for gen6.
235 *
236 * For gen7, keep the operand as-is, except if immediate, which gen7 still
237 * can't use.
238 */
239
240 if (intel->gen == 7 && src.file != IMM)
241 return src;
242
243 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
244 expanded.type = src.type;
245 emit(MOV(expanded, src));
246 return src_reg(expanded);
247 }
248
249 void
250 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
251 {
252 src = fix_math_operand(src);
253
254 if (dst.writemask != WRITEMASK_XYZW) {
255 /* The gen6 math instruction must be align1, so we can't do
256 * writemasks.
257 */
258 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
259
260 emit(opcode, temp_dst, src);
261
262 emit(MOV(dst, src_reg(temp_dst)));
263 } else {
264 emit(opcode, dst, src);
265 }
266 }
267
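/* Pre-gen6, math is performed by a message to the shared math unit, so the
 * instruction records which MRF the payload starts in and how many registers
 * long the message is; the generator builds the actual send from these.
 */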
268 void
269 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
270 {
271 vec4_instruction *inst = emit(opcode, dst, src);
272 inst->base_mrf = 1;
273 inst->mlen = 1;
274 }
275
276 void
277 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
278 {
279 switch (opcode) {
280 case SHADER_OPCODE_RCP:
281 case SHADER_OPCODE_RSQ:
282 case SHADER_OPCODE_SQRT:
283 case SHADER_OPCODE_EXP2:
284 case SHADER_OPCODE_LOG2:
285 case SHADER_OPCODE_SIN:
286 case SHADER_OPCODE_COS:
287 break;
288 default:
289 assert(!"not reached: bad math opcode");
290 return;
291 }
292
293 if (intel->gen >= 6) {
294 return emit_math1_gen6(opcode, dst, src);
295 } else {
296 return emit_math1_gen4(opcode, dst, src);
297 }
298 }
299
300 void
301 vec4_visitor::emit_math2_gen6(enum opcode opcode,
302 dst_reg dst, src_reg src0, src_reg src1)
303 {
304 src0 = fix_math_operand(src0);
305 src1 = fix_math_operand(src1);
306
307 if (dst.writemask != WRITEMASK_XYZW) {
308 /* The gen6 math instruction must be align1, so we can't do
309 * writemasks.
310 */
311 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
312 temp_dst.type = dst.type;
313
314 emit(opcode, temp_dst, src0, src1);
315
316 emit(MOV(dst, src_reg(temp_dst)));
317 } else {
318 emit(opcode, dst, src0, src1);
319 }
320 }
321
322 void
323 vec4_visitor::emit_math2_gen4(enum opcode opcode,
324 dst_reg dst, src_reg src0, src_reg src1)
325 {
326 vec4_instruction *inst = emit(opcode, dst, src0, src1);
327 inst->base_mrf = 1;
328 inst->mlen = 2;
329 }
330
331 void
332 vec4_visitor::emit_math(enum opcode opcode,
333 dst_reg dst, src_reg src0, src_reg src1)
334 {
335 switch (opcode) {
336 case SHADER_OPCODE_POW:
337 case SHADER_OPCODE_INT_QUOTIENT:
338 case SHADER_OPCODE_INT_REMAINDER:
339 break;
340 default:
341 assert(!"not reached: unsupported binary math opcode");
342 return;
343 }
344
345 if (intel->gen >= 6) {
346 return emit_math2_gen6(opcode, dst, src0, src1);
347 } else {
348 return emit_math2_gen4(opcode, dst, src0, src1);
349 }
350 }
351
352 void
353 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
354 {
355 if (intel->gen < 7)
356 assert(!"ir_unop_pack_half_2x16 should be lowered");
357
358 assert(dst.type == BRW_REGISTER_TYPE_UD);
359 assert(src0.type == BRW_REGISTER_TYPE_F);
360
361 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
362 *
363 * Because this instruction does not have a 16-bit floating-point type,
364 * the destination data type must be Word (W).
365 *
366 * The destination must be DWord-aligned and specify a horizontal stride
367 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
368 * each destination channel and the upper word is not modified.
369 *
370 * The above restriction implies that the f32to16 instruction must use
371 * align1 mode, because only in align1 mode is it possible to specify
372 * horizontal stride. We choose here to defy the hardware docs and emit
373 * align16 instructions.
374 *
375 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
376 * instructions. I was partially successful in that the code passed all
377 * tests. However, the code was dubiously correct and fragile, and the
378 * tests were not harsh enough to probe that frailty. Not trusting the
379 * code, I chose instead to remain in align16 mode in defiance of the hw
380 * docs).
381 *
382 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
383 * simulator, emitting a f32to16 in align16 mode with UD as destination
384 * data type is safe. The behavior differs from that specified in the PRM
385 * in that the upper word of each destination channel is cleared to 0.
386 */
387
388 dst_reg tmp_dst(this, glsl_type::uvec2_type);
389 src_reg tmp_src(tmp_dst);
390
391 #if 0
392 /* Verify the undocumented behavior on which the following instructions
393 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
394 * then the result of the bit-or instruction below will be incorrect.
395 *
396 * You should inspect the disasm output in order to verify that the MOV is
397 * not optimized away.
398 */
399 emit(MOV(tmp_dst, src_reg(0x12345678u)));
400 #endif
401
402 /* Give tmp the form below, where "." means untouched.
403 *
404 * w z y x w z y x
405 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
406 *
407 * That the upper word of each write-channel be 0 is required for the
408 * following bit-shift and bit-or instructions to work. Note that this
409 * relies on the undocumented hardware behavior mentioned above.
410 */
411 tmp_dst.writemask = WRITEMASK_XY;
412 emit(F32TO16(tmp_dst, src0));
413
414 /* Give the write-channels of dst the form:
415 * 0xhhhh0000
416 */
417 tmp_src.swizzle = SWIZZLE_Y;
418 emit(SHL(dst, tmp_src, src_reg(16u)));
419
420 /* Finally, give the write-channels of dst the form of packHalf2x16's
421 * output:
422 * 0xhhhhllll
423 */
424 tmp_src.swizzle = SWIZZLE_X;
425 emit(OR(dst, src_reg(dst), tmp_src));
426 }
427
428 void
429 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
430 {
431 if (intel->gen < 7)
432 assert(!"ir_unop_unpack_half_2x16 should be lowered");
433
434 assert(dst.type == BRW_REGISTER_TYPE_F);
435 assert(src0.type == BRW_REGISTER_TYPE_UD);
436
437 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
438 *
439 * Because this instruction does not have a 16-bit floating-point type,
440 * the source data type must be Word (W). The destination type must be
441 * F (Float).
442 *
443 * To use W as the source data type, we must adjust horizontal strides,
444 * which is only possible in align1 mode. All my [chadv] attempts at
445 * emitting align1 instructions for unpackHalf2x16 failed to pass the
446 * Piglit tests, so I gave up.
447 *
448 * I've verified that, on gen7 hardware and the simulator, it is safe to
449 * emit f16to32 in align16 mode with UD as source data type.
450 */
451
452 dst_reg tmp_dst(this, glsl_type::uvec2_type);
453 src_reg tmp_src(tmp_dst);
454
455 tmp_dst.writemask = WRITEMASK_X;
456 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
457
458 tmp_dst.writemask = WRITEMASK_Y;
459 emit(SHR(tmp_dst, src0, src_reg(16u)));
460
461 dst.writemask = WRITEMASK_XY;
462 emit(F16TO32(dst, tmp_src));
463 }
464
465 void
466 vec4_visitor::visit_instructions(const exec_list *list)
467 {
468 foreach_list(node, list) {
469 ir_instruction *ir = (ir_instruction *)node;
470
471 base_ir = ir;
472 ir->accept(this);
473 }
474 }
475
476
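/* Return how many vec4 slots a variable of the given type occupies in this
 * backend: one per matrix column, one per vector or scalar, the element size
 * times the length for arrays, and the sum of the members for structs.
 */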
477 static int
478 type_size(const struct glsl_type *type)
479 {
480 unsigned int i;
481 int size;
482
483 switch (type->base_type) {
484 case GLSL_TYPE_UINT:
485 case GLSL_TYPE_INT:
486 case GLSL_TYPE_FLOAT:
487 case GLSL_TYPE_BOOL:
488 if (type->is_matrix()) {
489 return type->matrix_columns;
490 } else {
491 /* Regardless of the size of the vector, it gets a vec4. This is bad
492 * packing for things like floats, but otherwise arrays become a
493 * mess. Hopefully a later pass over the code can pack scalars
494 * down if appropriate.
495 */
496 return 1;
497 }
498 case GLSL_TYPE_ARRAY:
499 assert(type->length > 0);
500 return type_size(type->fields.array) * type->length;
501 case GLSL_TYPE_STRUCT:
502 size = 0;
503 for (i = 0; i < type->length; i++) {
504 size += type_size(type->fields.structure[i].type);
505 }
506 return size;
507 case GLSL_TYPE_SAMPLER:
508 /* Samplers take up one slot in UNIFORMS[], but they're baked in
509 * at link time.
510 */
511 return 1;
512 case GLSL_TYPE_VOID:
513 case GLSL_TYPE_ERROR:
514 assert(0);
515 break;
516 }
517
518 return 0;
519 }
520
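/* Allocate a new virtual GRF of `size' vec4 slots, growing the bookkeeping
 * arrays by doubling when they fill up, and return the new register's index.
 */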
521 int
522 vec4_visitor::virtual_grf_alloc(int size)
523 {
524 if (virtual_grf_array_size <= virtual_grf_count) {
525 if (virtual_grf_array_size == 0)
526 virtual_grf_array_size = 16;
527 else
528 virtual_grf_array_size *= 2;
529 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
530 virtual_grf_array_size);
531 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
532 virtual_grf_array_size);
533 }
534 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
535 virtual_grf_reg_count += size;
536 virtual_grf_sizes[virtual_grf_count] = size;
537 return virtual_grf_count++;
538 }
539
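/* This constructor and the dst_reg one below allocate a fresh virtual GRF
 * sized for the given GLSL type and choose a default swizzle or writemask
 * matching the type's vector width (full XYZW for arrays and records).
 */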
540 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
541 {
542 init();
543
544 this->file = GRF;
545 this->reg = v->virtual_grf_alloc(type_size(type));
546
547 if (type->is_array() || type->is_record()) {
548 this->swizzle = BRW_SWIZZLE_NOOP;
549 } else {
550 this->swizzle = swizzle_for_size(type->vector_elements);
551 }
552
553 this->type = brw_type_for_base_type(type);
554 }
555
556 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
557 {
558 init();
559
560 this->file = GRF;
561 this->reg = v->virtual_grf_alloc(type_size(type));
562
563 if (type->is_array() || type->is_record()) {
564 this->writemask = WRITEMASK_XYZW;
565 } else {
566 this->writemask = (1 << type->vector_elements) - 1;
567 }
568
569 this->type = brw_type_for_base_type(type);
570 }
571
572 /* Our support for uniforms is piggy-backed on the struct
573 * gl_fragment_program, because that's where the values actually
574 * get stored, rather than in some global gl_shader_program uniform
575 * store.
576 */
577 void
578 vec4_visitor::setup_uniform_values(ir_variable *ir)
579 {
580 int namelen = strlen(ir->name);
581
582 /* The data for our (non-builtin) uniforms is stored in a series of
583 * gl_uniform_driver_storage structs for each subcomponent that
584 * glGetUniformLocation() could name. We know it's been set up in the same
585 * order we'd walk the type, so walk the list of storage and find anything
586 * with our name, or the prefix of a component that starts with our name.
587 */
588 for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
589 struct gl_uniform_storage *storage = &prog->UniformStorage[u];
590
591 if (strncmp(ir->name, storage->name, namelen) != 0 ||
592 (storage->name[namelen] != 0 &&
593 storage->name[namelen] != '.' &&
594 storage->name[namelen] != '[')) {
595 continue;
596 }
597
598 gl_constant_value *components = storage->storage;
599 unsigned vector_count = (MAX2(storage->array_elements, 1) *
600 storage->type->matrix_columns);
601
602 for (unsigned s = 0; s < vector_count; s++) {
603 uniform_vector_size[uniforms] = storage->type->vector_elements;
604
605 int i;
606 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
607 c->prog_data.param[uniforms * 4 + i] = &components->f;
608 components++;
609 }
610 for (; i < 4; i++) {
611 static float zero = 0;
612 c->prog_data.param[uniforms * 4 + i] = &zero;
613 }
614
615 uniforms++;
616 }
617 }
618 }
619
620 void
621 vec4_visitor::setup_uniform_clipplane_values()
622 {
623 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
624
625 if (intel->gen < 6) {
626 /* Pre-Gen6, we compact clip planes. For example, if the user
627 * enables just clip planes 0, 1, and 3, we will enable clip planes
628 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
629 * plane 2. This simplifies the implementation of the Gen6 clip
630 * thread.
631 */
632 int compacted_clipplane_index = 0;
633 for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
634 if (!(c->key.userclip_planes_enabled_gen_4_5 & (1 << i)))
635 continue;
636
637 this->uniform_vector_size[this->uniforms] = 4;
638 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
639 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
640 for (int j = 0; j < 4; ++j) {
641 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
642 }
643 ++compacted_clipplane_index;
644 ++this->uniforms;
645 }
646 } else {
647 /* In Gen6 and later, we don't compact clip planes, because this
648 * simplifies the implementation of gl_ClipDistance.
649 */
650 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
651 this->uniform_vector_size[this->uniforms] = 4;
652 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
653 this->userplane[i].type = BRW_REGISTER_TYPE_F;
654 for (int j = 0; j < 4; ++j) {
655 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
656 }
657 ++this->uniforms;
658 }
659 }
660 }
661
662 /* Our support for builtin uniforms is even scarier than non-builtin.
663 * It sits on top of the PROG_STATE_VAR parameters that are
664 * automatically updated from GL context state.
665 */
666 void
667 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
668 {
669 const ir_state_slot *const slots = ir->state_slots;
670 assert(ir->state_slots != NULL);
671
672 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
673 /* This state reference has already been setup by ir_to_mesa,
674 * but we'll get the same index back here. We can reference
675 * ParameterValues directly, since unlike brw_fs.cpp, we never
676 * add new state references during compile.
677 */
678 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
679 (gl_state_index *)slots[i].tokens);
680 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
681
682 this->uniform_vector_size[this->uniforms] = 0;
683 /* Add each of the unique swizzled channels of the element.
684 * This will end up matching the size of the glsl_type of this field.
685 */
686 int last_swiz = -1;
687 for (unsigned int j = 0; j < 4; j++) {
688 int swiz = GET_SWZ(slots[i].swizzle, j);
689 last_swiz = swiz;
690
691 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
692 if (swiz <= last_swiz)
693 this->uniform_vector_size[this->uniforms]++;
694 }
695 this->uniforms++;
696 }
697 }
698
699 dst_reg *
700 vec4_visitor::variable_storage(ir_variable *var)
701 {
702 return (dst_reg *)hash_table_find(this->variable_ht, var);
703 }
704
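/* Evaluate a boolean rvalue and emit whatever comparison sets the flag
 * register for it, so that a following predicated instruction executes only
 * where the condition holds.  *predicate receives the predication mode the
 * caller should use (normal, or ALL4H/ANY4H for vector comparisons).
 */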
705 void
706 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
707 {
708 ir_expression *expr = ir->as_expression();
709
710 *predicate = BRW_PREDICATE_NORMAL;
711
712 if (expr) {
713 src_reg op[2];
714 vec4_instruction *inst;
715
716 assert(expr->get_num_operands() <= 2);
717 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
718 expr->operands[i]->accept(this);
719 op[i] = this->result;
720
721 resolve_ud_negate(&op[i]);
722 }
723
724 switch (expr->operation) {
725 case ir_unop_logic_not:
726 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
727 inst->conditional_mod = BRW_CONDITIONAL_Z;
728 break;
729
730 case ir_binop_logic_xor:
731 inst = emit(XOR(dst_null_d(), op[0], op[1]));
732 inst->conditional_mod = BRW_CONDITIONAL_NZ;
733 break;
734
735 case ir_binop_logic_or:
736 inst = emit(OR(dst_null_d(), op[0], op[1]));
737 inst->conditional_mod = BRW_CONDITIONAL_NZ;
738 break;
739
740 case ir_binop_logic_and:
741 inst = emit(AND(dst_null_d(), op[0], op[1]));
742 inst->conditional_mod = BRW_CONDITIONAL_NZ;
743 break;
744
745 case ir_unop_f2b:
746 if (intel->gen >= 6) {
747 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
748 } else {
749 inst = emit(MOV(dst_null_f(), op[0]));
750 inst->conditional_mod = BRW_CONDITIONAL_NZ;
751 }
752 break;
753
754 case ir_unop_i2b:
755 if (intel->gen >= 6) {
756 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
757 } else {
758 inst = emit(MOV(dst_null_d(), op[0]));
759 inst->conditional_mod = BRW_CONDITIONAL_NZ;
760 }
761 break;
762
763 case ir_binop_all_equal:
764 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
765 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
766 break;
767
768 case ir_binop_any_nequal:
769 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
770 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
771 break;
772
773 case ir_unop_any:
774 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
775 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
776 break;
777
778 case ir_binop_greater:
779 case ir_binop_gequal:
780 case ir_binop_less:
781 case ir_binop_lequal:
782 case ir_binop_equal:
783 case ir_binop_nequal:
784 emit(CMP(dst_null_d(), op[0], op[1],
785 brw_conditional_for_comparison(expr->operation)));
786 break;
787
788 default:
789 assert(!"not reached");
790 break;
791 }
792 return;
793 }
794
795 ir->accept(this);
796
797 resolve_ud_negate(&this->result);
798
799 if (intel->gen >= 6) {
800 vec4_instruction *inst = emit(AND(dst_null_d(),
801 this->result, src_reg(1)));
802 inst->conditional_mod = BRW_CONDITIONAL_NZ;
803 } else {
804 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
805 inst->conditional_mod = BRW_CONDITIONAL_NZ;
806 }
807 }
808
809 /**
810 * Emit a gen6 IF statement with the comparison folded into the IF
811 * instruction.
812 */
813 void
814 vec4_visitor::emit_if_gen6(ir_if *ir)
815 {
816 ir_expression *expr = ir->condition->as_expression();
817
818 if (expr) {
819 src_reg op[2];
820 dst_reg temp;
821
822 assert(expr->get_num_operands() <= 2);
823 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
824 expr->operands[i]->accept(this);
825 op[i] = this->result;
826 }
827
828 switch (expr->operation) {
829 case ir_unop_logic_not:
830 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
831 return;
832
833 case ir_binop_logic_xor:
834 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
835 return;
836
837 case ir_binop_logic_or:
838 temp = dst_reg(this, glsl_type::bool_type);
839 emit(OR(temp, op[0], op[1]));
840 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
841 return;
842
843 case ir_binop_logic_and:
844 temp = dst_reg(this, glsl_type::bool_type);
845 emit(AND(temp, op[0], op[1]));
846 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
847 return;
848
849 case ir_unop_f2b:
850 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
851 return;
852
853 case ir_unop_i2b:
854 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
855 return;
856
857 case ir_binop_greater:
858 case ir_binop_gequal:
859 case ir_binop_less:
860 case ir_binop_lequal:
861 case ir_binop_equal:
862 case ir_binop_nequal:
863 emit(IF(op[0], op[1],
864 brw_conditional_for_comparison(expr->operation)));
865 return;
866
867 case ir_binop_all_equal:
868 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
869 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
870 return;
871
872 case ir_binop_any_nequal:
873 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
874 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
875 return;
876
877 case ir_unop_any:
878 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
879 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
880 return;
881
882 default:
883 assert(!"not reached");
884 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
885 return;
886 }
887 return;
888 }
889
890 ir->condition->accept(this);
891
892 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
893 }
894
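/* Return a copy of register r with its writemask replaced by mask. */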
895 static dst_reg
896 with_writemask(dst_reg const & r, int mask)
897 {
898 dst_reg result = r;
899 result.writemask = mask;
900 return result;
901 }
902
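/* Emit code that undoes vertex-fetch limitations recorded in the per-attribute
 * workaround flags: GL_FIXED rescaling, sign recovery and normalization for
 * packed 2/10/10/10 formats, and BGRA channel swizzling.
 */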
903 void
904 vec4_visitor::emit_attribute_fixups()
905 {
906 dst_reg sign_recovery_shift;
907 dst_reg normalize_factor;
908 dst_reg es3_normalize_factor;
909
910 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
911 if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
912 uint8_t wa_flags = c->key.gl_attrib_wa_flags[i];
913 dst_reg reg(ATTR, i);
914 dst_reg reg_d = reg;
915 reg_d.type = BRW_REGISTER_TYPE_D;
916 dst_reg reg_ud = reg;
917 reg_ud.type = BRW_REGISTER_TYPE_UD;
918
919 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
920 * come in as floating point conversions of the integer values.
921 */
922 if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
923 dst_reg dst = reg;
924 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
925 dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
926 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
927 }
928
929 /* Do sign recovery for 2101010 formats if required. */
930 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
931 if (sign_recovery_shift.file == BAD_FILE) {
932 /* shift constant: <22,22,22,30> */
933 sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
934 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
935 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
936 }
937
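            /* Shift each packed component up so its sign bit lands in bit 31
             * (by 22 for the 10-bit XYZ fields, 30 for the 2-bit W field),
             * then arithmetic-shift it back down to sign-extend it to a full
             * 32-bit integer.  For example, a 10-bit 0x3ff becomes -1.
             */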
938 emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
939 emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
940 }
941
942 /* Apply BGRA swizzle if required. */
943 if (wa_flags & BRW_ATTRIB_WA_BGRA) {
944 src_reg temp = src_reg(reg);
945 temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
946 emit(MOV(reg, temp));
947 }
948
949 if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
950 /* ES 3.0 has different rules for converting signed normalized
951 * fixed-point numbers than desktop GL.
952 */
953 if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
954 /* According to equation 2.2 of the ES 3.0 specification,
955 * signed normalization conversion is done by:
956 *
957 * f = c / (2^(b-1)-1)
958 */
959 if (es3_normalize_factor.file == BAD_FILE) {
960 /* mul constant: 1 / (2^(b-1) - 1) */
961 es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
962 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
963 src_reg(1.0f / ((1<<9) - 1))));
964 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
965 src_reg(1.0f / ((1<<1) - 1))));
966 }
967
968 dst_reg dst = reg;
969 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
970 emit(MOV(dst, src_reg(reg_d)));
971 emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
972 emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
973 } else {
974 /* The following equations are from the OpenGL 3.2 specification:
975 *
976 * 2.1 unsigned normalization
977 * f = c/(2^n-1)
978 *
979 * 2.2 signed normalization
980 * f = (2c+1)/(2^n-1)
981 *
982 * Both of these share a common divisor, which is represented by
983 * "normalize_factor" in the code below.
984 */
985 if (normalize_factor.file == BAD_FILE) {
986 /* 1 / (2^b - 1) for b=<10,10,10,2> */
987 normalize_factor = dst_reg(this, glsl_type::vec4_type);
988 emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
989 src_reg(1.0f / ((1<<10) - 1))));
990 emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
991 src_reg(1.0f / ((1<<2) - 1))));
992 }
993
994 dst_reg dst = reg;
995 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
996 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
997
998 /* For signed normalization, we want the numerator to be 2c+1. */
999 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
1000 emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
1001 emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
1002 }
1003
1004 emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
1005 }
1006 }
1007
1008 if (wa_flags & BRW_ATTRIB_WA_SCALE) {
1009 dst_reg dst = reg;
1010 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1011 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1012 }
1013 }
1014 }
1015 }
1016
1017 void
1018 vec4_visitor::visit(ir_variable *ir)
1019 {
1020 dst_reg *reg = NULL;
1021
1022 if (variable_storage(ir))
1023 return;
1024
1025 switch (ir->mode) {
1026 case ir_var_shader_in:
1027 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
1028 break;
1029
1030 case ir_var_shader_out:
1031 reg = new(mem_ctx) dst_reg(this, ir->type);
1032
1033 for (int i = 0; i < type_size(ir->type); i++) {
1034 output_reg[ir->location + i] = *reg;
1035 output_reg[ir->location + i].reg_offset = i;
1036 output_reg[ir->location + i].type =
1037 brw_type_for_base_type(ir->type->get_scalar_type());
1038 output_reg_annotation[ir->location + i] = ir->name;
1039 }
1040 break;
1041
1042 case ir_var_auto:
1043 case ir_var_temporary:
1044 reg = new(mem_ctx) dst_reg(this, ir->type);
1045 break;
1046
1047 case ir_var_uniform:
1048 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1049
1050 /* Thanks to the lower_ubo_reference pass, we will see only
1051 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1052 * variables, so no need for them to be in variable_ht.
1053 */
1054 if (ir->uniform_block != -1)
1055 return;
1056
1057 /* Track how big the whole uniform variable is, in case we need to put a
1058 * copy of its data into pull constants for array access.
1059 */
1060 this->uniform_size[this->uniforms] = type_size(ir->type);
1061
1062 if (!strncmp(ir->name, "gl_", 3)) {
1063 setup_builtin_uniform_values(ir);
1064 } else {
1065 setup_uniform_values(ir);
1066 }
1067 break;
1068
1069 case ir_var_system_value:
1070 /* VertexID is stored by the VF as the last vertex element, but
1071 * we don't represent it with a flag in inputs_read, so we call
1072 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
1073 */
1074 reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
1075 prog_data->uses_vertexid = true;
1076
1077 switch (ir->location) {
1078 case SYSTEM_VALUE_VERTEX_ID:
1079 reg->writemask = WRITEMASK_X;
1080 break;
1081 case SYSTEM_VALUE_INSTANCE_ID:
1082 reg->writemask = WRITEMASK_Y;
1083 break;
1084 default:
1085 assert(!"not reached");
1086 break;
1087 }
1088 break;
1089
1090 default:
1091 assert(!"not reached");
1092 }
1093
1094 reg->type = brw_type_for_base_type(ir->type);
1095 hash_table_insert(this->variable_ht, reg, ir);
1096 }
1097
1098 void
1099 vec4_visitor::visit(ir_loop *ir)
1100 {
1101 dst_reg counter;
1102
1103 /* We don't want debugging output to print the whole body of the
1104 * loop as the annotation.
1105 */
1106 this->base_ir = NULL;
1107
1108 if (ir->counter != NULL) {
1109 this->base_ir = ir->counter;
1110 ir->counter->accept(this);
1111 counter = *(variable_storage(ir->counter));
1112
1113 if (ir->from != NULL) {
1114 this->base_ir = ir->from;
1115 ir->from->accept(this);
1116
1117 emit(MOV(counter, this->result));
1118 }
1119 }
1120
1121 emit(BRW_OPCODE_DO);
1122
1123 if (ir->to) {
1124 this->base_ir = ir->to;
1125 ir->to->accept(this);
1126
1127 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1128 brw_conditional_for_comparison(ir->cmp)));
1129
1130 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1131 inst->predicate = BRW_PREDICATE_NORMAL;
1132 }
1133
1134 visit_instructions(&ir->body_instructions);
1135
1136
1137 if (ir->increment) {
1138 this->base_ir = ir->increment;
1139 ir->increment->accept(this);
1140 emit(ADD(counter, src_reg(counter), this->result));
1141 }
1142
1143 emit(BRW_OPCODE_WHILE);
1144 }
1145
1146 void
1147 vec4_visitor::visit(ir_loop_jump *ir)
1148 {
1149 switch (ir->mode) {
1150 case ir_loop_jump::jump_break:
1151 emit(BRW_OPCODE_BREAK);
1152 break;
1153 case ir_loop_jump::jump_continue:
1154 emit(BRW_OPCODE_CONTINUE);
1155 break;
1156 }
1157 }
1158
1159
1160 void
1161 vec4_visitor::visit(ir_function_signature *ir)
1162 {
1163 assert(0);
1164 (void)ir;
1165 }
1166
1167 void
1168 vec4_visitor::visit(ir_function *ir)
1169 {
1170 /* Ignore function bodies other than main() -- we shouldn't see calls to
1171 * them since they should all be inlined.
1172 */
1173 if (strcmp(ir->name, "main") == 0) {
1174 const ir_function_signature *sig;
1175 exec_list empty;
1176
1177 sig = ir->matching_signature(&empty);
1178
1179 assert(sig);
1180
1181 visit_instructions(&sig->body);
1182 }
1183 }
1184
1185 bool
1186 vec4_visitor::try_emit_sat(ir_expression *ir)
1187 {
1188 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1189 if (!sat_src)
1190 return false;
1191
1192 sat_src->accept(this);
1193 src_reg src = this->result;
1194
1195 this->result = src_reg(this, ir->type);
1196 vec4_instruction *inst;
1197 inst = emit(MOV(dst_reg(this->result), src));
1198 inst->saturate = true;
1199
1200 return true;
1201 }
1202
1203 void
1204 vec4_visitor::emit_bool_comparison(unsigned int op,
1205 dst_reg dst, src_reg src0, src_reg src1)
1206 {
1207 /* original gen4 does destination conversion before comparison. */
1208 if (intel->gen < 5)
1209 dst.type = src0.type;
1210
1211 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1212
1213 dst.type = BRW_REGISTER_TYPE_D;
1214 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1215 }
1216
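/* Emit a MIN/MAX as a SEL.  On gen6+ SEL can evaluate the comparison itself
 * through its conditional mod; gen4/5 need a separate CMP to set the flag
 * register, followed by a predicated SEL.
 */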
1217 void
1218 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1219 src_reg src0, src_reg src1)
1220 {
1221 vec4_instruction *inst;
1222
1223 if (intel->gen >= 6) {
1224 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1225 inst->conditional_mod = conditionalmod;
1226 } else {
1227 emit(CMP(dst, src0, src1, conditionalmod));
1228
1229 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1230 inst->predicate = BRW_PREDICATE_NORMAL;
1231 }
1232 }
1233
1234 void
1235 vec4_visitor::visit(ir_expression *ir)
1236 {
1237 unsigned int operand;
1238 src_reg op[Elements(ir->operands)];
1239 src_reg result_src;
1240 dst_reg result_dst;
1241 vec4_instruction *inst;
1242
1243 if (try_emit_sat(ir))
1244 return;
1245
1246 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1247 this->result.file = BAD_FILE;
1248 ir->operands[operand]->accept(this);
1249 if (this->result.file == BAD_FILE) {
1250 printf("Failed to get tree for expression operand:\n");
1251 ir->operands[operand]->print();
1252 exit(1);
1253 }
1254 op[operand] = this->result;
1255
1256 /* Matrix expression operands should have been broken down to vector
1257 * operations already.
1258 */
1259 assert(!ir->operands[operand]->type->is_matrix());
1260 }
1261
1262 int vector_elements = ir->operands[0]->type->vector_elements;
1263 if (ir->operands[1]) {
1264 vector_elements = MAX2(vector_elements,
1265 ir->operands[1]->type->vector_elements);
1266 }
1267
1268 this->result.file = BAD_FILE;
1269
1270 /* Storage for our result. Ideally for an assignment we'd be using
1271 * the actual storage for the result here, instead.
1272 */
1273 result_src = src_reg(this, ir->type);
1274 /* convenience for the emit functions below. */
1275 result_dst = dst_reg(result_src);
1276 /* If nothing special happens, this is the result. */
1277 this->result = result_src;
1278 /* Limit writes to the channels that will be used by result_src later.
1279 * This does limit this temp's use as a temporary for multi-instruction
1280 * sequences.
1281 */
1282 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1283
1284 switch (ir->operation) {
1285 case ir_unop_logic_not:
1286 /* Note that BRW_OPCODE_NOT is not appropriate here, since it takes the
1287 * ones' complement of the whole register, not just bit 0.
1288 */
1289 emit(XOR(result_dst, op[0], src_reg(1)));
1290 break;
1291 case ir_unop_neg:
1292 op[0].negate = !op[0].negate;
1293 this->result = op[0];
1294 break;
1295 case ir_unop_abs:
1296 op[0].abs = true;
1297 op[0].negate = false;
1298 this->result = op[0];
1299 break;
1300
1301 case ir_unop_sign:
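      /* sign(x): start with 0.0, then use predicated MOVs to overwrite the
       * channels that compare greater than zero with 1.0 and the channels
       * that compare less than zero with -1.0.
       */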
1302 emit(MOV(result_dst, src_reg(0.0f)));
1303
1304 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1305 inst = emit(MOV(result_dst, src_reg(1.0f)));
1306 inst->predicate = BRW_PREDICATE_NORMAL;
1307
1308 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1309 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1310 inst->predicate = BRW_PREDICATE_NORMAL;
1311
1312 break;
1313
1314 case ir_unop_rcp:
1315 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1316 break;
1317
1318 case ir_unop_exp2:
1319 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1320 break;
1321 case ir_unop_log2:
1322 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1323 break;
1324 case ir_unop_exp:
1325 case ir_unop_log:
1326 assert(!"not reached: should be handled by ir_explog_to_explog2");
1327 break;
1328 case ir_unop_sin:
1329 case ir_unop_sin_reduced:
1330 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1331 break;
1332 case ir_unop_cos:
1333 case ir_unop_cos_reduced:
1334 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1335 break;
1336
1337 case ir_unop_dFdx:
1338 case ir_unop_dFdy:
1339 assert(!"derivatives not valid in vertex shader");
1340 break;
1341
1342 case ir_unop_noise:
1343 assert(!"not reached: should be handled by lower_noise");
1344 break;
1345
1346 case ir_binop_add:
1347 emit(ADD(result_dst, op[0], op[1]));
1348 break;
1349 case ir_binop_sub:
1350 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1351 break;
1352
1353 case ir_binop_mul:
1354 if (ir->type->is_integer()) {
1355 /* For integer multiplication, the MUL uses the low 16 bits
1356 * of one of the operands (src0 on gen6, src1 on gen7). The
1357 * MACH accumulates in the contribution of the upper 16 bits
1358 * of that operand.
1359 *
1360 * FINISHME: Emit just the MUL if we know an operand is small
1361 * enough.
1362 */
1363 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1364
1365 emit(MUL(acc, op[0], op[1]));
1366 emit(MACH(dst_null_d(), op[0], op[1]));
1367 emit(MOV(result_dst, src_reg(acc)));
1368 } else {
1369 emit(MUL(result_dst, op[0], op[1]));
1370 }
1371 break;
1372 case ir_binop_div:
1373 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1374 assert(ir->type->is_integer());
1375 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1376 break;
1377 case ir_binop_mod:
1378 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1379 assert(ir->type->is_integer());
1380 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1381 break;
1382
1383 case ir_binop_less:
1384 case ir_binop_greater:
1385 case ir_binop_lequal:
1386 case ir_binop_gequal:
1387 case ir_binop_equal:
1388 case ir_binop_nequal: {
1389 emit(CMP(result_dst, op[0], op[1],
1390 brw_conditional_for_comparison(ir->operation)));
1391 emit(AND(result_dst, result_src, src_reg(0x1)));
1392 break;
1393 }
1394
1395 case ir_binop_all_equal:
1396 /* "==" operator producing a scalar boolean. */
1397 if (ir->operands[0]->type->is_vector() ||
1398 ir->operands[1]->type->is_vector()) {
1399 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1400 emit(MOV(result_dst, src_reg(0)));
1401 inst = emit(MOV(result_dst, src_reg(1)));
1402 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1403 } else {
1404 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1405 emit(AND(result_dst, result_src, src_reg(0x1)));
1406 }
1407 break;
1408 case ir_binop_any_nequal:
1409 /* "!=" operator producing a scalar boolean. */
1410 if (ir->operands[0]->type->is_vector() ||
1411 ir->operands[1]->type->is_vector()) {
1412 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1413
1414 emit(MOV(result_dst, src_reg(0)));
1415 inst = emit(MOV(result_dst, src_reg(1)));
1416 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1417 } else {
1418 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1419 emit(AND(result_dst, result_src, src_reg(0x1)));
1420 }
1421 break;
1422
1423 case ir_unop_any:
1424 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1425 emit(MOV(result_dst, src_reg(0)));
1426
1427 inst = emit(MOV(result_dst, src_reg(1)));
1428 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1429 break;
1430
1431 case ir_binop_logic_xor:
1432 emit(XOR(result_dst, op[0], op[1]));
1433 break;
1434
1435 case ir_binop_logic_or:
1436 emit(OR(result_dst, op[0], op[1]));
1437 break;
1438
1439 case ir_binop_logic_and:
1440 emit(AND(result_dst, op[0], op[1]));
1441 break;
1442
1443 case ir_binop_dot:
1444 assert(ir->operands[0]->type->is_vector());
1445 assert(ir->operands[0]->type == ir->operands[1]->type);
1446 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1447 break;
1448
1449 case ir_unop_sqrt:
1450 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1451 break;
1452 case ir_unop_rsq:
1453 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1454 break;
1455
1456 case ir_unop_bitcast_i2f:
1457 case ir_unop_bitcast_u2f:
1458 this->result = op[0];
1459 this->result.type = BRW_REGISTER_TYPE_F;
1460 break;
1461
1462 case ir_unop_bitcast_f2i:
1463 this->result = op[0];
1464 this->result.type = BRW_REGISTER_TYPE_D;
1465 break;
1466
1467 case ir_unop_bitcast_f2u:
1468 this->result = op[0];
1469 this->result.type = BRW_REGISTER_TYPE_UD;
1470 break;
1471
1472 case ir_unop_i2f:
1473 case ir_unop_i2u:
1474 case ir_unop_u2i:
1475 case ir_unop_u2f:
1476 case ir_unop_b2f:
1477 case ir_unop_b2i:
1478 case ir_unop_f2i:
1479 case ir_unop_f2u:
1480 emit(MOV(result_dst, op[0]));
1481 break;
1482 case ir_unop_f2b:
1483 case ir_unop_i2b: {
1484 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1485 emit(AND(result_dst, result_src, src_reg(1)));
1486 break;
1487 }
1488
1489 case ir_unop_trunc:
1490 emit(RNDZ(result_dst, op[0]));
1491 break;
1492 case ir_unop_ceil:
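      /* ceil(x) == -floor(-x): negate the operand, round toward -infinity
       * with RNDD, and mark the result negated.  For example, ceil(1.25)
       * = -floor(-1.25) = -(-2) = 2.
       */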
1493 op[0].negate = !op[0].negate;
1494 inst = emit(RNDD(result_dst, op[0]));
1495 this->result.negate = true;
1496 break;
1497 case ir_unop_floor:
1498 inst = emit(RNDD(result_dst, op[0]));
1499 break;
1500 case ir_unop_fract:
1501 inst = emit(FRC(result_dst, op[0]));
1502 break;
1503 case ir_unop_round_even:
1504 emit(RNDE(result_dst, op[0]));
1505 break;
1506
1507 case ir_binop_min:
1508 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1509 break;
1510 case ir_binop_max:
1511 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1512 break;
1513
1514 case ir_binop_pow:
1515 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1516 break;
1517
1518 case ir_unop_bit_not:
1519 inst = emit(NOT(result_dst, op[0]));
1520 break;
1521 case ir_binop_bit_and:
1522 inst = emit(AND(result_dst, op[0], op[1]));
1523 break;
1524 case ir_binop_bit_xor:
1525 inst = emit(XOR(result_dst, op[0], op[1]));
1526 break;
1527 case ir_binop_bit_or:
1528 inst = emit(OR(result_dst, op[0], op[1]));
1529 break;
1530
1531 case ir_binop_lshift:
1532 inst = emit(SHL(result_dst, op[0], op[1]));
1533 break;
1534
1535 case ir_binop_rshift:
1536 if (ir->type->base_type == GLSL_TYPE_INT)
1537 inst = emit(ASR(result_dst, op[0], op[1]));
1538 else
1539 inst = emit(SHR(result_dst, op[0], op[1]));
1540 break;
1541
1542 case ir_binop_ubo_load: {
1543 ir_constant *uniform_block = ir->operands[0]->as_constant();
1544 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1545 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1546 src_reg offset = op[1];
1547
1548 /* Now, load the vector from that offset. */
1549 assert(ir->type->is_vector() || ir->type->is_scalar());
1550
1551 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1552 packed_consts.type = result.type;
1553 src_reg surf_index =
1554 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1555 if (const_offset_ir) {
1556 offset = src_reg(const_offset / 16);
1557 } else {
1558 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1559 }
1560
1561 vec4_instruction *pull =
1562 emit(new(mem_ctx) vec4_instruction(this,
1563 VS_OPCODE_PULL_CONSTANT_LOAD,
1564 dst_reg(packed_consts),
1565 surf_index,
1566 offset));
1567 pull->base_mrf = 14;
1568 pull->mlen = 1;
1569
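      /* The pull constant load fetches a whole aligned vec4, so offset each
       * swizzle channel by the dword position of const_offset within that
       * vec4 (zero when the offset isn't known at compile time) to read the
       * requested components.
       */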
1570 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1571 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1572 const_offset % 16 / 4,
1573 const_offset % 16 / 4,
1574 const_offset % 16 / 4);
1575
1576 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1577 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1578 emit(CMP(result_dst, packed_consts, src_reg(0u),
1579 BRW_CONDITIONAL_NZ));
1580 emit(AND(result_dst, result, src_reg(0x1)));
1581 } else {
1582 emit(MOV(result_dst, packed_consts));
1583 }
1584 break;
1585 }
1586
1587 case ir_quadop_vector:
1588 assert(!"not reached: should be handled by lower_quadop_vector");
1589 break;
1590
1591 case ir_unop_pack_half_2x16:
1592 emit_pack_half_2x16(result_dst, op[0]);
1593 break;
1594 case ir_unop_unpack_half_2x16:
1595 emit_unpack_half_2x16(result_dst, op[0]);
1596 break;
1597 case ir_unop_pack_snorm_2x16:
1598 case ir_unop_pack_unorm_2x16:
1599 case ir_unop_unpack_snorm_2x16:
1600 case ir_unop_unpack_unorm_2x16:
1601 assert(!"not reached: should be handled by lower_packing_builtins");
1602 break;
1603 case ir_unop_unpack_half_2x16_split_x:
1604 case ir_unop_unpack_half_2x16_split_y:
1605 case ir_binop_pack_half_2x16_split:
1606 assert(!"not reached: should not occur in vertex shader");
1607 break;
1608 }
1609 }
1610
1611
1612 void
1613 vec4_visitor::visit(ir_swizzle *ir)
1614 {
1615 src_reg src;
1616 int i = 0;
1617 int swizzle[4];
1618
1619 /* Note that this is only swizzles in expressions, not those on the left
1620 * hand side of an assignment, which do write masking. See ir_assignment
1621 * for that.
1622 */
1623
1624 ir->val->accept(this);
1625 src = this->result;
1626 assert(src.file != BAD_FILE);
1627
1628 for (i = 0; i < ir->type->vector_elements; i++) {
1629 switch (i) {
1630 case 0:
1631 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1632 break;
1633 case 1:
1634 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1635 break;
1636 case 2:
1637 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1638 break;
1639 case 3:
1640 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1641 break;
1642 }
1643 }
1644 for (; i < 4; i++) {
1645 /* Replicate the last channel out. */
1646 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1647 }
1648
1649 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1650
1651 this->result = src;
1652 }
1653
1654 void
1655 vec4_visitor::visit(ir_dereference_variable *ir)
1656 {
1657 const struct glsl_type *type = ir->type;
1658 dst_reg *reg = variable_storage(ir->var);
1659
1660 if (!reg) {
1661 fail("Failed to find variable storage for %s\n", ir->var->name);
1662 this->result = src_reg(brw_null_reg());
1663 return;
1664 }
1665
1666 this->result = src_reg(*reg);
1667
1668 /* System values get their swizzle from the dst_reg writemask */
1669 if (ir->var->mode == ir_var_system_value)
1670 return;
1671
1672 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1673 this->result.swizzle = swizzle_for_size(type->vector_elements);
1674 }
1675
1676 void
1677 vec4_visitor::visit(ir_dereference_array *ir)
1678 {
1679 ir_constant *constant_index;
1680 src_reg src;
1681 int element_size = type_size(ir->type);
1682
1683 constant_index = ir->array_index->constant_expression_value();
1684
1685 ir->array->accept(this);
1686 src = this->result;
1687
1688 if (constant_index) {
1689 src.reg_offset += constant_index->value.i[0] * element_size;
1690 } else {
1691 /* Variable index array dereference. It eats the "vec4" of the
1692 * base of the array and an index that offsets the Mesa register
1693 * index.
1694 */
1695 ir->array_index->accept(this);
1696
1697 src_reg index_reg;
1698
1699 if (element_size == 1) {
1700 index_reg = this->result;
1701 } else {
1702 index_reg = src_reg(this, glsl_type::int_type);
1703
1704 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1705 }
1706
1707 if (src.reladdr) {
1708 src_reg temp = src_reg(this, glsl_type::int_type);
1709
1710 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1711
1712 index_reg = temp;
1713 }
1714
1715 src.reladdr = ralloc(mem_ctx, src_reg);
1716 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1717 }
1718
1719 /* If the type is smaller than a vec4, replicate the last channel out. */
1720 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1721 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1722 else
1723 src.swizzle = BRW_SWIZZLE_NOOP;
1724 src.type = brw_type_for_base_type(ir->type);
1725
1726 this->result = src;
1727 }
1728
1729 void
1730 vec4_visitor::visit(ir_dereference_record *ir)
1731 {
1732 unsigned int i;
1733 const glsl_type *struct_type = ir->record->type;
1734 int offset = 0;
1735
1736 ir->record->accept(this);
1737
1738 for (i = 0; i < struct_type->length; i++) {
1739 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1740 break;
1741 offset += type_size(struct_type->fields.structure[i].type);
1742 }
1743
1744 /* If the type is smaller than a vec4, replicate the last channel out. */
1745 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1746 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1747 else
1748 this->result.swizzle = BRW_SWIZZLE_NOOP;
1749 this->result.type = brw_type_for_base_type(ir->type);
1750
1751 this->result.reg_offset += offset;
1752 }
1753
1754 /**
1755 * We want to be careful in assignment setup to hit the actual storage
1756 * instead of potentially using a temporary like we might with the
1757 * ir_dereference handler.
1758 */
1759 static dst_reg
1760 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1761 {
1762 /* The LHS must be a dereference. If the LHS is a variable indexed array
1763 * access of a vector, it must be separated into a series of conditional moves
1764 * before reaching this point (see ir_vec_index_to_cond_assign).
1765 */
1766 assert(ir->as_dereference());
1767 ir_dereference_array *deref_array = ir->as_dereference_array();
1768 if (deref_array) {
1769 assert(!deref_array->array->type->is_vector());
1770 }
1771
1772 /* Use the rvalue deref handler for the most part. We'll ignore
1773 * swizzles in it and write swizzles using writemask, though.
1774 */
1775 ir->accept(v);
1776 return dst_reg(v->result);
1777 }
1778
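/* Copy an aggregate (struct, array, or matrix) from *src to *dst one vec4 at
 * a time, recursing through the type and advancing reg_offset on both
 * registers as each element is moved.
 */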
1779 void
1780 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1781 const struct glsl_type *type, uint32_t predicate)
1782 {
1783 if (type->base_type == GLSL_TYPE_STRUCT) {
1784 for (unsigned int i = 0; i < type->length; i++) {
1785 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1786 }
1787 return;
1788 }
1789
1790 if (type->is_array()) {
1791 for (unsigned int i = 0; i < type->length; i++) {
1792 emit_block_move(dst, src, type->fields.array, predicate);
1793 }
1794 return;
1795 }
1796
1797 if (type->is_matrix()) {
1798 const struct glsl_type *vec_type;
1799
1800 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1801 type->vector_elements, 1);
1802
1803 for (int i = 0; i < type->matrix_columns; i++) {
1804 emit_block_move(dst, src, vec_type, predicate);
1805 }
1806 return;
1807 }
1808
1809 assert(type->is_scalar() || type->is_vector());
1810
1811 dst->type = brw_type_for_base_type(type);
1812 src->type = dst->type;
1813
1814 dst->writemask = (1 << type->vector_elements) - 1;
1815
1816 src->swizzle = swizzle_for_size(type->vector_elements);
1817
1818 vec4_instruction *inst = emit(MOV(*dst, *src));
1819 inst->predicate = predicate;
1820
1821 dst->reg_offset++;
1822 src->reg_offset++;
1823 }
1824
1825
1826 /* If the RHS processing resulted in an instruction generating a
1827 * temporary value, and it would be easy to rewrite the instruction to
1828 * generate its result right into the LHS instead, do so. This ends
1829 * up reliably removing instructions where it can be tricky to do so
1830 * later without real UD chain information.
1831 */
1832 bool
1833 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1834 dst_reg dst,
1835 src_reg src,
1836 vec4_instruction *pre_rhs_inst,
1837 vec4_instruction *last_rhs_inst)
1838 {
1839 /* This could be supported, but it would take more smarts. */
1840 if (ir->condition)
1841 return false;
1842
1843 if (pre_rhs_inst == last_rhs_inst)
1844 return false; /* No instructions generated to work with. */
1845
1846 /* Make sure the last instruction generated our source reg. */
1847 if (src.file != GRF ||
1848 src.file != last_rhs_inst->dst.file ||
1849 src.reg != last_rhs_inst->dst.reg ||
1850 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1851 src.reladdr ||
1852 src.abs ||
1853 src.negate ||
1854 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1855 return false;
1856
1857 /* Check that the last instruction fully initialized the channels
1858 * we want to use, in the order we want to use them. We could
1859 * potentially reswizzle the operands of many instructions so that
1860 * we could handle out of order channels, but don't yet.
1861 */
1862
1863 for (unsigned i = 0; i < 4; i++) {
1864 if (dst.writemask & (1 << i)) {
1865 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1866 return false;
1867
1868 if (BRW_GET_SWZ(src.swizzle, i) != i)
1869 return false;
1870 }
1871 }
1872
1873 /* Success! Rewrite the instruction. */
1874 last_rhs_inst->dst.file = dst.file;
1875 last_rhs_inst->dst.reg = dst.reg;
1876 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1877 last_rhs_inst->dst.reladdr = dst.reladdr;
1878 last_rhs_inst->dst.writemask &= dst.writemask;
1879
1880 return true;
1881 }
1882
1883 void
1884 vec4_visitor::visit(ir_assignment *ir)
1885 {
1886 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1887 uint32_t predicate = BRW_PREDICATE_NONE;
1888
1889 if (!ir->lhs->type->is_scalar() &&
1890 !ir->lhs->type->is_vector()) {
1891 ir->rhs->accept(this);
1892 src_reg src = this->result;
1893
1894 if (ir->condition) {
1895 emit_bool_to_cond_code(ir->condition, &predicate);
1896 }
1897
1898 /* emit_block_move doesn't account for swizzles in the source register.
1899 * This should be ok, since the source register is a structure or an
1900 * array, and those can't be swizzled. But double-check to be sure.
1901 */
1902 assert(src.swizzle ==
1903 (ir->rhs->type->is_matrix()
1904 ? swizzle_for_size(ir->rhs->type->vector_elements)
1905 : BRW_SWIZZLE_NOOP));
1906
1907 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1908 return;
1909 }
1910
1911 /* Now we're down to just a scalar/vector with writemasks. */
1912 int i;
1913
1914 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1915 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1916
1917 ir->rhs->accept(this);
1918
1919 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1920
1921 src_reg src = this->result;
1922
1923 int swizzles[4];
1924 int first_enabled_chan = 0;
1925 int src_chan = 0;
1926
1927 assert(ir->lhs->type->is_vector() ||
1928 ir->lhs->type->is_scalar());
1929 dst.writemask = ir->write_mask;
1930
1931 for (int i = 0; i < 4; i++) {
1932 if (dst.writemask & (1 << i)) {
1933 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1934 break;
1935 }
1936 }
1937
1938 /* Swizzle a small RHS vector into the channels being written.
1939 *
1940 * GLSL IR treats write_mask as dictating how many channels are
1941 * present on the RHS, while in our instructions we need to make
1942 * those channels appear in the slots of the vec4 they're written to.
1943 */
1944 for (int i = 0; i < 4; i++) {
1945 if (dst.writemask & (1 << i))
1946 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1947 else
1948 swizzles[i] = first_enabled_chan;
1949 }
1950 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1951 swizzles[2], swizzles[3]);
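/* For instance (hypothetical case), writing a two-channel RHS into the
 * .yz slots of the LHS produces swizzles[] = {Y, X, Y, Y}: the RHS's
 * first two channels are routed into the .y and .z slots being written,
 * and the unwritten slots just repeat an already-enabled channel.
 */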
1952
1953 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1954 return;
1955 }
1956
1957 if (ir->condition) {
1958 emit_bool_to_cond_code(ir->condition, &predicate);
1959 }
1960
1961 for (i = 0; i < type_size(ir->lhs->type); i++) {
1962 vec4_instruction *inst = emit(MOV(dst, src));
1963 inst->predicate = predicate;
1964
1965 dst.reg_offset++;
1966 src.reg_offset++;
1967 }
1968 }
1969
1970 void
1971 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1972 {
1973 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1974 foreach_list(node, &ir->components) {
1975 ir_constant *field_value = (ir_constant *)node;
1976
1977 emit_constant_values(dst, field_value);
1978 }
1979 return;
1980 }
1981
1982 if (ir->type->is_array()) {
1983 for (unsigned int i = 0; i < ir->type->length; i++) {
1984 emit_constant_values(dst, ir->array_elements[i]);
1985 }
1986 return;
1987 }
1988
1989 if (ir->type->is_matrix()) {
1990 for (int i = 0; i < ir->type->matrix_columns; i++) {
1991 float *vec = &ir->value.f[i * ir->type->vector_elements];
1992
1993 for (int j = 0; j < ir->type->vector_elements; j++) {
1994 dst->writemask = 1 << j;
1995 dst->type = BRW_REGISTER_TYPE_F;
1996
1997 emit(MOV(*dst, src_reg(vec[j])));
1998 }
1999 dst->reg_offset++;
2000 }
2001 return;
2002 }
2003
2004 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2005
2006 for (int i = 0; i < ir->type->vector_elements; i++) {
2007 if (!(remaining_writemask & (1 << i)))
2008 continue;
2009
2010 dst->writemask = 1 << i;
2011 dst->type = brw_type_for_base_type(ir->type);
2012
2013 /* Find other components that match the one we're about to
2014 * write. Emits fewer instructions for things like vec4(0.5,
2015 * 1.5, 1.5, 1.5).
2016 */
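/* For instance, vec4(0.5, 1.5, 1.5, 1.5) ends up as one MOV with
 * writemask .x carrying 0.5f and one MOV with writemask .yzw carrying
 * 1.5f, instead of four separate MOVs.
 */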
2017 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2018 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2019 if (ir->value.b[i] == ir->value.b[j])
2020 dst->writemask |= (1 << j);
2021 } else {
2022 /* u, i, and f storage all line up, so no need for a
2023 * switch case for comparing each type.
2024 */
2025 if (ir->value.u[i] == ir->value.u[j])
2026 dst->writemask |= (1 << j);
2027 }
2028 }
2029
2030 switch (ir->type->base_type) {
2031 case GLSL_TYPE_FLOAT:
2032 emit(MOV(*dst, src_reg(ir->value.f[i])));
2033 break;
2034 case GLSL_TYPE_INT:
2035 emit(MOV(*dst, src_reg(ir->value.i[i])));
2036 break;
2037 case GLSL_TYPE_UINT:
2038 emit(MOV(*dst, src_reg(ir->value.u[i])));
2039 break;
2040 case GLSL_TYPE_BOOL:
2041 emit(MOV(*dst, src_reg(ir->value.b[i])));
2042 break;
2043 default:
2044 assert(!"Non-float/uint/int/bool constant");
2045 break;
2046 }
2047
2048 remaining_writemask &= ~dst->writemask;
2049 }
2050 dst->reg_offset++;
2051 }
2052
2053 void
2054 vec4_visitor::visit(ir_constant *ir)
2055 {
2056 dst_reg dst = dst_reg(this, ir->type);
2057 this->result = src_reg(dst);
2058
2059 emit_constant_values(&dst, ir);
2060 }
2061
2062 void
2063 vec4_visitor::visit(ir_call *ir)
2064 {
2065 assert(!"not reached");
2066 }
2067
2068 void
2069 vec4_visitor::visit(ir_texture *ir)
2070 {
2071 int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
2072
2073 /* Should be lowered by do_lower_texture_projection */
2074 assert(!ir->projector);
2075
2076 /* Generate code to compute all the subexpression trees. This has to be
2077 * done before loading any values into MRFs for the sampler message since
2078 * generating these values may involve SEND messages that need the MRFs.
2079 */
2080 src_reg coordinate;
2081 if (ir->coordinate) {
2082 ir->coordinate->accept(this);
2083 coordinate = this->result;
2084 }
2085
2086 src_reg shadow_comparitor;
2087 if (ir->shadow_comparitor) {
2088 ir->shadow_comparitor->accept(this);
2089 shadow_comparitor = this->result;
2090 }
2091
2092 const glsl_type *lod_type;
2093 src_reg lod, dPdx, dPdy;
2094 switch (ir->op) {
2095 case ir_tex:
2096 lod = src_reg(0.0f);
2097 lod_type = glsl_type::float_type;
2098 break;
2099 case ir_txf:
2100 case ir_txl:
2101 case ir_txs:
2102 ir->lod_info.lod->accept(this);
2103 lod = this->result;
2104 lod_type = ir->lod_info.lod->type;
2105 break;
2106 case ir_txd:
2107 ir->lod_info.grad.dPdx->accept(this);
2108 dPdx = this->result;
2109
2110 ir->lod_info.grad.dPdy->accept(this);
2111 dPdy = this->result;
2112
2113 lod_type = ir->lod_info.grad.dPdx->type;
2114 break;
2115 case ir_txb:
2116 break;
2117 }
2118
2119 vec4_instruction *inst = NULL;
2120 switch (ir->op) {
2121 case ir_tex:
2122 case ir_txl:
2123 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2124 break;
2125 case ir_txd:
2126 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2127 break;
2128 case ir_txf:
2129 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2130 break;
2131 case ir_txs:
2132 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2133 break;
2134 case ir_txb:
2135 assert(!"TXB is not valid for vertex shaders.");
2136 }
2137
2138 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2139
2140 /* Texel offsets go in the message header; Gen4 also requires headers. */
2141 inst->header_present = use_texture_offset || intel->gen < 5;
2142 inst->base_mrf = 2;
2143 inst->mlen = inst->header_present + 1; /* always at least one */
2144 inst->sampler = sampler;
2145 inst->dst = dst_reg(this, ir->type);
2146 inst->dst.writemask = WRITEMASK_XYZW;
2147 inst->shadow_compare = ir->shadow_comparitor != NULL;
2148
2149 if (use_texture_offset)
2150 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2151
2152 /* MRF for the first parameter */
2153 int param_base = inst->base_mrf + inst->header_present;
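/* For example, with base_mrf == 2 and a header present, the first
 * parameter lands in MRF 3; with no header it would go in MRF 2.
 */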
2154
2155 if (ir->op == ir_txs) {
2156 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2157 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2158 } else {
2159 int i, coord_mask = 0, zero_mask = 0;
2160 /* Load the coordinate */
2161 /* FINISHME: gl_clamp_mask and saturate */
2162 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2163 coord_mask |= (1 << i);
2164 for (; i < 4; i++)
2165 zero_mask |= (1 << i);
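/* e.g. for a vec2 coordinate this gives coord_mask = .xy and
 * zero_mask = .zw, so the unused MRF channels are zeroed below.
 */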
2166
2167 if (ir->offset && ir->op == ir_txf) {
2168 /* It appears that the ld instruction used for txf does its
2169 * address bounds check before adding in the offset. To work
2170 * around this, just add the integer offset to the integer
2171 * texel coordinate, and don't put the offset in the header.
2172 */
2173 ir_constant *offset = ir->offset->as_constant();
2174 assert(offset);
2175
2176 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2177 src_reg src = coordinate;
2178 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2179 BRW_GET_SWZ(src.swizzle, j),
2180 BRW_GET_SWZ(src.swizzle, j),
2181 BRW_GET_SWZ(src.swizzle, j));
2182 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2183 src, offset->value.i[j]));
2184 }
2185 } else {
2186 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2187 coordinate));
2188 }
2189 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2190 src_reg(0)));
2191 /* Load the shadow comparitor */
2192 if (ir->shadow_comparitor) {
2193 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2194 WRITEMASK_X),
2195 shadow_comparitor));
2196 inst->mlen++;
2197 }
2198
2199 /* Load the LOD info */
2200 if (ir->op == ir_tex || ir->op == ir_txl) {
2201 int mrf, writemask;
2202 if (intel->gen >= 5) {
2203 mrf = param_base + 1;
2204 if (ir->shadow_comparitor) {
2205 writemask = WRITEMASK_Y;
2206 /* mlen already incremented */
2207 } else {
2208 writemask = WRITEMASK_X;
2209 inst->mlen++;
2210 }
2211 } else /* intel->gen == 4 */ {
2212 mrf = param_base;
2213 writemask = WRITEMASK_Z;
2214 }
2215 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2216 } else if (ir->op == ir_txf) {
2217 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W),
2218 lod));
2219 } else if (ir->op == ir_txd) {
2220 const glsl_type *type = lod_type;
2221
2222 if (intel->gen >= 5) {
2223 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2224 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2225 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2226 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2227 inst->mlen++;
2228
2229 if (ir->type->vector_elements == 3) {
2230 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2231 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2232 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2233 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2234 inst->mlen++;
2235 }
2236 } else /* intel->gen == 4 */ {
2237 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2238 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2239 inst->mlen += 2;
2240 }
2241 }
2242 }
2243
2244 emit(inst);
2245
2246 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2247 * spec requires layers.
2248 */
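/* For instance (hypothetical sizes), a cube map array allocated with 4
 * layers reports 24 in .z here; the integer divide by 6 below recovers
 * the 4 layers the spec asks for.
 */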
2249 if (ir->op == ir_txs) {
2250 glsl_type const *type = ir->sampler->type;
2251 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2252 type->sampler_array) {
2253 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2254 with_writemask(inst->dst, WRITEMASK_Z),
2255 src_reg(inst->dst), src_reg(6));
2256 }
2257 }
2258
2259 swizzle_result(ir, src_reg(inst->dst), sampler);
2260 }
2261
2262 void
2263 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2264 {
2265 int s = c->key.tex.swizzles[sampler];
2266
2267 this->result = src_reg(this, ir->type);
2268 dst_reg swizzled_result(this->result);
2269
2270 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2271 || s == SWIZZLE_NOOP) {
2272 emit(MOV(swizzled_result, orig_val));
2273 return;
2274 }
2275
2276 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2277 int swizzle[4];
2278
2279 for (int i = 0; i < 4; i++) {
2280 switch (GET_SWZ(s, i)) {
2281 case SWIZZLE_ZERO:
2282 zero_mask |= (1 << i);
2283 break;
2284 case SWIZZLE_ONE:
2285 one_mask |= (1 << i);
2286 break;
2287 default:
2288 copy_mask |= (1 << i);
2289 swizzle[i] = GET_SWZ(s, i);
2290 break;
2291 }
2292 }
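/* For example (hypothetical key), a swizzle of (ZERO, ZERO, ZERO, W)
 * yields zero_mask = .xyz and copy_mask = .w, so .xyz are filled with
 * 0.0f below and only .w is copied from the sampled value.
 */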
2293
2294 if (copy_mask) {
2295 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2296 swizzled_result.writemask = copy_mask;
2297 emit(MOV(swizzled_result, orig_val));
2298 }
2299
2300 if (zero_mask) {
2301 swizzled_result.writemask = zero_mask;
2302 emit(MOV(swizzled_result, src_reg(0.0f)));
2303 }
2304
2305 if (one_mask) {
2306 swizzled_result.writemask = one_mask;
2307 emit(MOV(swizzled_result, src_reg(1.0f)));
2308 }
2309 }
2310
2311 void
2312 vec4_visitor::visit(ir_return *ir)
2313 {
2314 assert(!"not reached");
2315 }
2316
2317 void
2318 vec4_visitor::visit(ir_discard *ir)
2319 {
2320 assert(!"not reached");
2321 }
2322
2323 void
2324 vec4_visitor::visit(ir_if *ir)
2325 {
2326 /* Don't point the annotation at the if statement, because then it plus
2327 * the then and else blocks get printed.
2328 */
2329 this->base_ir = ir->condition;
2330
2331 if (intel->gen == 6) {
2332 emit_if_gen6(ir);
2333 } else {
2334 uint32_t predicate;
2335 emit_bool_to_cond_code(ir->condition, &predicate);
2336 emit(IF(predicate));
2337 }
2338
2339 visit_instructions(&ir->then_instructions);
2340
2341 if (!ir->else_instructions.is_empty()) {
2342 this->base_ir = ir->condition;
2343 emit(BRW_OPCODE_ELSE);
2344
2345 visit_instructions(&ir->else_instructions);
2346 }
2347
2348 this->base_ir = ir->condition;
2349 emit(BRW_OPCODE_ENDIF);
2350 }
2351
2352 void
2353 vec4_visitor::emit_ndc_computation()
2354 {
2355 /* Get the position */
2356 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2357
2358 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2359 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2360 output_reg[BRW_VERT_RESULT_NDC] = ndc;
2361
2362 current_annotation = "NDC";
2363 dst_reg ndc_w = ndc;
2364 ndc_w.writemask = WRITEMASK_W;
2365 src_reg pos_w = pos;
2366 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2367 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2368
2369 dst_reg ndc_xyz = ndc;
2370 ndc_xyz.writemask = WRITEMASK_XYZ;
2371
2372 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2373 }
2374
2375 void
2376 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2377 {
2378 if (intel->gen < 6 &&
2379 ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2380 c->key.userclip_active || brw->has_negative_rhw_bug)) {
2381 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2382 dst_reg header1_w = header1;
2383 header1_w.writemask = WRITEMASK_W;
2384 GLuint i;
2385
2386 emit(MOV(header1, 0u));
2387
2388 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2389 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2390
2391 current_annotation = "Point size";
2392 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2393 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
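/* Illustrative arithmetic: a gl_PointSize of 4.0f scales to
 * 4.0 * 2048 = 0x2000, which lies within the 0x7ff << 8 mask.
 */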
2394 }
2395
2396 current_annotation = "Clipping flags";
2397 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2398 vec4_instruction *inst;
2399
2400 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2401 src_reg(this->userplane[i])));
2402 inst->conditional_mod = BRW_CONDITIONAL_L;
2403
2404 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2405 inst->predicate = BRW_PREDICATE_NORMAL;
2406 }
2407
2408 /* i965 clipping workaround:
2409 * 1) Test for -ve rhw
2410 * 2) If set,
2411 * set ndc = (0,0,0,0)
2412 * set ucp[6] = 1
2413 *
2414 * Later, clipping will detect ucp[6] and ensure the primitive is
2415 * clipped against all fixed planes.
2416 */
2417 if (brw->has_negative_rhw_bug) {
2418 #if 0
2419 /* FINISHME */
2420 brw_CMP(p,
2421 vec8(brw_null_reg()),
2422 BRW_CONDITIONAL_L,
2423 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2424 brw_imm_f(0));
2425
2426 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2427 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2428 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2429 #endif
2430 }
2431
2432 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2433 } else if (intel->gen < 6) {
2434 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2435 } else {
2436 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2437 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2438 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2439 src_reg(output_reg[VERT_RESULT_PSIZ])));
2440 }
2441 }
2442 }
2443
2444 void
2445 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2446 {
2447 if (intel->gen < 6) {
2448 /* Clip distance slots are set aside in gen5, but they are not used. It
2449 * is not clear whether we actually need to set aside space for them,
2450 * but the performance cost is negligible.
2451 */
2452 return;
2453 }
2454
2455 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2456 *
2457 * "If a linked set of shaders forming the vertex stage contains no
2458 * static write to gl_ClipVertex or gl_ClipDistance, but the
2459 * application has requested clipping against user clip planes through
2460 * the API, then the coordinate written to gl_Position is used for
2461 * comparison against the user clip planes."
2462 *
2463 * This function is only called if the shader didn't write to
2464 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2465 * if the user wrote to it; otherwise we use gl_Position.
2466 */
2467 gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2468 if (!(c->prog_data.outputs_written
2469 & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2470 clip_vertex = VERT_RESULT_HPOS;
2471 }
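/* Each call covers up to four planes. For example (hypothetical count),
 * with 6 user clip planes enabled, offset 0 handles planes 0-3 and
 * offset 4 handles planes 4-5 in the .xy channels of its slot.
 */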
2472
2473 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2474 ++i) {
2475 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2476 src_reg(output_reg[clip_vertex]),
2477 src_reg(this->userplane[i + offset])));
2478 }
2479 }
2480
2481 void
2482 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2483 {
2484 assert (vert_result < VERT_RESULT_MAX);
2485 reg.type = output_reg[vert_result].type;
2486 current_annotation = output_reg_annotation[vert_result];
2487 /* Copy the register, saturating if necessary */
2488 vec4_instruction *inst = emit(MOV(reg,
2489 src_reg(output_reg[vert_result])));
2490 if ((vert_result == VERT_RESULT_COL0 ||
2491 vert_result == VERT_RESULT_COL1 ||
2492 vert_result == VERT_RESULT_BFC0 ||
2493 vert_result == VERT_RESULT_BFC1) &&
2494 c->key.clamp_vertex_color) {
2495 inst->saturate = true;
2496 }
2497 }
2498
2499 void
2500 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2501 {
2502 struct brw_reg hw_reg = brw_message_reg(mrf);
2503 dst_reg reg = dst_reg(MRF, mrf);
2504 reg.type = BRW_REGISTER_TYPE_F;
2505
2506 switch (vert_result) {
2507 case VERT_RESULT_PSIZ:
2508 /* PSIZ is always in slot 0, and is coupled with other flags. */
2509 current_annotation = "indices, point width, clip flags";
2510 emit_psiz_and_flags(hw_reg);
2511 break;
2512 case BRW_VERT_RESULT_NDC:
2513 current_annotation = "NDC";
2514 emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2515 break;
2516 case BRW_VERT_RESULT_HPOS_DUPLICATE:
2517 case VERT_RESULT_HPOS:
2518 current_annotation = "gl_Position";
2519 emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2520 break;
2521 case VERT_RESULT_CLIP_DIST0:
2522 case VERT_RESULT_CLIP_DIST1:
2523 if (this->c->key.uses_clip_distance) {
2524 emit_generic_urb_slot(reg, vert_result);
2525 } else {
2526 current_annotation = "user clip distances";
2527 emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2528 }
2529 break;
2530 case VERT_RESULT_EDGE:
2531 /* This is present when doing unfilled polygons. We're supposed to copy
2532 * the edge flag from the user-provided vertex array
2533 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2534 * of that attribute (starts as 1.0f). This is then used in clipping to
2535 * determine which edges should be drawn as wireframe.
2536 */
2537 current_annotation = "edge flag";
2538 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2539 glsl_type::float_type, WRITEMASK_XYZW))));
2540 break;
2541 case BRW_VERT_RESULT_PAD:
2542 /* No need to write to this slot */
2543 break;
2544 default:
2545 emit_generic_urb_slot(reg, vert_result);
2546 break;
2547 }
2548 }
2549
2550 static int
2551 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2552 {
2553 struct intel_context *intel = &brw->intel;
2554
2555 if (intel->gen >= 6) {
2556 /* URB data written (does not include the message header reg) must
2557 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2558 * section 5.4.3.2.2: URB_INTERLEAVED.
2559 *
2560 * URB entries are allocated on a multiple of 1024 bits, so an
2561 * extra 128 bits written here to make the end align to 256 is
2562 * no problem.
2563 */
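/* As called from emit_urb_writes(), mlen counts the header register plus
 * the data registers, so it must end up odd: e.g. a header plus 7 data
 * regs (mlen 8) gets padded to mlen 9.
 */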
2564 if ((mlen % 2) != 1)
2565 mlen++;
2566 }
2567
2568 return mlen;
2569 }
2570
2571 /**
2572 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2573 * complete the VS thread.
2574 *
2575 * The VUE layout is documented in Volume 2a.
2576 */
2577 void
2578 vec4_visitor::emit_urb_writes()
2579 {
2580 /* MRF 0 is reserved for the debugger, so start with message header
2581 * in MRF 1.
2582 */
2583 int base_mrf = 1;
2584 int mrf = base_mrf;
2585 /* In the process of generating our URB write message contents, we
2586 * may need to unspill a register or load from an array. Those
2587 * reads would use MRFs 14-15.
2588 */
2589 int max_usable_mrf = 13;
2590
2591 /* The following assertion verifies that max_usable_mrf causes an
2592 * even-numbered amount of URB write data, which will meet gen6's
2593 * requirements for length alignment.
2594 */
2595 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2596
2597 /* First mrf is the g0-based message header containing URB handles and such,
2598 * which is implied in VS_OPCODE_URB_WRITE.
2599 */
2600 mrf++;
2601
2602 if (intel->gen < 6) {
2603 emit_ndc_computation();
2604 }
2605
2606 /* Set up the VUE data for the first URB write */
2607 int slot;
2608 for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2609 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2610
2611 /* If this was max_usable_mrf, we can't fit anything more into this URB
2612 * WRITE.
2613 */
2614 if (mrf > max_usable_mrf) {
2615 slot++;
2616 break;
2617 }
2618 }
2619
2620 current_annotation = "URB write";
2621 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2622 inst->base_mrf = base_mrf;
2623 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2624 inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2625
2626 /* Optional second URB write */
2627 if (!inst->eot) {
2628 mrf = base_mrf + 1;
2629
2630 for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2631 assert(mrf < max_usable_mrf);
2632
2633 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2634 }
2635
2636 current_annotation = "URB write";
2637 inst = emit(VS_OPCODE_URB_WRITE);
2638 inst->base_mrf = base_mrf;
2639 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2640 inst->eot = true;
2641 /* URB destination offset. In the previous write, we got MRFs
2642 * 2-13 minus the one header MRF, so 12 regs. URB offset is in
2643 * URB row increments, and each of our MRFs is half of one of
2644 * those, since we're doing interleaved writes.
2645 */
2646 inst->offset = (max_usable_mrf - base_mrf) / 2;
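/* With base_mrf == 1 and max_usable_mrf == 13, that works out to
 * (13 - 1) / 2 == 6 URB rows.
 */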
2647 }
2648 }
2649
2650 src_reg
2651 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2652 src_reg *reladdr, int reg_offset)
2653 {
2654 /* Because we store the values to scratch interleaved like our
2655 * vertex data, we need to scale the vec4 index by 2.
2656 */
2657 int message_header_scale = 2;
2658
2659 /* Pre-gen6, the message header uses byte offsets instead of vec4
2660 * (16-byte) offset units.
2661 */
2662 if (intel->gen < 6)
2663 message_header_scale *= 16;
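/* e.g. a value at reg_offset 3 yields an offset of 6 on gen6+, or a
 * byte offset of 96 on older parts.
 */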
2664
2665 if (reladdr) {
2666 src_reg index = src_reg(this, glsl_type::int_type);
2667
2668 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2669 emit_before(inst, MUL(dst_reg(index),
2670 index, src_reg(message_header_scale)));
2671
2672 return index;
2673 } else {
2674 return src_reg(reg_offset * message_header_scale);
2675 }
2676 }
2677
2678 src_reg
2679 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2680 src_reg *reladdr, int reg_offset)
2681 {
2682 if (reladdr) {
2683 src_reg index = src_reg(this, glsl_type::int_type);
2684
2685 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2686
2687 /* Pre-gen6, the message header uses byte offsets instead of vec4
2688 * (16-byte) offset units.
2689 */
2690 if (intel->gen < 6) {
2691 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2692 }
2693
2694 return index;
2695 } else {
2696 int message_header_scale = intel->gen < 6 ? 16 : 1;
2697 return src_reg(reg_offset * message_header_scale);
2698 }
2699 }
2700
2701 /**
2702 * Emits an instruction before @inst to load the value named by @orig_src
2703 * from scratch space at @base_offset to @temp.
2704 *
2705 * @base_offset is measured in 32-byte units (the size of a register).
2706 */
2707 void
2708 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2709 dst_reg temp, src_reg orig_src,
2710 int base_offset)
2711 {
2712 int reg_offset = base_offset + orig_src.reg_offset;
2713 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2714
2715 emit_before(inst, SCRATCH_READ(temp, index));
2716 }
2717
2718 /**
2719 * Emits an instruction after @inst to store the value to be written
2720 * to @orig_dst to scratch space at @base_offset, from @temp.
2721 *
2722 * @base_offset is measured in 32-byte units (the size of a register).
2723 */
2724 void
2725 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2726 {
2727 int reg_offset = base_offset + inst->dst.reg_offset;
2728 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2729
2730 /* Create a temporary register to store *inst's result in.
2731 *
2732 * We have to be careful in MOVing from our temporary result register in
2733 * the scratch write. If we swizzle from channels of the temporary that
2734 * weren't initialized, it will confuse live interval analysis, which will
2735 * make spilling fail to make progress.
2736 */
2737 src_reg temp = src_reg(this, glsl_type::vec4_type);
2738 temp.type = inst->dst.type;
2739 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2740 int swizzles[4];
2741 for (int i = 0; i < 4; i++)
2742 if (inst->dst.writemask & (1 << i))
2743 swizzles[i] = i;
2744 else
2745 swizzles[i] = first_writemask_chan;
2746 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2747 swizzles[2], swizzles[3]);
2748
2749 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2750 inst->dst.writemask));
2751 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2752 write->predicate = inst->predicate;
2753 write->ir = inst->ir;
2754 write->annotation = inst->annotation;
2755 inst->insert_after(write);
2756
2757 inst->dst.file = temp.file;
2758 inst->dst.reg = temp.reg;
2759 inst->dst.reg_offset = temp.reg_offset;
2760 inst->dst.reladdr = NULL;
2761 }
2762
2763 /**
2764 * We can't generally support array access in GRF space, because a
2765 * single instruction's destination can only span 2 contiguous
2766 * registers. So, we send all GRF arrays that get variable index
2767 * access to scratch space.
2768 */
2769 void
2770 vec4_visitor::move_grf_array_access_to_scratch()
2771 {
2772 int scratch_loc[this->virtual_grf_count];
2773
2774 for (int i = 0; i < this->virtual_grf_count; i++) {
2775 scratch_loc[i] = -1;
2776 }
2777
2778 /* First, calculate the set of virtual GRFs that need to be punted
2779 * to scratch due to having any array access on them, and where in
2780 * scratch.
2781 */
2782 foreach_list(node, &this->instructions) {
2783 vec4_instruction *inst = (vec4_instruction *)node;
2784
2785 if (inst->dst.file == GRF && inst->dst.reladdr &&
2786 scratch_loc[inst->dst.reg] == -1) {
2787 scratch_loc[inst->dst.reg] = c->last_scratch;
2788 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2789 }
2790
2791 for (int i = 0 ; i < 3; i++) {
2792 src_reg *src = &inst->src[i];
2793
2794 if (src->file == GRF && src->reladdr &&
2795 scratch_loc[src->reg] == -1) {
2796 scratch_loc[src->reg] = c->last_scratch;
2797 c->last_scratch += this->virtual_grf_sizes[src->reg];
2798 }
2799 }
2800 }
2801
2802 /* Now, for anything that will be accessed through scratch, rewrite
2803 * it to load/store. Note that this is a _safe list walk, because
2804 * we may generate a new scratch_write instruction after the one
2805 * we're processing.
2806 */
2807 foreach_list_safe(node, &this->instructions) {
2808 vec4_instruction *inst = (vec4_instruction *)node;
2809
2810 /* Set up the annotation tracking for new generated instructions. */
2811 base_ir = inst->ir;
2812 current_annotation = inst->annotation;
2813
2814 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2815 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2816 }
2817
2818 for (int i = 0 ; i < 3; i++) {
2819 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2820 continue;
2821
2822 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2823
2824 emit_scratch_read(inst, temp, inst->src[i],
2825 scratch_loc[inst->src[i].reg]);
2826
2827 inst->src[i].file = temp.file;
2828 inst->src[i].reg = temp.reg;
2829 inst->src[i].reg_offset = temp.reg_offset;
2830 inst->src[i].reladdr = NULL;
2831 }
2832 }
2833 }
2834
2835 /**
2836 * Emits an instruction before @inst to load the value named by @orig_src
2837 * from the pull constant buffer (surface) at @base_offset to @temp.
2838 */
2839 void
2840 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2841 dst_reg temp, src_reg orig_src,
2842 int base_offset)
2843 {
2844 int reg_offset = base_offset + orig_src.reg_offset;
2845 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2846 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2847 vec4_instruction *load;
2848
2849 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2850 temp, index, offset);
2851 load->base_mrf = 14;
2852 load->mlen = 1;
2853 emit_before(inst, load);
2854 }
2855
2856 /**
2857 * Implements array access of uniforms by inserting a
2858 * PULL_CONSTANT_LOAD instruction.
2859 *
2860 * Unlike temporary GRF array access (which we don't support, due to
2861 * the difficulty of doing relative addressing on instruction
2862 * destinations), we could potentially do array access of uniforms
2863 * that were loaded in GRF space as push constants. In real-world
2864 * usage we've seen, though, the arrays being used are always larger
2865 * than we could load as push constants, so just always move all
2866 * uniform array access out to a pull constant buffer.
2867 */
2868 void
2869 vec4_visitor::move_uniform_array_access_to_pull_constants()
2870 {
2871 int pull_constant_loc[this->uniforms];
2872
2873 for (int i = 0; i < this->uniforms; i++) {
2874 pull_constant_loc[i] = -1;
2875 }
2876
2877 /* Walk through and find array access of uniforms. Put a copy of that
2878 * uniform in the pull constant buffer.
2879 *
2880 * Note that we don't move constant-indexed accesses to arrays. No
2881 * testing has been done of the performance impact of this choice.
2882 */
2883 foreach_list_safe(node, &this->instructions) {
2884 vec4_instruction *inst = (vec4_instruction *)node;
2885
2886 for (int i = 0 ; i < 3; i++) {
2887 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2888 continue;
2889
2890 int uniform = inst->src[i].reg;
2891
2892 /* If this array isn't already present in the pull constant buffer,
2893 * add it.
2894 */
2895 if (pull_constant_loc[uniform] == -1) {
2896 const float **values = &prog_data->param[uniform * 4];
2897
2898 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
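/* For example (hypothetical count), if 20 pull params have been
 * recorded so far, this uniform array starts at vec4 slot 5 of the
 * pull constant buffer.
 */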
2899
2900 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2901 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2902 }
2903 }
2904
2905 /* Set up the annotation tracking for new generated instructions. */
2906 base_ir = inst->ir;
2907 current_annotation = inst->annotation;
2908
2909 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2910
2911 emit_pull_constant_load(inst, temp, inst->src[i],
2912 pull_constant_loc[uniform]);
2913
2914 inst->src[i].file = temp.file;
2915 inst->src[i].reg = temp.reg;
2916 inst->src[i].reg_offset = temp.reg_offset;
2917 inst->src[i].reladdr = NULL;
2918 }
2919 }
2920
2921 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2922 * no need to track them as larger-than-vec4 objects. This will be
2923 * relied on in cutting out unused uniform vectors from push
2924 * constants.
2925 */
2926 split_uniform_registers();
2927 }
2928
2929 void
2930 vec4_visitor::resolve_ud_negate(src_reg *reg)
2931 {
2932 if (reg->type != BRW_REGISTER_TYPE_UD ||
2933 !reg->negate)
2934 return;
2935
2936 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2937 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2938 *reg = temp;
2939 }
2940
2941 vec4_visitor::vec4_visitor(struct brw_context *brw,
2942 struct brw_vs_compile *c,
2943 struct gl_shader_program *prog,
2944 struct brw_shader *shader,
2945 void *mem_ctx)
2946 {
2947 this->c = c;
2948 this->brw = brw;
2949 this->intel = &brw->intel;
2950 this->ctx = &intel->ctx;
2951 this->prog = prog;
2952 this->shader = shader;
2953
2954 this->mem_ctx = mem_ctx;
2955 this->failed = false;
2956
2957 this->base_ir = NULL;
2958 this->current_annotation = NULL;
2959 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
2960
2962 this->vp = &c->vp->program;
2963 this->prog_data = &c->prog_data;
2964
2965 this->variable_ht = hash_table_ctor(0,
2966 hash_table_pointer_hash,
2967 hash_table_pointer_compare);
2968
2969 this->virtual_grf_def = NULL;
2970 this->virtual_grf_use = NULL;
2971 this->virtual_grf_sizes = NULL;
2972 this->virtual_grf_count = 0;
2973 this->virtual_grf_reg_map = NULL;
2974 this->virtual_grf_reg_count = 0;
2975 this->virtual_grf_array_size = 0;
2976 this->live_intervals_valid = false;
2977
2978 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2979
2980 this->uniforms = 0;
2981 }
2982
2983 vec4_visitor::~vec4_visitor()
2984 {
2985 hash_table_dtor(this->variable_ht);
2986 }
2987
2988
2989 void
2990 vec4_visitor::fail(const char *format, ...)
2991 {
2992 va_list va;
2993 char *msg;
2994
2995 if (failed)
2996 return;
2997
2998 failed = true;
2999
3000 va_start(va, format);
3001 msg = ralloc_vasprintf(mem_ctx, format, va);
3002 va_end(va);
3003 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3004
3005 this->fail_msg = msg;
3006
3007 if (INTEL_DEBUG & DEBUG_VS) {
3008 fprintf(stderr, "%s", msg);
3009 }
3010 }
3011
3012 } /* namespace brw */