i965: Remove never used RSR and RSL opcodes.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->ir = v->base_ir;
42 this->annotation = v->current_annotation;
43 }
44
45 vec4_instruction *
46 vec4_visitor::emit(vec4_instruction *inst)
47 {
48 this->instructions.push_tail(inst);
49
50 return inst;
51 }
52
53 vec4_instruction *
54 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
55 {
56 new_inst->ir = inst->ir;
57 new_inst->annotation = inst->annotation;
58
59 inst->insert_before(new_inst);
60
61 return inst;
62 }
63
64 vec4_instruction *
65 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
66 src_reg src0, src_reg src1, src_reg src2)
67 {
68 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
69 src0, src1, src2));
70 }
71
72
73 vec4_instruction *
74 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
75 {
76 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
83 }
84
85 vec4_instruction *
86 vec4_visitor::emit(enum opcode opcode)
87 {
88 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
89 }
90
91 #define ALU1(op) \
92 vec4_instruction * \
93 vec4_visitor::op(dst_reg dst, src_reg src0) \
94 { \
95 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
96 src0); \
97 }
98
99 #define ALU2(op) \
100 vec4_instruction * \
101 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
102 { \
103 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
104 src0, src1); \
105 }
106
107 #define ALU3(op) \
108 vec4_instruction * \
109 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
110 { \
111 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
112 src0, src1, src2); \
113 }
114
115 ALU1(NOT)
116 ALU1(MOV)
117 ALU1(FRC)
118 ALU1(RNDD)
119 ALU1(RNDE)
120 ALU1(RNDZ)
121 ALU1(F32TO16)
122 ALU1(F16TO32)
123 ALU2(ADD)
124 ALU2(MUL)
125 ALU2(MACH)
126 ALU2(AND)
127 ALU2(OR)
128 ALU2(XOR)
129 ALU2(DP3)
130 ALU2(DP4)
131 ALU2(DPH)
132 ALU2(SHL)
133 ALU2(SHR)
134 ALU2(ASR)
135 ALU3(LRP)
136 ALU1(BFREV)
137 ALU3(BFE)
138 ALU2(BFI1)
139 ALU3(BFI2)
140 ALU1(FBH)
141 ALU1(FBL)
142 ALU1(CBIT)
143 ALU3(MAD)
144
145 /** Gen4 predicated IF. */
146 vec4_instruction *
147 vec4_visitor::IF(uint32_t predicate)
148 {
149 vec4_instruction *inst;
150
151 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
152 inst->predicate = predicate;
153
154 return inst;
155 }
156
157 /** Gen6+ IF with embedded comparison. */
158 vec4_instruction *
159 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
160 {
161 assert(brw->gen >= 6);
162
163 vec4_instruction *inst;
164
165 resolve_ud_negate(&src0);
166 resolve_ud_negate(&src1);
167
168 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
169 src0, src1);
170 inst->conditional_mod = condition;
171
172 return inst;
173 }
174
175 /**
176 * CMP: Sets the low bit of the destination channels with the result
177 * of the comparison, while the upper bits are undefined, and updates
178 * the flag register with the packed 16 bits of the result.
179 */
180 vec4_instruction *
181 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
182 {
183 vec4_instruction *inst;
184
185 /* original gen4 does type conversion to the destination type
186 * before comparison, producing garbage results for floating
187 * point comparisons.
188 */
189 if (brw->gen == 4) {
190 dst.type = src0.type;
191 if (dst.file == HW_REG)
192 dst.fixed_hw_reg.type = dst.type;
193 }
194
195 resolve_ud_negate(&src0);
196 resolve_ud_negate(&src1);
197
198 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
199 inst->conditional_mod = condition;
200
201 return inst;
202 }
203
204 vec4_instruction *
205 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
206 {
207 vec4_instruction *inst;
208
209 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
210 dst, index);
211 inst->base_mrf = 14;
212 inst->mlen = 2;
213
214 return inst;
215 }
216
217 vec4_instruction *
218 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
219 {
220 vec4_instruction *inst;
221
222 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
223 dst, src, index);
224 inst->base_mrf = 13;
225 inst->mlen = 3;
226
227 return inst;
228 }
229
230 void
231 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
232 {
233 static enum opcode dot_opcodes[] = {
234 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
235 };
236
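/* 'elements' is expected to be 2, 3 or 4, selecting DP2, DP3 or DP4 from the
 * table above.
 */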
237 emit(dot_opcodes[elements - 2], dst, src0, src1);
238 }
239
240 src_reg
241 vec4_visitor::fix_3src_operand(src_reg src)
242 {
243 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
244 * able to use vertical stride of zero to replicate the vec4 uniform, like
245 *
246 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
247 *
248 * But you can't, since vertical stride is always four in three-source
249 * instructions. Instead, insert a MOV instruction to do the replication so
250 * that the three-source instruction can consume it.
251 */
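/* Illustrative sketch (names are placeholders, not emitted verbatim): a
 * uniform source such as u1.xyzw:f is first copied with
 *    MOV tmpN, u1.xyzw:f
 * and the three-source instruction then reads the GRF tmpN instead.
 */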
252
253 /* The MOV is only needed if the source is a uniform or immediate. */
254 if (src.file != UNIFORM && src.file != IMM)
255 return src;
256
257 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
258 expanded.type = src.type;
259 emit(MOV(expanded, src));
260 return src_reg(expanded);
261 }
262
263 src_reg
264 vec4_visitor::fix_math_operand(src_reg src)
265 {
266 /* The gen6 math instruction ignores the source modifiers --
267 * swizzle, abs, negate, and at least some parts of the register
268 * region description.
269 *
270 * Rather than trying to enumerate all these cases, *always* expand the
271 * operand to a temp GRF for gen6.
272 *
273 * For gen7, keep the operand as-is, except if immediate, which gen7 still
274 * can't use.
275 */
276
277 if (brw->gen == 7 && src.file != IMM)
278 return src;
279
280 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
281 expanded.type = src.type;
282 emit(MOV(expanded, src));
283 return src_reg(expanded);
284 }
285
286 void
287 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
288 {
289 src = fix_math_operand(src);
290
291 if (dst.writemask != WRITEMASK_XYZW) {
292 /* The gen6 math instruction must be align1, so we can't do
293 * writemasks.
294 */
295 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
296
297 emit(opcode, temp_dst, src);
298
299 emit(MOV(dst, src_reg(temp_dst)));
300 } else {
301 emit(opcode, dst, src);
302 }
303 }
304
305 void
306 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
307 {
308 vec4_instruction *inst = emit(opcode, dst, src);
309 inst->base_mrf = 1;
310 inst->mlen = 1;
311 }
312
313 void
314 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
315 {
316 switch (opcode) {
317 case SHADER_OPCODE_RCP:
318 case SHADER_OPCODE_RSQ:
319 case SHADER_OPCODE_SQRT:
320 case SHADER_OPCODE_EXP2:
321 case SHADER_OPCODE_LOG2:
322 case SHADER_OPCODE_SIN:
323 case SHADER_OPCODE_COS:
324 break;
325 default:
326 assert(!"not reached: bad math opcode");
327 return;
328 }
329
330 if (brw->gen >= 6) {
331 return emit_math1_gen6(opcode, dst, src);
332 } else {
333 return emit_math1_gen4(opcode, dst, src);
334 }
335 }
336
337 void
338 vec4_visitor::emit_math2_gen6(enum opcode opcode,
339 dst_reg dst, src_reg src0, src_reg src1)
340 {
341 src0 = fix_math_operand(src0);
342 src1 = fix_math_operand(src1);
343
344 if (dst.writemask != WRITEMASK_XYZW) {
345 /* The gen6 math instruction must be align1, so we can't do
346 * writemasks.
347 */
348 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
349 temp_dst.type = dst.type;
350
351 emit(opcode, temp_dst, src0, src1);
352
353 emit(MOV(dst, src_reg(temp_dst)));
354 } else {
355 emit(opcode, dst, src0, src1);
356 }
357 }
358
359 void
360 vec4_visitor::emit_math2_gen4(enum opcode opcode,
361 dst_reg dst, src_reg src0, src_reg src1)
362 {
363 vec4_instruction *inst = emit(opcode, dst, src0, src1);
364 inst->base_mrf = 1;
365 inst->mlen = 2;
366 }
367
368 void
369 vec4_visitor::emit_math(enum opcode opcode,
370 dst_reg dst, src_reg src0, src_reg src1)
371 {
372 switch (opcode) {
373 case SHADER_OPCODE_POW:
374 case SHADER_OPCODE_INT_QUOTIENT:
375 case SHADER_OPCODE_INT_REMAINDER:
376 break;
377 default:
378 assert(!"not reached: unsupported binary math opcode");
379 return;
380 }
381
382 if (brw->gen >= 6) {
383 return emit_math2_gen6(opcode, dst, src0, src1);
384 } else {
385 return emit_math2_gen4(opcode, dst, src0, src1);
386 }
387 }
388
389 void
390 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
391 {
392 if (brw->gen < 7)
393 assert(!"ir_unop_pack_half_2x16 should be lowered");
394
395 assert(dst.type == BRW_REGISTER_TYPE_UD);
396 assert(src0.type == BRW_REGISTER_TYPE_F);
397
398 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
399 *
400 * Because this instruction does not have a 16-bit floating-point type,
401 * the destination data type must be Word (W).
402 *
403 * The destination must be DWord-aligned and specify a horizontal stride
404 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
405 * each destination channel and the upper word is not modified.
406 *
407 * The above restriction implies that the f32to16 instruction must use
408 * align1 mode, because only in align1 mode is it possible to specify
409 * horizontal stride. We choose here to defy the hardware docs and emit
410 * align16 instructions.
411 *
412 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
413 * instructions. I was partially successful in that the code passed all
414 * tests. However, the code was dubiously correct and fragile, and the
415 * tests were not harsh enough to probe that frailty. Not trusting the
416 * code, I chose instead to remain in align16 mode in defiance of the hw
417 * docs).
418 *
419 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
420 * simulator, emitting a f32to16 in align16 mode with UD as destination
421 * data type is safe. The behavior differs from that specified in the PRM
422 * in that the upper word of each destination channel is cleared to 0.
423 */
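/* Illustrative example: for src0.xy = (1.0, 2.0), f32to16 produces the half
 * encodings 0x3C00 and 0x4000, and the SHL/OR below combine them into
 * dst = 0x40003C00, matching GLSL's packHalf2x16().
 */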
424
425 dst_reg tmp_dst(this, glsl_type::uvec2_type);
426 src_reg tmp_src(tmp_dst);
427
428 #if 0
429 /* Verify the undocumented behavior on which the following instructions
430 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
431 * then the result of the bit-or instruction below will be incorrect.
432 *
433 * You should inspect the disasm output in order to verify that the MOV is
434 * not optimized away.
435 */
436 emit(MOV(tmp_dst, src_reg(0x12345678u)));
437 #endif
438
439 /* Give tmp the form below, where "." means untouched.
440 *
441 * w z y x w z y x
442 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
443 *
444 * The upper word of each write-channel must be 0 for the
445 * following bit-shift and bit-or instructions to work. Note that this
446 * relies on the undocumented hardware behavior mentioned above.
447 */
448 tmp_dst.writemask = WRITEMASK_XY;
449 emit(F32TO16(tmp_dst, src0));
450
451 /* Give the write-channels of dst the form:
452 * 0xhhhh0000
453 */
454 tmp_src.swizzle = SWIZZLE_Y;
455 emit(SHL(dst, tmp_src, src_reg(16u)));
456
457 /* Finally, give the write-channels of dst the form of packHalf2x16's
458 * output:
459 * 0xhhhhllll
460 */
461 tmp_src.swizzle = SWIZZLE_X;
462 emit(OR(dst, src_reg(dst), tmp_src));
463 }
464
465 void
466 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
467 {
468 if (brw->gen < 7)
469 assert(!"ir_unop_unpack_half_2x16 should be lowered");
470
471 assert(dst.type == BRW_REGISTER_TYPE_F);
472 assert(src0.type == BRW_REGISTER_TYPE_UD);
473
474 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
475 *
476 * Because this instruction does not have a 16-bit floating-point type,
477 * the source data type must be Word (W). The destination type must be
478 * F (Float).
479 *
480 * To use W as the source data type, we must adjust horizontal strides,
481 * which is only possible in align1 mode. All my [chadv] attempts at
482 * emitting align1 instructions for unpackHalf2x16 failed to pass the
483 * Piglit tests, so I gave up.
484 *
485 * I've verified that, on gen7 hardware and the simulator, it is safe to
486 * emit f16to32 in align16 mode with UD as source data type.
487 */
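/* Illustrative example: src0.x = 0x40003C00 is split into 0x3C00 and 0x4000
 * below, which f16to32 converts to dst.xy = (1.0, 2.0), matching GLSL's
 * unpackHalf2x16().
 */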
488
489 dst_reg tmp_dst(this, glsl_type::uvec2_type);
490 src_reg tmp_src(tmp_dst);
491
492 tmp_dst.writemask = WRITEMASK_X;
493 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
494
495 tmp_dst.writemask = WRITEMASK_Y;
496 emit(SHR(tmp_dst, src0, src_reg(16u)));
497
498 dst.writemask = WRITEMASK_XY;
499 emit(F16TO32(dst, tmp_src));
500 }
501
502 void
503 vec4_visitor::visit_instructions(const exec_list *list)
504 {
505 foreach_list(node, list) {
506 ir_instruction *ir = (ir_instruction *)node;
507
508 base_ir = ir;
509 ir->accept(this);
510 }
511 }
512
513
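/* Returns the number of vec4 slots a value of the given type occupies in
 * this backend: float, vec3 and vec4 each take one slot, mat4 takes four,
 * float[10] takes ten, and structs take the sum of their fields.
 */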
514 static int
515 type_size(const struct glsl_type *type)
516 {
517 unsigned int i;
518 int size;
519
520 switch (type->base_type) {
521 case GLSL_TYPE_UINT:
522 case GLSL_TYPE_INT:
523 case GLSL_TYPE_FLOAT:
524 case GLSL_TYPE_BOOL:
525 if (type->is_matrix()) {
526 return type->matrix_columns;
527 } else {
528 /* Regardless of the size of the vector, it gets a vec4. This is bad
529 * packing for things like floats, but otherwise arrays become a
530 * mess. Hopefully a later pass over the code can pack scalars
531 * down if appropriate.
532 */
533 return 1;
534 }
535 case GLSL_TYPE_ARRAY:
536 assert(type->length > 0);
537 return type_size(type->fields.array) * type->length;
538 case GLSL_TYPE_STRUCT:
539 size = 0;
540 for (i = 0; i < type->length; i++) {
541 size += type_size(type->fields.structure[i].type);
542 }
543 return size;
544 case GLSL_TYPE_SAMPLER:
545 /* Samplers take up one slot in UNIFORMS[], but they're baked in
546 * at link time.
547 */
548 return 1;
549 case GLSL_TYPE_VOID:
550 case GLSL_TYPE_ERROR:
551 case GLSL_TYPE_INTERFACE:
552 assert(0);
553 break;
554 }
555
556 return 0;
557 }
558
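/* Allocates a new virtual GRF spanning 'size' vec4 slots, growing the
 * bookkeeping arrays as needed, and returns its index.
 */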
559 int
560 vec4_visitor::virtual_grf_alloc(int size)
561 {
562 if (virtual_grf_array_size <= virtual_grf_count) {
563 if (virtual_grf_array_size == 0)
564 virtual_grf_array_size = 16;
565 else
566 virtual_grf_array_size *= 2;
567 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
568 virtual_grf_array_size);
569 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
570 virtual_grf_array_size);
571 }
572 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
573 virtual_grf_reg_count += size;
574 virtual_grf_sizes[virtual_grf_count] = size;
575 return virtual_grf_count++;
576 }
577
578 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
579 {
580 init();
581
582 this->file = GRF;
583 this->reg = v->virtual_grf_alloc(type_size(type));
584
585 if (type->is_array() || type->is_record()) {
586 this->swizzle = BRW_SWIZZLE_NOOP;
587 } else {
588 this->swizzle = swizzle_for_size(type->vector_elements);
589 }
590
591 this->type = brw_type_for_base_type(type);
592 }
593
594 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
595 {
596 init();
597
598 this->file = GRF;
599 this->reg = v->virtual_grf_alloc(type_size(type));
600
601 if (type->is_array() || type->is_record()) {
602 this->writemask = WRITEMASK_XYZW;
603 } else {
604 this->writemask = (1 << type->vector_elements) - 1;
605 }
606
607 this->type = brw_type_for_base_type(type);
608 }
609
610 /* Our support for uniforms is piggy-backed on the struct
611 * gl_fragment_program, because that's where the values actually
612 * get stored, rather than in some global gl_shader_program uniform
613 * store.
614 */
615 void
616 vec4_visitor::setup_uniform_values(ir_variable *ir)
617 {
618 int namelen = strlen(ir->name);
619
620 /* The data for our (non-builtin) uniforms is stored in a series of
621 * gl_uniform_driver_storage structs for each subcomponent that
622 * glGetUniformLocation() could name. We know it's been set up in the same
623 * order we'd walk the type, so walk the list of storage and find anything
624 * with our name, or the prefix of a component that starts with our name.
625 */
626 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
627 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
628
629 if (strncmp(ir->name, storage->name, namelen) != 0 ||
630 (storage->name[namelen] != 0 &&
631 storage->name[namelen] != '.' &&
632 storage->name[namelen] != '[')) {
633 continue;
634 }
635
636 gl_constant_value *components = storage->storage;
637 unsigned vector_count = (MAX2(storage->array_elements, 1) *
638 storage->type->matrix_columns);
639
640 for (unsigned s = 0; s < vector_count; s++) {
641 uniform_vector_size[uniforms] = storage->type->vector_elements;
642
643 int i;
644 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
645 prog_data->param[uniforms * 4 + i] = &components->f;
646 components++;
647 }
648 for (; i < 4; i++) {
649 static float zero = 0;
650 prog_data->param[uniforms * 4 + i] = &zero;
651 }
652
653 uniforms++;
654 }
655 }
656 }
657
658 void
659 vec4_visitor::setup_uniform_clipplane_values()
660 {
661 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
662
663 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
664 this->uniform_vector_size[this->uniforms] = 4;
665 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
666 this->userplane[i].type = BRW_REGISTER_TYPE_F;
667 for (int j = 0; j < 4; ++j) {
668 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
669 }
670 ++this->uniforms;
671 }
672 }
673
674 /* Our support for builtin uniforms is even scarier than non-builtin.
675 * It sits on top of the PROG_STATE_VAR parameters that are
676 * automatically updated from GL context state.
677 */
678 void
679 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
680 {
681 const ir_state_slot *const slots = ir->state_slots;
682 assert(ir->state_slots != NULL);
683
684 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
685 /* This state reference has already been setup by ir_to_mesa,
686 * but we'll get the same index back here. We can reference
687 * ParameterValues directly, since unlike brw_fs.cpp, we never
688 * add new state references during compile.
689 */
690 int index = _mesa_add_state_reference(this->prog->Parameters,
691 (gl_state_index *)slots[i].tokens);
692 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
693
694 this->uniform_vector_size[this->uniforms] = 0;
695 /* Add each of the unique swizzled channels of the element.
696 * This will end up matching the size of the glsl_type of this field.
697 */
698 int last_swiz = -1;
699 for (unsigned int j = 0; j < 4; j++) {
700 int swiz = GET_SWZ(slots[i].swizzle, j);
701 last_swiz = swiz;
702
703 prog_data->param[this->uniforms * 4 + j] = &values[swiz];
704 if (swiz <= last_swiz)
705 this->uniform_vector_size[this->uniforms]++;
706 }
707 this->uniforms++;
708 }
709 }
710
711 dst_reg *
712 vec4_visitor::variable_storage(ir_variable *var)
713 {
714 return (dst_reg *)hash_table_find(this->variable_ht, var);
715 }
716
717 void
718 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
719 {
720 ir_expression *expr = ir->as_expression();
721
722 *predicate = BRW_PREDICATE_NORMAL;
723
724 if (expr) {
725 src_reg op[2];
726 vec4_instruction *inst;
727
728 assert(expr->get_num_operands() <= 2);
729 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
730 expr->operands[i]->accept(this);
731 op[i] = this->result;
732
733 resolve_ud_negate(&op[i]);
734 }
735
736 switch (expr->operation) {
737 case ir_unop_logic_not:
738 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
739 inst->conditional_mod = BRW_CONDITIONAL_Z;
740 break;
741
742 case ir_binop_logic_xor:
743 inst = emit(XOR(dst_null_d(), op[0], op[1]));
744 inst->conditional_mod = BRW_CONDITIONAL_NZ;
745 break;
746
747 case ir_binop_logic_or:
748 inst = emit(OR(dst_null_d(), op[0], op[1]));
749 inst->conditional_mod = BRW_CONDITIONAL_NZ;
750 break;
751
752 case ir_binop_logic_and:
753 inst = emit(AND(dst_null_d(), op[0], op[1]));
754 inst->conditional_mod = BRW_CONDITIONAL_NZ;
755 break;
756
757 case ir_unop_f2b:
758 if (brw->gen >= 6) {
759 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
760 } else {
761 inst = emit(MOV(dst_null_f(), op[0]));
762 inst->conditional_mod = BRW_CONDITIONAL_NZ;
763 }
764 break;
765
766 case ir_unop_i2b:
767 if (brw->gen >= 6) {
768 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
769 } else {
770 inst = emit(MOV(dst_null_d(), op[0]));
771 inst->conditional_mod = BRW_CONDITIONAL_NZ;
772 }
773 break;
774
775 case ir_binop_all_equal:
776 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
777 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
778 break;
779
780 case ir_binop_any_nequal:
781 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
782 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
783 break;
784
785 case ir_unop_any:
786 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
787 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
788 break;
789
790 case ir_binop_greater:
791 case ir_binop_gequal:
792 case ir_binop_less:
793 case ir_binop_lequal:
794 case ir_binop_equal:
795 case ir_binop_nequal:
796 emit(CMP(dst_null_d(), op[0], op[1],
797 brw_conditional_for_comparison(expr->operation)));
798 break;
799
800 default:
801 assert(!"not reached");
802 break;
803 }
804 return;
805 }
806
807 ir->accept(this);
808
809 resolve_ud_negate(&this->result);
810
811 if (brw->gen >= 6) {
812 vec4_instruction *inst = emit(AND(dst_null_d(),
813 this->result, src_reg(1)));
814 inst->conditional_mod = BRW_CONDITIONAL_NZ;
815 } else {
816 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
817 inst->conditional_mod = BRW_CONDITIONAL_NZ;
818 }
819 }
820
821 /**
822 * Emit a gen6 IF statement with the comparison folded into the IF
823 * instruction.
824 */
825 void
826 vec4_visitor::emit_if_gen6(ir_if *ir)
827 {
828 ir_expression *expr = ir->condition->as_expression();
829
830 if (expr) {
831 src_reg op[2];
832 dst_reg temp;
833
834 assert(expr->get_num_operands() <= 2);
835 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
836 expr->operands[i]->accept(this);
837 op[i] = this->result;
838 }
839
840 switch (expr->operation) {
841 case ir_unop_logic_not:
842 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
843 return;
844
845 case ir_binop_logic_xor:
846 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
847 return;
848
849 case ir_binop_logic_or:
850 temp = dst_reg(this, glsl_type::bool_type);
851 emit(OR(temp, op[0], op[1]));
852 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
853 return;
854
855 case ir_binop_logic_and:
856 temp = dst_reg(this, glsl_type::bool_type);
857 emit(AND(temp, op[0], op[1]));
858 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
859 return;
860
861 case ir_unop_f2b:
862 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
863 return;
864
865 case ir_unop_i2b:
866 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
867 return;
868
869 case ir_binop_greater:
870 case ir_binop_gequal:
871 case ir_binop_less:
872 case ir_binop_lequal:
873 case ir_binop_equal:
874 case ir_binop_nequal:
875 emit(IF(op[0], op[1],
876 brw_conditional_for_comparison(expr->operation)));
877 return;
878
879 case ir_binop_all_equal:
880 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
881 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
882 return;
883
884 case ir_binop_any_nequal:
885 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
886 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
887 return;
888
889 case ir_unop_any:
890 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
891 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
892 return;
893
894 default:
895 assert(!"not reached");
896 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
897 return;
898 }
899 return;
900 }
901
902 ir->condition->accept(this);
903
904 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
905 }
906
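/* Returns a copy of 'r' with only the channels in 'mask' enabled, e.g.
 * with_writemask(dst, WRITEMASK_XY).
 */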
907 dst_reg
908 with_writemask(dst_reg const & r, int mask)
909 {
910 dst_reg result = r;
911 result.writemask = mask;
912 return result;
913 }
914
915
916 void
917 vec4_visitor::visit(ir_variable *ir)
918 {
919 dst_reg *reg = NULL;
920
921 if (variable_storage(ir))
922 return;
923
924 switch (ir->mode) {
925 case ir_var_shader_in:
926 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
927 break;
928
929 case ir_var_shader_out:
930 reg = new(mem_ctx) dst_reg(this, ir->type);
931
932 for (int i = 0; i < type_size(ir->type); i++) {
933 output_reg[ir->location + i] = *reg;
934 output_reg[ir->location + i].reg_offset = i;
935 output_reg[ir->location + i].type =
936 brw_type_for_base_type(ir->type->get_scalar_type());
937 output_reg_annotation[ir->location + i] = ir->name;
938 }
939 break;
940
941 case ir_var_auto:
942 case ir_var_temporary:
943 reg = new(mem_ctx) dst_reg(this, ir->type);
944 break;
945
946 case ir_var_uniform:
947 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
948
949 /* Thanks to the lower_ubo_reference pass, we will see only
950 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
951 * variables, so no need for them to be in variable_ht.
952 */
953 if (ir->is_in_uniform_block())
954 return;
955
956 /* Track how big the whole uniform variable is, in case we need to put a
957 * copy of its data into pull constants for array access.
958 */
959 this->uniform_size[this->uniforms] = type_size(ir->type);
960
961 if (!strncmp(ir->name, "gl_", 3)) {
962 setup_builtin_uniform_values(ir);
963 } else {
964 setup_uniform_values(ir);
965 }
966 break;
967
968 case ir_var_system_value:
969 reg = make_reg_for_system_value(ir);
970 break;
971
972 default:
973 assert(!"not reached");
974 }
975
976 reg->type = brw_type_for_base_type(ir->type);
977 hash_table_insert(this->variable_ht, reg, ir);
978 }
979
980 void
981 vec4_visitor::visit(ir_loop *ir)
982 {
983 dst_reg counter;
984
985 /* We don't want debugging output to print the whole body of the
986 * loop as the annotation.
987 */
988 this->base_ir = NULL;
989
990 if (ir->counter != NULL) {
991 this->base_ir = ir->counter;
992 ir->counter->accept(this);
993 counter = *(variable_storage(ir->counter));
994
995 if (ir->from != NULL) {
996 this->base_ir = ir->from;
997 ir->from->accept(this);
998
999 emit(MOV(counter, this->result));
1000 }
1001 }
1002
1003 emit(BRW_OPCODE_DO);
1004
1005 if (ir->to) {
1006 this->base_ir = ir->to;
1007 ir->to->accept(this);
1008
1009 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1010 brw_conditional_for_comparison(ir->cmp)));
1011
1012 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1013 inst->predicate = BRW_PREDICATE_NORMAL;
1014 }
1015
1016 visit_instructions(&ir->body_instructions);
1017
1018
1019 if (ir->increment) {
1020 this->base_ir = ir->increment;
1021 ir->increment->accept(this);
1022 emit(ADD(counter, src_reg(counter), this->result));
1023 }
1024
1025 emit(BRW_OPCODE_WHILE);
1026 }
1027
1028 void
1029 vec4_visitor::visit(ir_loop_jump *ir)
1030 {
1031 switch (ir->mode) {
1032 case ir_loop_jump::jump_break:
1033 emit(BRW_OPCODE_BREAK);
1034 break;
1035 case ir_loop_jump::jump_continue:
1036 emit(BRW_OPCODE_CONTINUE);
1037 break;
1038 }
1039 }
1040
1041
1042 void
1043 vec4_visitor::visit(ir_function_signature *ir)
1044 {
1045 assert(0);
1046 (void)ir;
1047 }
1048
1049 void
1050 vec4_visitor::visit(ir_function *ir)
1051 {
1052 /* Ignore function bodies other than main() -- we shouldn't see calls to
1053 * them since they should all be inlined.
1054 */
1055 if (strcmp(ir->name, "main") == 0) {
1056 const ir_function_signature *sig;
1057 exec_list empty;
1058
1059 sig = ir->matching_signature(&empty);
1060
1061 assert(sig);
1062
1063 visit_instructions(&sig->body);
1064 }
1065 }
1066
1067 bool
1068 vec4_visitor::try_emit_sat(ir_expression *ir)
1069 {
1070 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1071 if (!sat_src)
1072 return false;
1073
1074 sat_src->accept(this);
1075 src_reg src = this->result;
1076
1077 this->result = src_reg(this, ir->type);
1078 vec4_instruction *inst;
1079 inst = emit(MOV(dst_reg(this->result), src));
1080 inst->saturate = true;
1081
1082 return true;
1083 }
1084
1085 bool
1086 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1087 {
1088 /* 3-src instructions were introduced in gen6. */
1089 if (brw->gen < 6)
1090 return false;
1091
1092 /* MAD can only handle floating-point data. */
1093 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1094 return false;
1095
1096 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1097 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1098
1099 if (!mul || mul->operation != ir_binop_mul)
1100 return false;
1101
1102 nonmul->accept(this);
1103 src_reg src0 = fix_3src_operand(this->result);
1104
1105 mul->operands[0]->accept(this);
1106 src_reg src1 = fix_3src_operand(this->result);
1107
1108 mul->operands[1]->accept(this);
1109 src_reg src2 = fix_3src_operand(this->result);
1110
1111 this->result = src_reg(this, ir->type);
1112 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1113
1114 return true;
1115 }
1116
1117 void
1118 vec4_visitor::emit_bool_comparison(unsigned int op,
1119 dst_reg dst, src_reg src0, src_reg src1)
1120 {
1121 /* original gen4 does destination conversion before comparison. */
1122 if (brw->gen < 5)
1123 dst.type = src0.type;
1124
1125 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1126
1127 dst.type = BRW_REGISTER_TYPE_D;
1128 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1129 }
1130
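/* Emits a min/max: a SEL with the given conditional mod on gen6+, or a CMP
 * followed by a predicated SEL on older hardware. Callers pass
 * BRW_CONDITIONAL_L for min and BRW_CONDITIONAL_G for max.
 */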
1131 void
1132 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1133 src_reg src0, src_reg src1)
1134 {
1135 vec4_instruction *inst;
1136
1137 if (brw->gen >= 6) {
1138 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1139 inst->conditional_mod = conditionalmod;
1140 } else {
1141 emit(CMP(dst, src0, src1, conditionalmod));
1142
1143 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1144 inst->predicate = BRW_PREDICATE_NORMAL;
1145 }
1146 }
1147
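/* Returns true if the rvalue is an int/uint constant whose value fits in
 * 16 bits, so the single-MUL path can be used for integer multiplication.
 */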
1148 static bool
1149 is_16bit_constant(ir_rvalue *rvalue)
1150 {
1151 ir_constant *constant = rvalue->as_constant();
1152 if (!constant)
1153 return false;
1154
1155 if (constant->type != glsl_type::int_type &&
1156 constant->type != glsl_type::uint_type)
1157 return false;
1158
1159 return constant->value.u[0] < (1 << 16);
1160 }
1161
1162 void
1163 vec4_visitor::visit(ir_expression *ir)
1164 {
1165 unsigned int operand;
1166 src_reg op[Elements(ir->operands)];
1167 src_reg result_src;
1168 dst_reg result_dst;
1169 vec4_instruction *inst;
1170
1171 if (try_emit_sat(ir))
1172 return;
1173
1174 if (ir->operation == ir_binop_add) {
1175 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1176 return;
1177 }
1178
1179 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1180 this->result.file = BAD_FILE;
1181 ir->operands[operand]->accept(this);
1182 if (this->result.file == BAD_FILE) {
1183 printf("Failed to get tree for expression operand:\n");
1184 ir->operands[operand]->print();
1185 exit(1);
1186 }
1187 op[operand] = this->result;
1188
1189 /* Matrix expression operands should have been broken down to vector
1190 * operations already.
1191 */
1192 assert(!ir->operands[operand]->type->is_matrix());
1193 }
1194
1195 int vector_elements = ir->operands[0]->type->vector_elements;
1196 if (ir->operands[1]) {
1197 vector_elements = MAX2(vector_elements,
1198 ir->operands[1]->type->vector_elements);
1199 }
1200
1201 this->result.file = BAD_FILE;
1202
1203 /* Storage for our result. Ideally for an assignment we'd be using
1204 * the actual storage for the result here, instead.
1205 */
1206 result_src = src_reg(this, ir->type);
1207 /* convenience for the emit functions below. */
1208 result_dst = dst_reg(result_src);
1209 /* If nothing special happens, this is the result. */
1210 this->result = result_src;
1211 /* Limit writes to the channels that will be used by result_src later.
1212 * This does limit this temp's use as a temporary for multi-instruction
1213 * sequences.
1214 */
1215 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1216
1217 switch (ir->operation) {
1218 case ir_unop_logic_not:
1219 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1220 * one's complement of the whole register, not just bit 0.
1221 */
1222 emit(XOR(result_dst, op[0], src_reg(1)));
1223 break;
1224 case ir_unop_neg:
1225 op[0].negate = !op[0].negate;
1226 emit(MOV(result_dst, op[0]));
1227 break;
1228 case ir_unop_abs:
1229 op[0].abs = true;
1230 op[0].negate = false;
1231 emit(MOV(result_dst, op[0]));
1232 break;
1233
1234 case ir_unop_sign:
1235 emit(MOV(result_dst, src_reg(0.0f)));
1236
1237 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1238 inst = emit(MOV(result_dst, src_reg(1.0f)));
1239 inst->predicate = BRW_PREDICATE_NORMAL;
1240
1241 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1242 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1243 inst->predicate = BRW_PREDICATE_NORMAL;
1244
1245 break;
1246
1247 case ir_unop_rcp:
1248 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1249 break;
1250
1251 case ir_unop_exp2:
1252 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1253 break;
1254 case ir_unop_log2:
1255 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1256 break;
1257 case ir_unop_exp:
1258 case ir_unop_log:
1259 assert(!"not reached: should be handled by ir_explog_to_explog2");
1260 break;
1261 case ir_unop_sin:
1262 case ir_unop_sin_reduced:
1263 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1264 break;
1265 case ir_unop_cos:
1266 case ir_unop_cos_reduced:
1267 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1268 break;
1269
1270 case ir_unop_dFdx:
1271 case ir_unop_dFdy:
1272 assert(!"derivatives not valid in vertex shader");
1273 break;
1274
1275 case ir_unop_bitfield_reverse:
1276 emit(BFREV(result_dst, op[0]));
1277 break;
1278 case ir_unop_bit_count:
1279 emit(CBIT(result_dst, op[0]));
1280 break;
1281 case ir_unop_find_msb: {
1282 src_reg temp = src_reg(this, glsl_type::uint_type);
1283
1284 inst = emit(FBH(dst_reg(temp), op[0]));
1285 inst->dst.writemask = WRITEMASK_XYZW;
1286
1287 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1288 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1289 * subtract the result from 31 to convert the MSB count into an LSB count.
1290 */
1291
1292 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1293 temp.swizzle = BRW_SWIZZLE_NOOP;
1294 emit(MOV(result_dst, temp));
1295
1296 src_reg src_tmp = src_reg(result_dst);
1297 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1298
1299 src_tmp.negate = true;
1300 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1301 inst->predicate = BRW_PREDICATE_NORMAL;
1302 break;
1303 }
1304 case ir_unop_find_lsb:
1305 emit(FBL(result_dst, op[0]));
1306 break;
1307
1308 case ir_unop_noise:
1309 assert(!"not reached: should be handled by lower_noise");
1310 break;
1311
1312 case ir_binop_add:
1313 emit(ADD(result_dst, op[0], op[1]));
1314 break;
1315 case ir_binop_sub:
1316 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1317 break;
1318
1319 case ir_binop_mul:
1320 if (ir->type->is_integer()) {
1321 /* For integer multiplication, the MUL uses the low 16 bits of one of
1322 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1323 * accumulates the contribution of the upper 16 bits of that
1324 * operand. If we can determine that one of the args fits in the low
1325 * 16 bits, though, we can just emit a single MUL.
1326 */
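/* The single-MUL cases below put the 16-bit constant in src0 for gen < 7
 * and in src1 for gen7+, matching which operand the hardware reads as
 * 16 bits.
 */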
1327 if (is_16bit_constant(ir->operands[0])) {
1328 if (brw->gen < 7)
1329 emit(MUL(result_dst, op[0], op[1]));
1330 else
1331 emit(MUL(result_dst, op[1], op[0]));
1332 } else if (is_16bit_constant(ir->operands[1])) {
1333 if (brw->gen < 7)
1334 emit(MUL(result_dst, op[1], op[0]));
1335 else
1336 emit(MUL(result_dst, op[0], op[1]));
1337 } else {
1338 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1339
1340 emit(MUL(acc, op[0], op[1]));
1341 emit(MACH(dst_null_d(), op[0], op[1]));
1342 emit(MOV(result_dst, src_reg(acc)));
1343 }
1344 } else {
1345 emit(MUL(result_dst, op[0], op[1]));
1346 }
1347 break;
1348 case ir_binop_div:
1349 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1350 assert(ir->type->is_integer());
1351 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1352 break;
1353 case ir_binop_mod:
1354 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1355 assert(ir->type->is_integer());
1356 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1357 break;
1358
1359 case ir_binop_less:
1360 case ir_binop_greater:
1361 case ir_binop_lequal:
1362 case ir_binop_gequal:
1363 case ir_binop_equal:
1364 case ir_binop_nequal: {
1365 emit(CMP(result_dst, op[0], op[1],
1366 brw_conditional_for_comparison(ir->operation)));
1367 emit(AND(result_dst, result_src, src_reg(0x1)));
1368 break;
1369 }
1370
1371 case ir_binop_all_equal:
1372 /* "==" operator producing a scalar boolean. */
1373 if (ir->operands[0]->type->is_vector() ||
1374 ir->operands[1]->type->is_vector()) {
1375 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1376 emit(MOV(result_dst, src_reg(0)));
1377 inst = emit(MOV(result_dst, src_reg(1)));
1378 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1379 } else {
1380 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1381 emit(AND(result_dst, result_src, src_reg(0x1)));
1382 }
1383 break;
1384 case ir_binop_any_nequal:
1385 /* "!=" operator producing a scalar boolean. */
1386 if (ir->operands[0]->type->is_vector() ||
1387 ir->operands[1]->type->is_vector()) {
1388 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1389
1390 emit(MOV(result_dst, src_reg(0)));
1391 inst = emit(MOV(result_dst, src_reg(1)));
1392 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1393 } else {
1394 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1395 emit(AND(result_dst, result_src, src_reg(0x1)));
1396 }
1397 break;
1398
1399 case ir_unop_any:
1400 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1401 emit(MOV(result_dst, src_reg(0)));
1402
1403 inst = emit(MOV(result_dst, src_reg(1)));
1404 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1405 break;
1406
1407 case ir_binop_logic_xor:
1408 emit(XOR(result_dst, op[0], op[1]));
1409 break;
1410
1411 case ir_binop_logic_or:
1412 emit(OR(result_dst, op[0], op[1]));
1413 break;
1414
1415 case ir_binop_logic_and:
1416 emit(AND(result_dst, op[0], op[1]));
1417 break;
1418
1419 case ir_binop_dot:
1420 assert(ir->operands[0]->type->is_vector());
1421 assert(ir->operands[0]->type == ir->operands[1]->type);
1422 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1423 break;
1424
1425 case ir_unop_sqrt:
1426 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1427 break;
1428 case ir_unop_rsq:
1429 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1430 break;
1431
1432 case ir_unop_bitcast_i2f:
1433 case ir_unop_bitcast_u2f:
1434 this->result = op[0];
1435 this->result.type = BRW_REGISTER_TYPE_F;
1436 break;
1437
1438 case ir_unop_bitcast_f2i:
1439 this->result = op[0];
1440 this->result.type = BRW_REGISTER_TYPE_D;
1441 break;
1442
1443 case ir_unop_bitcast_f2u:
1444 this->result = op[0];
1445 this->result.type = BRW_REGISTER_TYPE_UD;
1446 break;
1447
1448 case ir_unop_i2f:
1449 case ir_unop_i2u:
1450 case ir_unop_u2i:
1451 case ir_unop_u2f:
1452 case ir_unop_b2f:
1453 case ir_unop_b2i:
1454 case ir_unop_f2i:
1455 case ir_unop_f2u:
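/* These conversions fall out of a plain MOV, since the source and
 * destination registers already carry different types.
 */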
1456 emit(MOV(result_dst, op[0]));
1457 break;
1458 case ir_unop_f2b:
1459 case ir_unop_i2b: {
1460 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1461 emit(AND(result_dst, result_src, src_reg(1)));
1462 break;
1463 }
1464
1465 case ir_unop_trunc:
1466 emit(RNDZ(result_dst, op[0]));
1467 break;
1468 case ir_unop_ceil:
1469 op[0].negate = !op[0].negate;
1470 inst = emit(RNDD(result_dst, op[0]));
1471 this->result.negate = true;
1472 break;
1473 case ir_unop_floor:
1474 inst = emit(RNDD(result_dst, op[0]));
1475 break;
1476 case ir_unop_fract:
1477 inst = emit(FRC(result_dst, op[0]));
1478 break;
1479 case ir_unop_round_even:
1480 emit(RNDE(result_dst, op[0]));
1481 break;
1482
1483 case ir_binop_min:
1484 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1485 break;
1486 case ir_binop_max:
1487 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1488 break;
1489
1490 case ir_binop_pow:
1491 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1492 break;
1493
1494 case ir_unop_bit_not:
1495 inst = emit(NOT(result_dst, op[0]));
1496 break;
1497 case ir_binop_bit_and:
1498 inst = emit(AND(result_dst, op[0], op[1]));
1499 break;
1500 case ir_binop_bit_xor:
1501 inst = emit(XOR(result_dst, op[0], op[1]));
1502 break;
1503 case ir_binop_bit_or:
1504 inst = emit(OR(result_dst, op[0], op[1]));
1505 break;
1506
1507 case ir_binop_lshift:
1508 inst = emit(SHL(result_dst, op[0], op[1]));
1509 break;
1510
1511 case ir_binop_rshift:
1512 if (ir->type->base_type == GLSL_TYPE_INT)
1513 inst = emit(ASR(result_dst, op[0], op[1]));
1514 else
1515 inst = emit(SHR(result_dst, op[0], op[1]));
1516 break;
1517
1518 case ir_binop_bfm:
1519 emit(BFI1(result_dst, op[0], op[1]));
1520 break;
1521
1522 case ir_binop_ubo_load: {
1523 ir_constant *uniform_block = ir->operands[0]->as_constant();
1524 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1525 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1526 src_reg offset = op[1];
1527
1528 /* Now, load the vector from that offset. */
1529 assert(ir->type->is_vector() || ir->type->is_scalar());
1530
1531 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1532 packed_consts.type = result.type;
1533 src_reg surf_index =
1534 src_reg(SURF_INDEX_VEC4_UBO(uniform_block->value.u[0]));
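/* The pull constant load fetches whole 16-byte (vec4) blocks, so the byte
 * offset is converted to a block index: divided by 16 when constant,
 * shifted right by 4 when dynamic.
 */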
1535 if (const_offset_ir) {
1536 offset = src_reg(const_offset / 16);
1537 } else {
1538 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1539 }
1540
1541 vec4_instruction *pull =
1542 emit(new(mem_ctx) vec4_instruction(this,
1543 VS_OPCODE_PULL_CONSTANT_LOAD,
1544 dst_reg(packed_consts),
1545 surf_index,
1546 offset));
1547 pull->base_mrf = 14;
1548 pull->mlen = 1;
1549
1550 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1551 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1552 const_offset % 16 / 4,
1553 const_offset % 16 / 4,
1554 const_offset % 16 / 4);
1555
1556 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1557 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1558 emit(CMP(result_dst, packed_consts, src_reg(0u),
1559 BRW_CONDITIONAL_NZ));
1560 emit(AND(result_dst, result, src_reg(0x1)));
1561 } else {
1562 emit(MOV(result_dst, packed_consts));
1563 }
1564 break;
1565 }
1566
1567 case ir_binop_vector_extract:
1568 assert(!"should have been lowered by vec_index_to_cond_assign");
1569 break;
1570
1571 case ir_triop_fma:
1572 op[0] = fix_3src_operand(op[0]);
1573 op[1] = fix_3src_operand(op[1]);
1574 op[2] = fix_3src_operand(op[2]);
1575 /* Note that the instruction's argument order is reversed from GLSL
1576 * and the IR.
1577 */
1578 emit(MAD(result_dst, op[2], op[1], op[0]));
1579 break;
1580
1581 case ir_triop_lrp:
1582 op[0] = fix_3src_operand(op[0]);
1583 op[1] = fix_3src_operand(op[1]);
1584 op[2] = fix_3src_operand(op[2]);
1585 /* Note that the instruction's argument order is reversed from GLSL
1586 * and the IR.
1587 */
1588 emit(LRP(result_dst, op[2], op[1], op[0]));
1589 break;
1590
1591 case ir_triop_bfi:
1592 op[0] = fix_3src_operand(op[0]);
1593 op[1] = fix_3src_operand(op[1]);
1594 op[2] = fix_3src_operand(op[2]);
1595 emit(BFI2(result_dst, op[0], op[1], op[2]));
1596 break;
1597
1598 case ir_triop_bitfield_extract:
1599 op[0] = fix_3src_operand(op[0]);
1600 op[1] = fix_3src_operand(op[1]);
1601 op[2] = fix_3src_operand(op[2]);
1602 /* Note that the instruction's argument order is reversed from GLSL
1603 * and the IR.
1604 */
1605 emit(BFE(result_dst, op[2], op[1], op[0]));
1606 break;
1607
1608 case ir_triop_vector_insert:
1609 assert(!"should have been lowered by lower_vector_insert");
1610 break;
1611
1612 case ir_quadop_bitfield_insert:
1613 assert(!"not reached: should be handled by "
1614 "bitfield_insert_to_bfm_bfi\n");
1615 break;
1616
1617 case ir_quadop_vector:
1618 assert(!"not reached: should be handled by lower_quadop_vector");
1619 break;
1620
1621 case ir_unop_pack_half_2x16:
1622 emit_pack_half_2x16(result_dst, op[0]);
1623 break;
1624 case ir_unop_unpack_half_2x16:
1625 emit_unpack_half_2x16(result_dst, op[0]);
1626 break;
1627 case ir_unop_pack_snorm_2x16:
1628 case ir_unop_pack_snorm_4x8:
1629 case ir_unop_pack_unorm_2x16:
1630 case ir_unop_pack_unorm_4x8:
1631 case ir_unop_unpack_snorm_2x16:
1632 case ir_unop_unpack_snorm_4x8:
1633 case ir_unop_unpack_unorm_2x16:
1634 case ir_unop_unpack_unorm_4x8:
1635 assert(!"not reached: should be handled by lower_packing_builtins");
1636 break;
1637 case ir_unop_unpack_half_2x16_split_x:
1638 case ir_unop_unpack_half_2x16_split_y:
1639 case ir_binop_pack_half_2x16_split:
1640 assert(!"not reached: should not occur in vertex shader");
1641 break;
1642 }
1643 }
1644
1645
1646 void
1647 vec4_visitor::visit(ir_swizzle *ir)
1648 {
1649 src_reg src;
1650 int i = 0;
1651 int swizzle[4];
1652
1653 /* Note that this is only swizzles in expressions, not those on the left
1654 * hand side of an assignment, which do write masking. See ir_assignment
1655 * for that.
1656 */
1657
1658 ir->val->accept(this);
1659 src = this->result;
1660 assert(src.file != BAD_FILE);
1661
1662 for (i = 0; i < ir->type->vector_elements; i++) {
1663 switch (i) {
1664 case 0:
1665 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1666 break;
1667 case 1:
1668 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1669 break;
1670 case 2:
1671 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1672 break;
1673 case 3:
1674 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1675 break;
1676 }
1677 }
1678 for (; i < 4; i++) {
1679 /* Replicate the last channel out. */
1680 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1681 }
1682
1683 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1684
1685 this->result = src;
1686 }
1687
1688 void
1689 vec4_visitor::visit(ir_dereference_variable *ir)
1690 {
1691 const struct glsl_type *type = ir->type;
1692 dst_reg *reg = variable_storage(ir->var);
1693
1694 if (!reg) {
1695 fail("Failed to find variable storage for %s\n", ir->var->name);
1696 this->result = src_reg(brw_null_reg());
1697 return;
1698 }
1699
1700 this->result = src_reg(*reg);
1701
1702 /* System values get their swizzle from the dst_reg writemask */
1703 if (ir->var->mode == ir_var_system_value)
1704 return;
1705
1706 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1707 this->result.swizzle = swizzle_for_size(type->vector_elements);
1708 }
1709
1710
1711 int
1712 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1713 {
1714 /* Under normal circumstances array elements are stored consecutively, so
1715 * the stride is equal to the size of the array element.
1716 */
1717 return type_size(ir->type);
1718 }
1719
1720
1721 void
1722 vec4_visitor::visit(ir_dereference_array *ir)
1723 {
1724 ir_constant *constant_index;
1725 src_reg src;
1726 int array_stride = compute_array_stride(ir);
1727
1728 constant_index = ir->array_index->constant_expression_value();
1729
1730 ir->array->accept(this);
1731 src = this->result;
1732
1733 if (constant_index) {
1734 src.reg_offset += constant_index->value.i[0] * array_stride;
1735 } else {
1736 /* Variable index array dereference. It eats the "vec4" of the
1737 * base of the array and an index that offsets the Mesa register
1738 * index.
1739 */
1740 ir->array_index->accept(this);
1741
1742 src_reg index_reg;
1743
1744 if (array_stride == 1) {
1745 index_reg = this->result;
1746 } else {
1747 index_reg = src_reg(this, glsl_type::int_type);
1748
1749 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1750 }
1751
1752 if (src.reladdr) {
1753 src_reg temp = src_reg(this, glsl_type::int_type);
1754
1755 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1756
1757 index_reg = temp;
1758 }
1759
1760 src.reladdr = ralloc(mem_ctx, src_reg);
1761 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1762 }
1763
1764 /* If the type is smaller than a vec4, replicate the last channel out. */
1765 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1766 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1767 else
1768 src.swizzle = BRW_SWIZZLE_NOOP;
1769 src.type = brw_type_for_base_type(ir->type);
1770
1771 this->result = src;
1772 }
1773
1774 void
1775 vec4_visitor::visit(ir_dereference_record *ir)
1776 {
1777 unsigned int i;
1778 const glsl_type *struct_type = ir->record->type;
1779 int offset = 0;
1780
1781 ir->record->accept(this);
1782
1783 for (i = 0; i < struct_type->length; i++) {
1784 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1785 break;
1786 offset += type_size(struct_type->fields.structure[i].type);
1787 }
1788
1789 /* If the type is smaller than a vec4, replicate the last channel out. */
1790 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1791 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1792 else
1793 this->result.swizzle = BRW_SWIZZLE_NOOP;
1794 this->result.type = brw_type_for_base_type(ir->type);
1795
1796 this->result.reg_offset += offset;
1797 }
1798
1799 /**
1800 * We want to be careful in assignment setup to hit the actual storage
1801 * instead of potentially using a temporary like we might with the
1802 * ir_dereference handler.
1803 */
1804 static dst_reg
1805 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1806 {
1807 /* The LHS must be a dereference. If the LHS is a variable indexed array
1808 * access of a vector, it must be separated into a series of conditional moves
1809 * before reaching this point (see ir_vec_index_to_cond_assign).
1810 */
1811 assert(ir->as_dereference());
1812 ir_dereference_array *deref_array = ir->as_dereference_array();
1813 if (deref_array) {
1814 assert(!deref_array->array->type->is_vector());
1815 }
1816
1817 /* Use the rvalue deref handler for the most part. We'll ignore
1818 * swizzles in it and write swizzles using writemask, though.
1819 */
1820 ir->accept(v);
1821 return dst_reg(v->result);
1822 }
1823
1824 void
1825 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1826 const struct glsl_type *type, uint32_t predicate)
1827 {
1828 if (type->base_type == GLSL_TYPE_STRUCT) {
1829 for (unsigned int i = 0; i < type->length; i++) {
1830 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1831 }
1832 return;
1833 }
1834
1835 if (type->is_array()) {
1836 for (unsigned int i = 0; i < type->length; i++) {
1837 emit_block_move(dst, src, type->fields.array, predicate);
1838 }
1839 return;
1840 }
1841
1842 if (type->is_matrix()) {
1843 const struct glsl_type *vec_type;
1844
1845 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1846 type->vector_elements, 1);
1847
1848 for (int i = 0; i < type->matrix_columns; i++) {
1849 emit_block_move(dst, src, vec_type, predicate);
1850 }
1851 return;
1852 }
1853
1854 assert(type->is_scalar() || type->is_vector());
1855
1856 dst->type = brw_type_for_base_type(type);
1857 src->type = dst->type;
1858
1859 dst->writemask = (1 << type->vector_elements) - 1;
1860
1861 src->swizzle = swizzle_for_size(type->vector_elements);
1862
1863 vec4_instruction *inst = emit(MOV(*dst, *src));
1864 inst->predicate = predicate;
1865
1866 dst->reg_offset++;
1867 src->reg_offset++;
1868 }
1869
1870
1871 /* If the RHS processing resulted in an instruction generating a
1872 * temporary value, and it would be easy to rewrite the instruction to
1873 * generate its result right into the LHS instead, do so. This ends
1874 * up reliably removing instructions where it can be tricky to do so
1875 * later without real UD chain information.
1876 */
1877 bool
1878 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1879 dst_reg dst,
1880 src_reg src,
1881 vec4_instruction *pre_rhs_inst,
1882 vec4_instruction *last_rhs_inst)
1883 {
1884 /* This could be supported, but it would take more smarts. */
1885 if (ir->condition)
1886 return false;
1887
1888 if (pre_rhs_inst == last_rhs_inst)
1889 return false; /* No instructions generated to work with. */
1890
1891 /* Make sure the last instruction generated our source reg. */
1892 if (src.file != GRF ||
1893 src.file != last_rhs_inst->dst.file ||
1894 src.reg != last_rhs_inst->dst.reg ||
1895 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1896 src.reladdr ||
1897 src.abs ||
1898 src.negate ||
1899 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1900 return false;
1901
1902 /* Check that the last instruction fully initialized the channels
1903 * we want to use, in the order we want to use them. We could
1904 * potentially reswizzle the operands of many instructions so that
1905 * we could handle out of order channels, but don't yet.
1906 */
1907
1908 for (unsigned i = 0; i < 4; i++) {
1909 if (dst.writemask & (1 << i)) {
1910 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1911 return false;
1912
1913 if (BRW_GET_SWZ(src.swizzle, i) != i)
1914 return false;
1915 }
1916 }
1917
1918 /* Success! Rewrite the instruction. */
1919 last_rhs_inst->dst.file = dst.file;
1920 last_rhs_inst->dst.reg = dst.reg;
1921 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1922 last_rhs_inst->dst.reladdr = dst.reladdr;
1923 last_rhs_inst->dst.writemask &= dst.writemask;
1924
1925 return true;
1926 }
1927
1928 void
1929 vec4_visitor::visit(ir_assignment *ir)
1930 {
1931 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1932 uint32_t predicate = BRW_PREDICATE_NONE;
1933
1934 if (!ir->lhs->type->is_scalar() &&
1935 !ir->lhs->type->is_vector()) {
1936 ir->rhs->accept(this);
1937 src_reg src = this->result;
1938
1939 if (ir->condition) {
1940 emit_bool_to_cond_code(ir->condition, &predicate);
1941 }
1942
1943 /* emit_block_move doesn't account for swizzles in the source register.
1944 * This should be ok, since the source register is a structure or an
1945 * array, and those can't be swizzled. But double-check to be sure.
1946 */
1947 assert(src.swizzle ==
1948 (ir->rhs->type->is_matrix()
1949 ? swizzle_for_size(ir->rhs->type->vector_elements)
1950 : BRW_SWIZZLE_NOOP));
1951
1952 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1953 return;
1954 }
1955
1956 /* Now we're down to just a scalar/vector with writemasks. */
1957 int i;
1958
1959 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1960 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1961
1962 ir->rhs->accept(this);
1963
1964 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1965
1966 src_reg src = this->result;
1967
1968 int swizzles[4];
1969 int first_enabled_chan = 0;
1970 int src_chan = 0;
1971
1972 assert(ir->lhs->type->is_vector() ||
1973 ir->lhs->type->is_scalar());
1974 dst.writemask = ir->write_mask;
1975
1976 for (int i = 0; i < 4; i++) {
1977 if (dst.writemask & (1 << i)) {
1978 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1979 break;
1980 }
1981 }
1982
1983 /* Swizzle a small RHS vector into the channels being written.
1984 *
1985 * glsl ir treats write_mask as dictating how many channels are
1986 * present on the RHS while in our instructions we need to make
1987 * those channels appear in the slots of the vec4 they're written to.
1988 */
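/* Worked example (a sketch, not emitted code): writing a two-component RHS
 * to lhs.yz, with the RHS carrying the compact .xyyy swizzle that
 * swizzle_for_size() gives a vec2, yields swizzles[] = { y, x, y, y }, i.e.
 * a final .yxyy swizzle: RHS channel x feeds dst.y, channel y feeds dst.z,
 * and the unwritten slots simply repeat the first enabled channel.
 */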
1989 for (int i = 0; i < 4; i++) {
1990 if (dst.writemask & (1 << i))
1991 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1992 else
1993 swizzles[i] = first_enabled_chan;
1994 }
1995 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1996 swizzles[2], swizzles[3]);
1997
1998 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1999 return;
2000 }
2001
2002 if (ir->condition) {
2003 emit_bool_to_cond_code(ir->condition, &predicate);
2004 }
2005
2006 for (i = 0; i < type_size(ir->lhs->type); i++) {
2007 vec4_instruction *inst = emit(MOV(dst, src));
2008 inst->predicate = predicate;
2009
2010 dst.reg_offset++;
2011 src.reg_offset++;
2012 }
2013 }
2014
2015 void
2016 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2017 {
2018 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2019 foreach_list(node, &ir->components) {
2020 ir_constant *field_value = (ir_constant *)node;
2021
2022 emit_constant_values(dst, field_value);
2023 }
2024 return;
2025 }
2026
2027 if (ir->type->is_array()) {
2028 for (unsigned int i = 0; i < ir->type->length; i++) {
2029 emit_constant_values(dst, ir->array_elements[i]);
2030 }
2031 return;
2032 }
2033
2034 if (ir->type->is_matrix()) {
2035 for (int i = 0; i < ir->type->matrix_columns; i++) {
2036 float *vec = &ir->value.f[i * ir->type->vector_elements];
2037
2038 for (int j = 0; j < ir->type->vector_elements; j++) {
2039 dst->writemask = 1 << j;
2040 dst->type = BRW_REGISTER_TYPE_F;
2041
2042 emit(MOV(*dst, src_reg(vec[j])));
2043 }
2044 dst->reg_offset++;
2045 }
2046 return;
2047 }
2048
2049 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2050
2051 for (int i = 0; i < ir->type->vector_elements; i++) {
2052 if (!(remaining_writemask & (1 << i)))
2053 continue;
2054
2055 dst->writemask = 1 << i;
2056 dst->type = brw_type_for_base_type(ir->type);
2057
2058 /* Find other components that match the one we're about to
2059 * write. Emits fewer instructions for things like vec4(0.5,
2060 * 1.5, 1.5, 1.5).
2061 */
2062 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2063 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2064 if (ir->value.b[i] == ir->value.b[j])
2065 dst->writemask |= (1 << j);
2066 } else {
2067 /* u, i, and f storage all line up, so no need for a
2068 * switch case for comparing each type.
2069 */
2070 if (ir->value.u[i] == ir->value.u[j])
2071 dst->writemask |= (1 << j);
2072 }
2073 }
2074
2075 switch (ir->type->base_type) {
2076 case GLSL_TYPE_FLOAT:
2077 emit(MOV(*dst, src_reg(ir->value.f[i])));
2078 break;
2079 case GLSL_TYPE_INT:
2080 emit(MOV(*dst, src_reg(ir->value.i[i])));
2081 break;
2082 case GLSL_TYPE_UINT:
2083 emit(MOV(*dst, src_reg(ir->value.u[i])));
2084 break;
2085 case GLSL_TYPE_BOOL:
2086 emit(MOV(*dst, src_reg(ir->value.b[i])));
2087 break;
2088 default:
2089 assert(!"Non-float/uint/int/bool constant");
2090 break;
2091 }
2092
2093 remaining_writemask &= ~dst->writemask;
2094 }
2095 dst->reg_offset++;
2096 }
2097
2098 void
2099 vec4_visitor::visit(ir_constant *ir)
2100 {
2101 dst_reg dst = dst_reg(this, ir->type);
2102 this->result = src_reg(dst);
2103
2104 emit_constant_values(&dst, ir);
2105 }
2106
2107 void
2108 vec4_visitor::visit(ir_call *ir)
2109 {
2110 assert(!"not reached");
2111 }
2112
2113 void
2114 vec4_visitor::visit(ir_texture *ir)
2115 {
2116 int sampler =
2117 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2118
2119 /* Should be lowered by do_lower_texture_projection */
2120 assert(!ir->projector);
2121
2122 /* Generate code to compute all the subexpression trees. This has to be
2123 * done before loading any values into MRFs for the sampler message since
2124 * generating these values may involve SEND messages that need the MRFs.
2125 */
2126 src_reg coordinate;
2127 if (ir->coordinate) {
2128 ir->coordinate->accept(this);
2129 coordinate = this->result;
2130 }
2131
2132 src_reg shadow_comparitor;
2133 if (ir->shadow_comparitor) {
2134 ir->shadow_comparitor->accept(this);
2135 shadow_comparitor = this->result;
2136 }
2137
2138 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2139 src_reg lod, dPdx, dPdy, sample_index;
2140 switch (ir->op) {
2141 case ir_tex:
2142 lod = src_reg(0.0f);
2143 lod_type = glsl_type::float_type;
2144 break;
2145 case ir_txf:
2146 case ir_txl:
2147 case ir_txs:
2148 ir->lod_info.lod->accept(this);
2149 lod = this->result;
2150 lod_type = ir->lod_info.lod->type;
2151 break;
2152 case ir_txf_ms:
2153 ir->lod_info.sample_index->accept(this);
2154 sample_index = this->result;
2155 sample_index_type = ir->lod_info.sample_index->type;
2156 break;
2157 case ir_txd:
2158 ir->lod_info.grad.dPdx->accept(this);
2159 dPdx = this->result;
2160
2161 ir->lod_info.grad.dPdy->accept(this);
2162 dPdy = this->result;
2163
2164 lod_type = ir->lod_info.grad.dPdx->type;
2165 break;
2166 case ir_txb:
2167 case ir_lod:
2168 break;
2169 }
2170
2171 vec4_instruction *inst = NULL;
2172 switch (ir->op) {
2173 case ir_tex:
2174 case ir_txl:
2175 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2176 break;
2177 case ir_txd:
2178 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2179 break;
2180 case ir_txf:
2181 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2182 break;
2183 case ir_txf_ms:
2184 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2185 break;
2186 case ir_txs:
2187 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2188 break;
2189 case ir_txb:
2190 assert(!"TXB is not valid for vertex shaders.");
2191 break;
2192 case ir_lod:
2193 assert(!"LOD is not valid for vertex shaders.");
2194 break;
2195 }
2196
2197 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2198
2199 /* Texel offsets go in the message header; Gen4 also requires headers. */
2200 inst->header_present = use_texture_offset || brw->gen < 5;
2201 inst->base_mrf = 2;
2202 inst->mlen = inst->header_present + 1; /* always at least one */
2203 inst->sampler = sampler;
2204 inst->dst = dst_reg(this, ir->type);
2205 inst->dst.writemask = WRITEMASK_XYZW;
2206 inst->shadow_compare = ir->shadow_comparitor != NULL;
2207
2208 if (use_texture_offset)
2209 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2210
2211 /* MRF for the first parameter */
2212 int param_base = inst->base_mrf + inst->header_present;
2213
2214 if (ir->op == ir_txs) {
2215 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2216 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2217 } else {
2218 int i, coord_mask = 0, zero_mask = 0;
2219 /* Load the coordinate */
2220 /* FINISHME: gl_clamp_mask and saturate */
2221 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2222 coord_mask |= (1 << i);
2223 for (; i < 4; i++)
2224 zero_mask |= (1 << i);
2225
2226 if (ir->offset && ir->op == ir_txf) {
2227 /* It appears that the ld instruction used for txf does its
2228 * address bounds check before adding in the offset. To work
2229 * around this, just add the integer offset to the integer
2230 * texel coordinate, and don't put the offset in the header.
2231 */
2232 ir_constant *offset = ir->offset->as_constant();
2233 assert(offset);
2234
2235 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2236 src_reg src = coordinate;
2237 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2238 BRW_GET_SWZ(src.swizzle, j),
2239 BRW_GET_SWZ(src.swizzle, j),
2240 BRW_GET_SWZ(src.swizzle, j));
2241 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2242 src, offset->value.i[j]));
2243 }
2244 } else {
2245 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2246 coordinate));
2247 }
2248 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2249 src_reg(0)));
2250 /* Load the shadow comparitor */
2251 if (ir->shadow_comparitor && ir->op != ir_txd) {
2252 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2253 WRITEMASK_X),
2254 shadow_comparitor));
2255 inst->mlen++;
2256 }
2257
2258 /* Load the LOD info */
2259 if (ir->op == ir_tex || ir->op == ir_txl) {
2260 int mrf, writemask;
2261 if (brw->gen >= 5) {
2262 mrf = param_base + 1;
2263 if (ir->shadow_comparitor) {
2264 writemask = WRITEMASK_Y;
2265 /* mlen already incremented */
2266 } else {
2267 writemask = WRITEMASK_X;
2268 inst->mlen++;
2269 }
2270 } else /* brw->gen == 4 */ {
2271 mrf = param_base;
2272 writemask = WRITEMASK_W;
2273 }
2274 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2275 } else if (ir->op == ir_txf) {
2276 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2277 } else if (ir->op == ir_txf_ms) {
2278 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2279 sample_index));
2280 inst->mlen++;
2281
2282 /* On Gen7, there is an additional MCS parameter here after SI,
2283 * but we don't bother to emit it since it's always zero. If
2284 * we start supporting texturing from CMS surfaces, this will have
2285 * to change.
2286 */
2287 } else if (ir->op == ir_txd) {
2288 const glsl_type *type = lod_type;
2289
2290 if (brw->gen >= 5) {
2291 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2292 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2293 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2294 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2295 inst->mlen++;
2296
2297 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2298 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2299 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2300 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2301 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2302 inst->mlen++;
2303
2304 if (ir->shadow_comparitor) {
2305 emit(MOV(dst_reg(MRF, param_base + 2,
2306 ir->shadow_comparitor->type, WRITEMASK_Z),
2307 shadow_comparitor));
2308 }
2309 }
2310 } else /* brw->gen == 4 */ {
2311 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2312 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2313 inst->mlen += 2;
2314 }
2315 }
2316 }
2317
2318 emit(inst);
2319
2320 /* Fix up the number of layers (z) for cube arrays: the hardware returns
2321 * faces * layers, but the spec requires just layers.
2322 */
2323 if (ir->op == ir_txs) {
2324 glsl_type const *type = ir->sampler->type;
2325 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2326 type->sampler_array) {
2327 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2328 with_writemask(inst->dst, WRITEMASK_Z),
2329 src_reg(inst->dst), src_reg(6));
2330 }
2331 }
2332
2333 swizzle_result(ir, src_reg(inst->dst), sampler);
2334 }
2335
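/**
 * Apply the texture swizzle baked into the program key (e.g. GL_TEXTURE_SWIZZLE
 * state) to the raw sampler result: channels sourced from the texture are
 * MOVed with the appropriate swizzle, while channels forced to SWIZZLE_ZERO or
 * SWIZZLE_ONE are written as immediate 0.0f or 1.0f.
 */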
2336 void
2337 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2338 {
2339 int s = key->tex.swizzles[sampler];
2340
2341 this->result = src_reg(this, ir->type);
2342 dst_reg swizzled_result(this->result);
2343
2344 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2345 || s == SWIZZLE_NOOP) {
2346 emit(MOV(swizzled_result, orig_val));
2347 return;
2348 }
2349
2350 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2351 int swizzle[4] = {0};
2352
2353 for (int i = 0; i < 4; i++) {
2354 switch (GET_SWZ(s, i)) {
2355 case SWIZZLE_ZERO:
2356 zero_mask |= (1 << i);
2357 break;
2358 case SWIZZLE_ONE:
2359 one_mask |= (1 << i);
2360 break;
2361 default:
2362 copy_mask |= (1 << i);
2363 swizzle[i] = GET_SWZ(s, i);
2364 break;
2365 }
2366 }
2367
2368 if (copy_mask) {
2369 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2370 swizzled_result.writemask = copy_mask;
2371 emit(MOV(swizzled_result, orig_val));
2372 }
2373
2374 if (zero_mask) {
2375 swizzled_result.writemask = zero_mask;
2376 emit(MOV(swizzled_result, src_reg(0.0f)));
2377 }
2378
2379 if (one_mask) {
2380 swizzled_result.writemask = one_mask;
2381 emit(MOV(swizzled_result, src_reg(1.0f)));
2382 }
2383 }
2384
2385 void
2386 vec4_visitor::visit(ir_return *ir)
2387 {
2388 assert(!"not reached");
2389 }
2390
2391 void
2392 vec4_visitor::visit(ir_discard *ir)
2393 {
2394 assert(!"not reached");
2395 }
2396
2397 void
2398 vec4_visitor::visit(ir_if *ir)
2399 {
2400 /* Don't point the annotation at the if statement, because then it plus
2401 * the then and else blocks get printed.
2402 */
2403 this->base_ir = ir->condition;
2404
2405 if (brw->gen == 6) {
2406 emit_if_gen6(ir);
2407 } else {
2408 uint32_t predicate;
2409 emit_bool_to_cond_code(ir->condition, &predicate);
2410 emit(IF(predicate));
2411 }
2412
2413 visit_instructions(&ir->then_instructions);
2414
2415 if (!ir->else_instructions.is_empty()) {
2416 this->base_ir = ir->condition;
2417 emit(BRW_OPCODE_ELSE);
2418
2419 visit_instructions(&ir->else_instructions);
2420 }
2421
2422 this->base_ir = ir->condition;
2423 emit(BRW_OPCODE_ENDIF);
2424 }
2425
2426 void
2427 vec4_visitor::visit(ir_emit_vertex *)
2428 {
2429 assert(!"not reached");
2430 }
2431
2432 void
2433 vec4_visitor::visit(ir_end_primitive *)
2434 {
2435 assert(!"not reached");
2436 }
2437
2438 void
2439 vec4_visitor::emit_ndc_computation()
2440 {
2441 /* Get the position */
2442 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2443
2444 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2445 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2446 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2447
2448 current_annotation = "NDC";
2449 dst_reg ndc_w = ndc;
2450 ndc_w.writemask = WRITEMASK_W;
2451 src_reg pos_w = pos;
2452 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2453 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2454
2455 dst_reg ndc_xyz = ndc;
2456 ndc_xyz.writemask = WRITEMASK_XYZ;
2457
2458 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2459 }
2460
2461 void
2462 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2463 {
2464 if (brw->gen < 6 &&
2465 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2466 key->userclip_active || brw->has_negative_rhw_bug)) {
2467 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2468 dst_reg header1_w = header1;
2469 header1_w.writemask = WRITEMASK_W;
2470
2471 emit(MOV(header1, 0u));
2472
2473 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2474 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2475
2476 current_annotation = "Point size";
2477 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2478 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2479 }
2480
2481 if (key->userclip_active) {
2482 current_annotation = "Clipping flags";
2483 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2484 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2485
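/* Each CMP below sets a per-channel flag bit for the clip distances that
 * are negative; VS_OPCODE_UNPACK_FLAGS_SIMD4X2 moves those flag bits into
 * a GRF so they can be ORed into the header, with the second group of four
 * shifted up by 4 (presumably matching the layout of the clip-flag field).
 */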
2486 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2487 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2488 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2489
2490 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2491 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2492 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2493 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2494 }
2495
2496 /* i965 clipping workaround:
2497 * 1) Test for negative RHW (1/w)
2498 * 2) If set,
2499 * set ndc = (0,0,0,0)
2500 * set ucp[6] = 1
2501 *
2502 * Later, clipping will detect ucp[6] and ensure the primitive is
2503 * clipped against all fixed planes.
2504 */
2505 if (brw->has_negative_rhw_bug) {
2506 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2507 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2508 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2509 vec4_instruction *inst;
2510 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2511 inst->predicate = BRW_PREDICATE_NORMAL;
2512 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2513 inst->predicate = BRW_PREDICATE_NORMAL;
2514 }
2515
2516 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2517 } else if (brw->gen < 6) {
2518 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2519 } else {
2520 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2521 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2522 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2523 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2524 }
2525 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2526 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2527 src_reg(output_reg[VARYING_SLOT_LAYER])));
2528 }
2529 }
2530 }
2531
2532 void
2533 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2534 {
2535 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2536 *
2537 * "If a linked set of shaders forming the vertex stage contains no
2538 * static write to gl_ClipVertex or gl_ClipDistance, but the
2539 * application has requested clipping against user clip planes through
2540 * the API, then the coordinate written to gl_Position is used for
2541 * comparison against the user clip planes."
2542 *
2543 * This function is only called if the shader didn't write to
2544 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2545 * if the user wrote to it; otherwise we use gl_Position.
2546 */
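/* Each iteration below computes one clip distance as a DP4 of the chosen
 * position against one user clip plane and writes it to a single channel of
 * 'reg'; 'offset' selects whether planes 0-3 or 4-7 are handled.
 */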
2547 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2548 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2549 clip_vertex = VARYING_SLOT_POS;
2550 }
2551
2552 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2553 ++i) {
2554 reg.writemask = 1 << i;
2555 emit(DP4(reg,
2556 src_reg(output_reg[clip_vertex]),
2557 src_reg(this->userplane[i + offset])));
2558 }
2559 }
2560
2561 void
2562 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2563 {
2564 assert (varying < VARYING_SLOT_MAX);
2565 reg.type = output_reg[varying].type;
2566 current_annotation = output_reg_annotation[varying];
2567 /* Copy the register, saturating if necessary */
2568 vec4_instruction *inst = emit(MOV(reg,
2569 src_reg(output_reg[varying])));
2570 if ((varying == VARYING_SLOT_COL0 ||
2571 varying == VARYING_SLOT_COL1 ||
2572 varying == VARYING_SLOT_BFC0 ||
2573 varying == VARYING_SLOT_BFC1) &&
2574 key->clamp_vertex_color) {
2575 inst->saturate = true;
2576 }
2577 }
2578
2579 void
2580 vec4_visitor::emit_urb_slot(int mrf, int varying)
2581 {
2582 struct brw_reg hw_reg = brw_message_reg(mrf);
2583 dst_reg reg = dst_reg(MRF, mrf);
2584 reg.type = BRW_REGISTER_TYPE_F;
2585
2586 switch (varying) {
2587 case VARYING_SLOT_PSIZ:
2588 /* PSIZ is always in slot 0, and is coupled with other flags. */
2589 current_annotation = "indices, point width, clip flags";
2590 emit_psiz_and_flags(hw_reg);
2591 break;
2592 case BRW_VARYING_SLOT_NDC:
2593 current_annotation = "NDC";
2594 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2595 break;
2596 case VARYING_SLOT_POS:
2597 current_annotation = "gl_Position";
2598 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2599 break;
2600 case VARYING_SLOT_EDGE:
2601 /* This is present when doing unfilled polygons. We're supposed to copy
2602 * the edge flag from the user-provided vertex array
2603 * (glEdgeFlagPointer); otherwise we copy from the current value
2604 * of that attribute (which starts as 1.0f). This is then used in clipping to
2605 * determine which edges should be drawn as wireframe.
2606 */
2607 current_annotation = "edge flag";
2608 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2609 glsl_type::float_type, WRITEMASK_XYZW))));
2610 break;
2611 case BRW_VARYING_SLOT_PAD:
2612 /* No need to write to this slot */
2613 break;
2614 default:
2615 emit_generic_urb_slot(reg, varying);
2616 break;
2617 }
2618 }
2619
2620 static int
2621 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2622 {
2623 if (brw->gen >= 6) {
2624 /* URB data written (does not include the message header reg) must
2625 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2626 * section 5.4.3.2.2: URB_INTERLEAVED.
2627 *
2628 * URB entries are allocated on a multiple of 1024 bits, so an
2629 * extra 128 bits written here to make the end align to 256 is
2630 * no problem.
2631 */
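/* Note that mlen as passed in counts the message header register too, so
 * the data portion is mlen - 1; it is a multiple of two exactly when mlen
 * is odd, hence the check below.
 */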
2632 if ((mlen % 2) != 1)
2633 mlen++;
2634 }
2635
2636 return mlen;
2637 }
2638
2639
2640 /**
2641 * Generates the VUE payload plus the necessary URB write instructions to
2642 * output it.
2643 *
2644 * The VUE layout is documented in Volume 2a.
2645 */
2646 void
2647 vec4_visitor::emit_vertex()
2648 {
2649 /* MRF 0 is reserved for the debugger, so start with message header
2650 * in MRF 1.
2651 */
2652 int base_mrf = 1;
2653 int mrf = base_mrf;
2654 /* In the process of generating our URB write message contents, we
2655 * may need to unspill a register or load from an array. Those
2656 * reads would use MRFs 14-15.
2657 */
2658 int max_usable_mrf = 13;
2659
2660 /* The following assertion verifies that max_usable_mrf results in an
2661 * even number of URB write data registers, which meets gen6's
2662 * requirements for length alignment.
2663 */
2664 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2665
2666 /* First mrf is the g0-based message header containing URB handles and
2667 * such.
2668 */
2669 emit_urb_write_header(mrf++);
2670
2671 if (brw->gen < 6) {
2672 emit_ndc_computation();
2673 }
2674
2675 /* Lower legacy ff and ClipVertex clipping to clip distances */
2676 if (key->userclip_active && !key->uses_clip_distance) {
2677 current_annotation = "user clip distances";
2678
2679 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2680 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2681
2682 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2683 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2684 }
2685
2686 /* Set up the VUE data for the first URB write */
2687 int slot;
2688 for (slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
2689 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2690
2691 /* If this was max_usable_mrf, we can't fit anything more into this URB
2692 * WRITE.
2693 */
2694 if (mrf > max_usable_mrf) {
2695 slot++;
2696 break;
2697 }
2698 }
2699
2700 bool complete = slot >= prog_data->vue_map.num_slots;
2701 current_annotation = "URB write";
2702 vec4_instruction *inst = emit_urb_write_opcode(complete);
2703 inst->base_mrf = base_mrf;
2704 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2705
2706 /* Optional second URB write */
2707 if (!complete) {
2708 mrf = base_mrf + 1;
2709
2710 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2711 assert(mrf < max_usable_mrf);
2712
2713 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2714 }
2715
2716 current_annotation = "URB write";
2717 inst = emit_urb_write_opcode(true /* complete */);
2718 inst->base_mrf = base_mrf;
2719 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2720 /* URB destination offset. The previous write used MRFs 1-13; excluding
2721 * the one header MRF, that leaves 12 data regs. URB offset is in
2722 * URB row increments, and each of our MRFs is half of one of
2723 * those, since we're doing interleaved writes.
2724 */
2725 inst->offset = (max_usable_mrf - base_mrf) / 2;
2726 }
2727 }
2728
2729
2730 src_reg
2731 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2732 src_reg *reladdr, int reg_offset)
2733 {
2734 /* Because we store the values to scratch interleaved like our
2735 * vertex data, we need to scale the vec4 index by 2.
2736 */
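/* As an illustration of the scaling only: reg_offset 3 becomes an offset of
 * 6 on gen6+, and 3 * 32 = 96 on older parts, where the header wants byte
 * offsets (see below).
 */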
2737 int message_header_scale = 2;
2738
2739 /* Pre-gen6, the message header uses byte offsets instead of vec4
2740 * (16-byte) offset units.
2741 */
2742 if (brw->gen < 6)
2743 message_header_scale *= 16;
2744
2745 if (reladdr) {
2746 src_reg index = src_reg(this, glsl_type::int_type);
2747
2748 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2749 emit_before(inst, MUL(dst_reg(index),
2750 index, src_reg(message_header_scale)));
2751
2752 return index;
2753 } else {
2754 return src_reg(reg_offset * message_header_scale);
2755 }
2756 }
2757
2758 src_reg
2759 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2760 src_reg *reladdr, int reg_offset)
2761 {
2762 if (reladdr) {
2763 src_reg index = src_reg(this, glsl_type::int_type);
2764
2765 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2766
2767 /* Pre-gen6, the message header uses byte offsets instead of vec4
2768 * (16-byte) offset units.
2769 */
2770 if (brw->gen < 6) {
2771 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2772 }
2773
2774 return index;
2775 } else {
2776 int message_header_scale = brw->gen < 6 ? 16 : 1;
2777 return src_reg(reg_offset * message_header_scale);
2778 }
2779 }
2780
2781 /**
2782 * Emits an instruction before @inst to load the value named by @orig_src
2783 * from scratch space at @base_offset to @temp.
2784 *
2785 * @base_offset is measured in 32-byte units (the size of a register).
2786 */
2787 void
2788 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2789 dst_reg temp, src_reg orig_src,
2790 int base_offset)
2791 {
2792 int reg_offset = base_offset + orig_src.reg_offset;
2793 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2794
2795 emit_before(inst, SCRATCH_READ(temp, index));
2796 }
2797
2798 /**
2799 * Emits an instruction after @inst to store the value to be written
2800 * to @orig_dst to scratch space at @base_offset, from @temp.
2801 *
2802 * @base_offset is measured in 32-byte units (the size of a register).
2803 */
2804 void
2805 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2806 {
2807 int reg_offset = base_offset + inst->dst.reg_offset;
2808 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2809
2810 /* Create a temporary register to store *inst's result in.
2811 *
2812 * We have to be careful in MOVing from our temporary result register in
2813 * the scratch write. If we swizzle from channels of the temporary that
2814 * weren't initialized, it will confuse live interval analysis, which will
2815 * make spilling fail to make progress.
2816 */
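/* For example, a write with writemask .xz gets the temporary the swizzle
 * .xxzx, so the unwritten y and w channels re-read channel x instead of an
 * uninitialized channel.
 */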
2817 src_reg temp = src_reg(this, glsl_type::vec4_type);
2818 temp.type = inst->dst.type;
2819 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2820 int swizzles[4];
2821 for (int i = 0; i < 4; i++)
2822 if (inst->dst.writemask & (1 << i))
2823 swizzles[i] = i;
2824 else
2825 swizzles[i] = first_writemask_chan;
2826 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2827 swizzles[2], swizzles[3]);
2828
2829 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2830 inst->dst.writemask));
2831 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2832 write->predicate = inst->predicate;
2833 write->ir = inst->ir;
2834 write->annotation = inst->annotation;
2835 inst->insert_after(write);
2836
2837 inst->dst.file = temp.file;
2838 inst->dst.reg = temp.reg;
2839 inst->dst.reg_offset = temp.reg_offset;
2840 inst->dst.reladdr = NULL;
2841 }
2842
2843 /**
2844 * We can't generally support array access in GRF space, because a
2845 * single instruction's destination can only span 2 contiguous
2846 * registers. So, we send all GRF arrays that get variable index
2847 * access to scratch space.
2848 */
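/* A typical trigger (hypothetical GLSL, for illustration only):
 *
 *    float a[8];
 *    ...
 *    a[i] = x;      // 'i' not known at compile time
 *
 * The variable index shows up as a reladdr on the GRF access, which is what
 * sends the whole array to scratch here.
 */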
2849 void
2850 vec4_visitor::move_grf_array_access_to_scratch()
2851 {
2852 int scratch_loc[this->virtual_grf_count];
2853
2854 for (int i = 0; i < this->virtual_grf_count; i++) {
2855 scratch_loc[i] = -1;
2856 }
2857
2858 /* First, calculate the set of virtual GRFs that need to be punted
2859 * to scratch due to having any array access on them, and where in
2860 * scratch they will live.
2861 */
2862 foreach_list(node, &this->instructions) {
2863 vec4_instruction *inst = (vec4_instruction *)node;
2864
2865 if (inst->dst.file == GRF && inst->dst.reladdr &&
2866 scratch_loc[inst->dst.reg] == -1) {
2867 scratch_loc[inst->dst.reg] = c->last_scratch;
2868 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2869 }
2870
2871 for (int i = 0 ; i < 3; i++) {
2872 src_reg *src = &inst->src[i];
2873
2874 if (src->file == GRF && src->reladdr &&
2875 scratch_loc[src->reg] == -1) {
2876 scratch_loc[src->reg] = c->last_scratch;
2877 c->last_scratch += this->virtual_grf_sizes[src->reg];
2878 }
2879 }
2880 }
2881
2882 /* Now, for anything that will be accessed through scratch, rewrite
2883 * it to load/store. Note that this is a _safe list walk, because
2884 * we may generate a new scratch_write instruction after the one
2885 * we're processing.
2886 */
2887 foreach_list_safe(node, &this->instructions) {
2888 vec4_instruction *inst = (vec4_instruction *)node;
2889
2890 /* Set up the annotation tracking for new generated instructions. */
2891 base_ir = inst->ir;
2892 current_annotation = inst->annotation;
2893
2894 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2895 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2896 }
2897
2898 for (int i = 0 ; i < 3; i++) {
2899 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2900 continue;
2901
2902 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2903
2904 emit_scratch_read(inst, temp, inst->src[i],
2905 scratch_loc[inst->src[i].reg]);
2906
2907 inst->src[i].file = temp.file;
2908 inst->src[i].reg = temp.reg;
2909 inst->src[i].reg_offset = temp.reg_offset;
2910 inst->src[i].reladdr = NULL;
2911 }
2912 }
2913 }
2914
2915 /**
2916 * Emits an instruction before @inst to load the value named by @orig_src
2917 * from the pull constant buffer (surface) at @base_offset to @temp.
2918 */
2919 void
2920 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2921 dst_reg temp, src_reg orig_src,
2922 int base_offset)
2923 {
2924 int reg_offset = base_offset + orig_src.reg_offset;
2925 src_reg index = src_reg((unsigned)SURF_INDEX_VEC4_CONST_BUFFER);
2926 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2927 vec4_instruction *load;
2928
2929 if (brw->gen >= 7) {
2930 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
2931 grf_offset.type = offset.type;
2932 emit_before(inst, MOV(grf_offset, offset));
2933
2934 load = new(mem_ctx) vec4_instruction(this,
2935 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
2936 temp, index, src_reg(grf_offset));
2937 } else {
2938 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2939 temp, index, offset);
2940 load->base_mrf = 14;
2941 load->mlen = 1;
2942 }
2943 emit_before(inst, load);
2944 }
2945
2946 /**
2947 * Implements array access of uniforms by inserting a
2948 * PULL_CONSTANT_LOAD instruction.
2949 *
2950 * Unlike temporary GRF array access (which we don't support, due to
2951 * the difficulty of doing relative addressing on instruction
2952 * destinations), we could potentially do array access of uniforms
2953 * that were loaded in GRF space as push constants. In real-world
2954 * usage we've seen, though, the arrays being used are always larger
2955 * than we could load as push constants, so just always move all
2956 * uniform array access out to a pull constant buffer.
2957 */
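/* Hypothetical GLSL that ends up here (illustration only):
 *
 *    uniform vec4 colors[64];
 *    ...
 *    gl_FrontColor = colors[index];   // non-constant 'index'
 *
 * The variably-indexed uniform array is copied into pull_param[] and each
 * such access is replaced by a pull constant load into a temporary.
 */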
2958 void
2959 vec4_visitor::move_uniform_array_access_to_pull_constants()
2960 {
2961 int pull_constant_loc[this->uniforms];
2962
2963 for (int i = 0; i < this->uniforms; i++) {
2964 pull_constant_loc[i] = -1;
2965 }
2966
2967 /* Walk through and find array access of uniforms. Put a copy of that
2968 * uniform in the pull constant buffer.
2969 *
2970 * Note that we don't move constant-indexed accesses to arrays. No
2971 * testing has been done of the performance impact of this choice.
2972 */
2973 foreach_list_safe(node, &this->instructions) {
2974 vec4_instruction *inst = (vec4_instruction *)node;
2975
2976 for (int i = 0 ; i < 3; i++) {
2977 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2978 continue;
2979
2980 int uniform = inst->src[i].reg;
2981
2982 /* If this array isn't already present in the pull constant buffer,
2983 * add it.
2984 */
2985 if (pull_constant_loc[uniform] == -1) {
2986 const float **values = &prog_data->param[uniform * 4];
2987
2988 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2989
2990 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2991 prog_data->pull_param[prog_data->nr_pull_params++]
2992 = values[j];
2993 }
2994 }
2995
2996 /* Set up the annotation tracking for new generated instructions. */
2997 base_ir = inst->ir;
2998 current_annotation = inst->annotation;
2999
3000 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3001
3002 emit_pull_constant_load(inst, temp, inst->src[i],
3003 pull_constant_loc[uniform]);
3004
3005 inst->src[i].file = temp.file;
3006 inst->src[i].reg = temp.reg;
3007 inst->src[i].reg_offset = temp.reg_offset;
3008 inst->src[i].reladdr = NULL;
3009 }
3010 }
3011
3012 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3013 * no need to track them as larger-than-vec4 objects. This will be
3014 * relied on in cutting out unused uniform vectors from push
3015 * constants.
3016 */
3017 split_uniform_registers();
3018 }
3019
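/**
 * If an unsigned (UD) source carries a negate modifier, materialize the
 * negation with an explicit MOV into a fresh uvec4 temporary and use that
 * instead, so no UD source reaches its consumer with the negate bit set.
 */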
3020 void
3021 vec4_visitor::resolve_ud_negate(src_reg *reg)
3022 {
3023 if (reg->type != BRW_REGISTER_TYPE_UD ||
3024 !reg->negate)
3025 return;
3026
3027 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3028 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3029 *reg = temp;
3030 }
3031
3032 vec4_visitor::vec4_visitor(struct brw_context *brw,
3033 struct brw_vec4_compile *c,
3034 struct gl_program *prog,
3035 const struct brw_vec4_prog_key *key,
3036 struct brw_vec4_prog_data *prog_data,
3037 struct gl_shader_program *shader_prog,
3038 struct brw_shader *shader,
3039 void *mem_ctx,
3040 bool debug_flag)
3041 : debug_flag(debug_flag)
3042 {
3043 this->brw = brw;
3044 this->ctx = &brw->ctx;
3045 this->shader_prog = shader_prog;
3046 this->shader = shader;
3047
3048 this->mem_ctx = mem_ctx;
3049 this->failed = false;
3050
3051 this->base_ir = NULL;
3052 this->current_annotation = NULL;
3053 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3054
3055 this->c = c;
3056 this->prog = prog;
3057 this->key = key;
3058 this->prog_data = prog_data;
3059
3060 this->variable_ht = hash_table_ctor(0,
3061 hash_table_pointer_hash,
3062 hash_table_pointer_compare);
3063
3064 this->virtual_grf_start = NULL;
3065 this->virtual_grf_end = NULL;
3066 this->virtual_grf_sizes = NULL;
3067 this->virtual_grf_count = 0;
3068 this->virtual_grf_reg_map = NULL;
3069 this->virtual_grf_reg_count = 0;
3070 this->virtual_grf_array_size = 0;
3071 this->live_intervals_valid = false;
3072
3073 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3074
3075 this->uniforms = 0;
3076 }
3077
3078 vec4_visitor::~vec4_visitor()
3079 {
3080 hash_table_dtor(this->variable_ht);
3081 }
3082
3083
3084 void
3085 vec4_visitor::fail(const char *format, ...)
3086 {
3087 va_list va;
3088 char *msg;
3089
3090 if (failed)
3091 return;
3092
3093 failed = true;
3094
3095 va_start(va, format);
3096 msg = ralloc_vasprintf(mem_ctx, format, va);
3097 va_end(va);
3098 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3099
3100 this->fail_msg = msg;
3101
3102 if (debug_flag) {
3103 fprintf(stderr, "%s", msg);
3104 }
3105 }
3106
3107 } /* namespace brw */