src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "glsl/ir_uniform.h"
  26 extern "C" {
  27 #include "program/sampler.h"
  28 }
  29
  30 namespace brw {
  31
  32 vec4_instruction::vec4_instruction(vec4_visitor *v,
  33                                    enum opcode opcode, dst_reg dst,
  34                                    src_reg src0, src_reg src1, src_reg src2)
  35 {
  36    this->opcode = opcode;
  37    this->dst = dst;
  38    this->src[0] = src0;
  39    this->src[1] = src1;
  40    this->src[2] = src2;
  41    this->ir = v->base_ir;
  42    this->annotation = v->current_annotation;
  43 }
  44
  45 vec4_instruction *
  46 vec4_visitor::emit(vec4_instruction *inst)
  47 {
  48    this->instructions.push_tail(inst);
  49
  50    return inst;
  51 }
  52
  53 vec4_instruction *
  54 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
  55 {
  56    new_inst->ir = inst->ir;
  57    new_inst->annotation = inst->annotation;
  58
  59    inst->insert_before(new_inst);
  60
  61    return inst;
  62 }
  63
  64 vec4_instruction *
  65 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
  66                    src_reg src0, src_reg src1, src_reg src2)
  67 {
  68    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
  69                                              src0, src1, src2));
  70 }
  71
  72
  73 vec4_instruction *
  74 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
  75 {
  76    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
  77 }
  78
  79 vec4_instruction *
  80 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
  81 {
  82    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
  83 }
  84
  85 vec4_instruction *
  86 vec4_visitor::emit(enum opcode opcode)
  87 {
  88    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
  89 }
  90
  91 #define ALU1(op)                                                        \
  92    vec4_instruction *                                                   \
  93    vec4_visitor::op(dst_reg dst, src_reg src0)                          \
  94    {                                                                    \
  95       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
  96                                            src0);                       \
  97    }
  98
  99 #define ALU2(op)                                                        \
 100    vec4_instruction *                                                   \
 101    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)            \
 102    {                                                                    \
 103       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 104                                            src0, src1);                 \
 105    }
 106
 107 #define ALU3(op)                                                        \
 108    vec4_instruction *                                                   \
 109    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
 110    {                                                                    \
 111       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 112                                            src0, src1, src2);           \
 113    }
 114
 115 ALU1(NOT)
 116 ALU1(MOV)
 117 ALU1(FRC)
 118 ALU1(RNDD)
 119 ALU1(RNDE)
 120 ALU1(RNDZ)
 121 ALU1(F32TO16)
 122 ALU1(F16TO32)
 123 ALU2(ADD)
 124 ALU2(MUL)
 125 ALU2(MACH)
 126 ALU2(AND)
 127 ALU2(OR)
 128 ALU2(XOR)
 129 ALU2(DP3)
 130 ALU2(DP4)
 131 ALU2(DPH)
 132 ALU2(SHL)
 133 ALU2(SHR)
 134 ALU2(ASR)
 135 ALU3(LRP)
 136 ALU1(BFREV)
 137 ALU3(BFE)
 138 ALU2(BFI1)
 139 ALU3(BFI2)
 140 ALU1(FBH)
 141 ALU1(FBL)
 142 ALU1(CBIT)
 143 ALU3(MAD)
 144
 145 /** Gen4 predicated IF. */
 146 vec4_instruction *
 147 vec4_visitor::IF(uint32_t predicate)
 148 {
 149    vec4_instruction *inst;
 150
 151    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
 152    inst->predicate = predicate;
 153
 154    return inst;
 155 }
 156
 157 /** Gen6+ IF with embedded comparison. */
 158 vec4_instruction *
 159 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
 160 {
 161    assert(brw->gen >= 6);
 162
 163    vec4_instruction *inst;
 164
 165    resolve_ud_negate(&src0);
 166    resolve_ud_negate(&src1);
 167
 168    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
 169                                         src0, src1);
 170    inst->conditional_mod = condition;
 171
 172    return inst;
 173 }
 174
 175 /**
 176  * CMP: Sets the low bit of the destination channels with the result
 177  * of the comparison, while the upper bits are undefined, and updates
 178  * the flag register with the packed 16 bits of the result.
 179  */
 180 vec4_instruction *
 181 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
 182 {
 183    vec4_instruction *inst;
 184
 185    /* original gen4 does type conversion to the destination type
 186     * before before comparison, producing garbage results for floating
 187     * point comparisons.
 188     */
 189    if (brw->gen == 4) {
 190       dst.type = src0.type;
 191       if (dst.file == HW_REG)
 192          dst.fixed_hw_reg.type = dst.type;
 193    }
 194
 195    resolve_ud_negate(&src0);
 196    resolve_ud_negate(&src1);
 197
 198    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
 199    inst->conditional_mod = condition;
 200
 201    return inst;
 202 }
 203
 204 vec4_instruction *
 205 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
 206 {
 207    vec4_instruction *inst;
 208
 209    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
 210                                         dst, index);
 211    inst->base_mrf = 14;
 212    inst->mlen = 2;
 213
 214    return inst;
 215 }
 216
 217 vec4_instruction *
 218 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
 219 {
 220    vec4_instruction *inst;
 221
 222    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
 223                                         dst, src, index);
 224    inst->base_mrf = 13;
 225    inst->mlen = 3;
 226
 227    return inst;
 228 }
 229
 230 void
 231 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 232 {
 233    static enum opcode dot_opcodes[] = {
 234       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 235    };
 236
 237    emit(dot_opcodes[elements - 2], dst, src0, src1);
 238 }
 239
 240 src_reg
 241 vec4_visitor::fix_3src_operand(src_reg src)
 242 {
 243    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 244     * able to use vertical stride of zero to replicate the vec4 uniform, like
 245     *
 246     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 247     *
 248     * But you can't, since vertical stride is always four in three-source
 249     * instructions. Instead, insert a MOV instruction to do the replication so
 250     * that the three-source instruction can consume it.
 251     */
 252
 253    /* The MOV is only needed if the source is a uniform or immediate. */
 254    if (src.file != UNIFORM && src.file != IMM)
 255       return src;
 256
 257    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 258    expanded.type = src.type;
 259    emit(MOV(expanded, src));
 260    return src_reg(expanded);
 261 }
 262
 263 src_reg
 264 vec4_visitor::fix_math_operand(src_reg src)
 265 {
 266    /* The gen6 math instruction ignores the source modifiers --
 267     * swizzle, abs, negate, and at least some parts of the register
 268     * region description.
 269     *
 270     * Rather than trying to enumerate all these cases, *always* expand the
 271     * operand to a temp GRF for gen6.
 272     *
 273     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 274     * can't use.
 275     */
 276
 277    if (brw->gen == 7 && src.file != IMM)
 278       return src;
 279
 280    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 281    expanded.type = src.type;
 282    emit(MOV(expanded, src));
 283    return src_reg(expanded);
 284 }
 285
 286 void
 287 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
 288 {
 289    src = fix_math_operand(src);
 290
 291    if (dst.writemask != WRITEMASK_XYZW) {
 292       /* The gen6 math instruction must be align1, so we can't do
 293        * writemasks.
 294        */
 295       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 296
 297       emit(opcode, temp_dst, src);
 298
 299       emit(MOV(dst, src_reg(temp_dst)));
 300    } else {
 301       emit(opcode, dst, src);
 302    }
 303 }
 304
 305 void
 306 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
 307 {
 308    vec4_instruction *inst = emit(opcode, dst, src);
 309    inst->base_mrf = 1;
 310    inst->mlen = 1;
 311 }
 312
 313 void
 314 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
 315 {
 316    switch (opcode) {
 317    case SHADER_OPCODE_RCP:
 318    case SHADER_OPCODE_RSQ:
 319    case SHADER_OPCODE_SQRT:
 320    case SHADER_OPCODE_EXP2:
 321    case SHADER_OPCODE_LOG2:
 322    case SHADER_OPCODE_SIN:
 323    case SHADER_OPCODE_COS:
 324       break;
 325    default:
 326       assert(!"not reached: bad math opcode");
 327       return;
 328    }
 329
 330    if (brw->gen >= 6) {
 331       return emit_math1_gen6(opcode, dst, src);
 332    } else {
 333       return emit_math1_gen4(opcode, dst, src);
 334    }
 335 }
 336
 337 void
 338 vec4_visitor::emit_math2_gen6(enum opcode opcode,
 339                               dst_reg dst, src_reg src0, src_reg src1)
 340 {
 341    src0 = fix_math_operand(src0);
 342    src1 = fix_math_operand(src1);
 343
 344    if (dst.writemask != WRITEMASK_XYZW) {
 345       /* The gen6 math instruction must be align1, so we can't do
 346        * writemasks.
 347        */
 348       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 349       temp_dst.type = dst.type;
 350
 351       emit(opcode, temp_dst, src0, src1);
 352
 353       emit(MOV(dst, src_reg(temp_dst)));
 354    } else {
 355       emit(opcode, dst, src0, src1);
 356    }
 357 }
 358
 359 void
 360 vec4_visitor::emit_math2_gen4(enum opcode opcode,
 361                               dst_reg dst, src_reg src0, src_reg src1)
 362 {
 363    vec4_instruction *inst = emit(opcode, dst, src0, src1);
 364    inst->base_mrf = 1;
 365    inst->mlen = 2;
 366 }
 367
 368 void
 369 vec4_visitor::emit_math(enum opcode opcode,
 370                         dst_reg dst, src_reg src0, src_reg src1)
 371 {
 372    switch (opcode) {
 373    case SHADER_OPCODE_POW:
 374    case SHADER_OPCODE_INT_QUOTIENT:
 375    case SHADER_OPCODE_INT_REMAINDER:
 376       break;
 377    default:
 378       assert(!"not reached: unsupported binary math opcode");
 379       return;
 380    }
 381
 382    if (brw->gen >= 6) {
 383       return emit_math2_gen6(opcode, dst, src0, src1);
 384    } else {
 385       return emit_math2_gen4(opcode, dst, src0, src1);
 386    }
 387 }
 388
 389 void
 390 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 391 {
 392    if (brw->gen < 7)
 393       assert(!"ir_unop_pack_half_2x16 should be lowered");
 394
 395    assert(dst.type == BRW_REGISTER_TYPE_UD);
 396    assert(src0.type == BRW_REGISTER_TYPE_F);
 397
 398    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 399     *
 400     *   Because this instruction does not have a 16-bit floating-point type,
 401     *   the destination data type must be Word (W).
 402     *
 403     *   The destination must be DWord-aligned and specify a horizontal stride
 404     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 405     *   each destination channel and the upper word is not modified.
 406     *
 407     * The above restriction implies that the f32to16 instruction must use
 408     * align1 mode, because only in align1 mode is it possible to specify
 409     * horizontal stride.  We choose here to defy the hardware docs and emit
 410     * align16 instructions.
 411     *
 412     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 413     * instructions. I was partially successful in that the code passed all
 414     * tests.  However, the code was dubiously correct and fragile, and the
 415     * tests were not harsh enough to probe that frailty. Not trusting the
 416     * code, I chose instead to remain in align16 mode in defiance of the hw
 417     * docs).
 418     *
 419     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 420     * simulator, emitting a f32to16 in align16 mode with UD as destination
 421     * data type is safe. The behavior differs from that specified in the PRM
 422     * in that the upper word of each destination channel is cleared to 0.
 423     */
 424
 425    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 426    src_reg tmp_src(tmp_dst);
 427
 428 #if 0
 429    /* Verify the undocumented behavior on which the following instructions
 430     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 431     * then the result of the bit-or instruction below will be incorrect.
 432     *
 433     * You should inspect the disasm output in order to verify that the MOV is
 434     * not optimized away.
 435     */
 436    emit(MOV(tmp_dst, src_reg(0x12345678u)));
 437 #endif
 438
 439    /* Give tmp the form below, where "." means untouched.
 440     *
 441     *     w z          y          x w z          y          x
 442     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 443     *
 444     * That the upper word of each write-channel be 0 is required for the
 445     * following bit-shift and bit-or instructions to work. Note that this
 446     * relies on the undocumented hardware behavior mentioned above.
 447     */
 448    tmp_dst.writemask = WRITEMASK_XY;
 449    emit(F32TO16(tmp_dst, src0));
 450
 451    /* Give the write-channels of dst the form:
 452     *   0xhhhh0000
 453     */
 454    tmp_src.swizzle = SWIZZLE_Y;
 455    emit(SHL(dst, tmp_src, src_reg(16u)));
 456
 457    /* Finally, give the write-channels of dst the form of packHalf2x16's
 458     * output:
 459     *   0xhhhhllll
 460     */
 461    tmp_src.swizzle = SWIZZLE_X;
 462    emit(OR(dst, src_reg(dst), tmp_src));
 463 }
 464
 465 void
 466 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 467 {
 468    if (brw->gen < 7)
 469       assert(!"ir_unop_unpack_half_2x16 should be lowered");
 470
 471    assert(dst.type == BRW_REGISTER_TYPE_F);
 472    assert(src0.type == BRW_REGISTER_TYPE_UD);
 473
 474    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 475     *
 476     *   Because this instruction does not have a 16-bit floating-point type,
 477     *   the source data type must be Word (W). The destination type must be
 478     *   F (Float).
 479     *
 480     * To use W as the source data type, we must adjust horizontal strides,
 481     * which is only possible in align1 mode. All my [chadv] attempts at
 482     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 483     * Piglit tests, so I gave up.
 484     *
 485     * I've verified that, on gen7 hardware and the simulator, it is safe to
 486     * emit f16to32 in align16 mode with UD as source data type.
 487     */
 488
 489    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 490    src_reg tmp_src(tmp_dst);
 491
 492    tmp_dst.writemask = WRITEMASK_X;
 493    emit(AND(tmp_dst, src0, src_reg(0xffffu)));
 494
 495    tmp_dst.writemask = WRITEMASK_Y;
 496    emit(SHR(tmp_dst, src0, src_reg(16u)));
 497
 498    dst.writemask = WRITEMASK_XY;
 499    emit(F16TO32(dst, tmp_src));
 500 }
 501
 502 void
 503 vec4_visitor::visit_instructions(const exec_list *list)
 504 {
 505    foreach_list(node, list) {
 506       ir_instruction *ir = (ir_instruction *)node;
 507
 508       base_ir = ir;
 509       ir->accept(this);
 510    }
 511 }
 512
 513
 514 static int
 515 type_size(const struct glsl_type *type)
 516 {
 517    unsigned int i;
 518    int size;
 519
 520    switch (type->base_type) {
 521    case GLSL_TYPE_UINT:
 522    case GLSL_TYPE_INT:
 523    case GLSL_TYPE_FLOAT:
 524    case GLSL_TYPE_BOOL:
 525       if (type->is_matrix()) {
 526          return type->matrix_columns;
 527       } else {
 528          /* Regardless of size of vector, it gets a vec4. This is bad
 529           * packing for things like floats, but otherwise arrays become a
 530           * mess.  Hopefully a later pass over the code can pack scalars
 531           * down if appropriate.
 532           */
 533          return 1;
 534       }
 535    case GLSL_TYPE_ARRAY:
 536       assert(type->length > 0);
 537       return type_size(type->fields.array) * type->length;
 538    case GLSL_TYPE_STRUCT:
 539       size = 0;
 540       for (i = 0; i < type->length; i++) {
 541          size += type_size(type->fields.structure[i].type);
 542       }
 543       return size;
 544    case GLSL_TYPE_SAMPLER:
 545       /* Samplers take up one slot in UNIFORMS[], but they're baked in
 546        * at link time.
 547        */
 548       return 1;
 549    case GLSL_TYPE_VOID:
 550    case GLSL_TYPE_ERROR:
 551    case GLSL_TYPE_INTERFACE:
 552       assert(0);
 553       break;
 554    }
 555
 556    return 0;
 557 }
 558
 559 int
 560 vec4_visitor::virtual_grf_alloc(int size)
 561 {
 562    if (virtual_grf_array_size <= virtual_grf_count) {
 563       if (virtual_grf_array_size == 0)
 564          virtual_grf_array_size = 16;
 565       else
 566          virtual_grf_array_size *= 2;
 567       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 568                                    virtual_grf_array_size);
 569       virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
 570                                      virtual_grf_array_size);
 571    }
 572    virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
 573    virtual_grf_reg_count += size;
 574    virtual_grf_sizes[virtual_grf_count] = size;
 575    return virtual_grf_count++;
 576 }
 577
 578 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 579 {
 580    init();
 581
 582    this->file = GRF;
 583    this->reg = v->virtual_grf_alloc(type_size(type));
 584
 585    if (type->is_array() || type->is_record()) {
 586       this->swizzle = BRW_SWIZZLE_NOOP;
 587    } else {
 588       this->swizzle = swizzle_for_size(type->vector_elements);
 589    }
 590
 591    this->type = brw_type_for_base_type(type);
 592 }
 593
 594 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 595 {
 596    init();
 597
 598    this->file = GRF;
 599    this->reg = v->virtual_grf_alloc(type_size(type));
 600
 601    if (type->is_array() || type->is_record()) {
 602       this->writemask = WRITEMASK_XYZW;
 603    } else {
 604       this->writemask = (1 << type->vector_elements) - 1;
 605    }
 606
 607    this->type = brw_type_for_base_type(type);
 608 }
 609
 610 /* Our support for uniforms is piggy-backed on the struct
 611  * gl_fragment_program, because that's where the values actually
 612  * get stored, rather than in some global gl_shader_program uniform
 613  * store.
 614  */
 615 void
 616 vec4_visitor::setup_uniform_values(ir_variable *ir)
 617 {
 618    int namelen = strlen(ir->name);
 619
 620    /* The data for our (non-builtin) uniforms is stored in a series of
 621     * gl_uniform_driver_storage structs for each subcomponent that
 622     * glGetUniformLocation() could name.  We know it's been set up in the same
 623     * order we'd walk the type, so walk the list of storage and find anything
 624     * with our name, or the prefix of a component that starts with our name.
 625     */
 626    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
 627       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 628
 629       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 630           (storage->name[namelen] != 0 &&
 631            storage->name[namelen] != '.' &&
 632            storage->name[namelen] != '[')) {
 633          continue;
 634       }
 635
 636       gl_constant_value *components = storage->storage;
 637       unsigned vector_count = (MAX2(storage->array_elements, 1) *
 638                                storage->type->matrix_columns);
 639
 640       for (unsigned s = 0; s < vector_count; s++) {
 641          uniform_vector_size[uniforms] = storage->type->vector_elements;
 642
 643          int i;
 644          for (i = 0; i < uniform_vector_size[uniforms]; i++) {
 645             prog_data->param[uniforms * 4 + i] = &components->f;
 646             components++;
 647          }
 648          for (; i < 4; i++) {
 649             static float zero = 0;
 650             prog_data->param[uniforms * 4 + i] = &zero;
 651          }
 652
 653          uniforms++;
 654       }
 655    }
 656 }
 657
 658 void
 659 vec4_visitor::setup_uniform_clipplane_values()
 660 {
 661    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
 662
 663    for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
 664       this->uniform_vector_size[this->uniforms] = 4;
 665       this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
 666       this->userplane[i].type = BRW_REGISTER_TYPE_F;
 667       for (int j = 0; j < 4; ++j) {
 668          prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
 669       }
 670       ++this->uniforms;
 671    }
 672 }
 673
 674 /* Our support for builtin uniforms is even scarier than non-builtin.
 675  * It sits on top of the PROG_STATE_VAR parameters that are
 676  * automatically updated from GL context state.
 677  */
 678 void
 679 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
 680 {
 681    const ir_state_slot *const slots = ir->state_slots;
 682    assert(ir->state_slots != NULL);
 683
 684    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 685       /* This state reference has already been setup by ir_to_mesa,
 686        * but we'll get the same index back here.  We can reference
 687        * ParameterValues directly, since unlike brw_fs.cpp, we never
 688        * add new state references during compile.
 689        */
 690       int index = _mesa_add_state_reference(this->prog->Parameters,
 691                                             (gl_state_index *)slots[i].tokens);
 692       float *values = &this->prog->Parameters->ParameterValues[index][0].f;
 693
 694       this->uniform_vector_size[this->uniforms] = 0;
 695       /* Add each of the unique swizzled channels of the element.
 696        * This will end up matching the size of the glsl_type of this field.
 697        */
 698       int last_swiz = -1;
 699       for (unsigned int j = 0; j < 4; j++) {
 700          int swiz = GET_SWZ(slots[i].swizzle, j);
 701          last_swiz = swiz;
 702
 703          prog_data->param[this->uniforms * 4 + j] = &values[swiz];
 704          if (swiz <= last_swiz)
 705             this->uniform_vector_size[this->uniforms]++;
 706       }
 707       this->uniforms++;
 708    }
 709 }
 710
 711 dst_reg *
 712 vec4_visitor::variable_storage(ir_variable *var)
 713 {
 714    return (dst_reg *)hash_table_find(this->variable_ht, var);
 715 }
 716
 717 void
 718 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
 719 {
 720    ir_expression *expr = ir->as_expression();
 721
 722    *predicate = BRW_PREDICATE_NORMAL;
 723
 724    if (expr) {
 725       src_reg op[2];
 726       vec4_instruction *inst;
 727
 728       assert(expr->get_num_operands() <= 2);
 729       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 730          expr->operands[i]->accept(this);
 731          op[i] = this->result;
 732
 733          resolve_ud_negate(&op[i]);
 734       }
 735
 736       switch (expr->operation) {
 737       case ir_unop_logic_not:
 738          inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
 739          inst->conditional_mod = BRW_CONDITIONAL_Z;
 740          break;
 741
 742       case ir_binop_logic_xor:
 743          inst = emit(XOR(dst_null_d(), op[0], op[1]));
 744          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 745          break;
 746
 747       case ir_binop_logic_or:
 748          inst = emit(OR(dst_null_d(), op[0], op[1]));
 749          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 750          break;
 751
 752       case ir_binop_logic_and:
 753          inst = emit(AND(dst_null_d(), op[0], op[1]));
 754          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 755          break;
 756
 757       case ir_unop_f2b:
 758          if (brw->gen >= 6) {
 759             emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
 760          } else {
 761             inst = emit(MOV(dst_null_f(), op[0]));
 762             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 763          }
 764          break;
 765
 766       case ir_unop_i2b:
 767          if (brw->gen >= 6) {
 768             emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 769          } else {
 770             inst = emit(MOV(dst_null_d(), op[0]));
 771             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 772          }
 773          break;
 774
 775       case ir_binop_all_equal:
 776          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 777          *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
 778          break;
 779
 780       case ir_binop_any_nequal:
 781          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 782          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 783          break;
 784
 785       case ir_unop_any:
 786          inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 787          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 788          break;
 789
 790       case ir_binop_greater:
 791       case ir_binop_gequal:
 792       case ir_binop_less:
 793       case ir_binop_lequal:
 794       case ir_binop_equal:
 795       case ir_binop_nequal:
 796          emit(CMP(dst_null_d(), op[0], op[1],
 797                   brw_conditional_for_comparison(expr->operation)));
 798          break;
 799
 800       default:
 801          assert(!"not reached");
 802          break;
 803       }
 804       return;
 805    }
 806
 807    ir->accept(this);
 808
 809    resolve_ud_negate(&this->result);
 810
 811    if (brw->gen >= 6) {
 812       vec4_instruction *inst = emit(AND(dst_null_d(),
 813                                         this->result, src_reg(1)));
 814       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 815    } else {
 816       vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
 817       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 818    }
 819 }
 820
 821 /**
 822  * Emit a gen6 IF statement with the comparison folded into the IF
 823  * instruction.
 824  */
 825 void
 826 vec4_visitor::emit_if_gen6(ir_if *ir)
 827 {
 828    ir_expression *expr = ir->condition->as_expression();
 829
 830    if (expr) {
 831       src_reg op[2];
 832       dst_reg temp;
 833
 834       assert(expr->get_num_operands() <= 2);
 835       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 836          expr->operands[i]->accept(this);
 837          op[i] = this->result;
 838       }
 839
 840       switch (expr->operation) {
 841       case ir_unop_logic_not:
 842          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
 843          return;
 844
 845       case ir_binop_logic_xor:
 846          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
 847          return;
 848
 849       case ir_binop_logic_or:
 850          temp = dst_reg(this, glsl_type::bool_type);
 851          emit(OR(temp, op[0], op[1]));
 852          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 853          return;
 854
 855       case ir_binop_logic_and:
 856          temp = dst_reg(this, glsl_type::bool_type);
 857          emit(AND(temp, op[0], op[1]));
 858          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 859          return;
 860
 861       case ir_unop_f2b:
 862          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 863          return;
 864
 865       case ir_unop_i2b:
 866          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 867          return;
 868
 869       case ir_binop_greater:
 870       case ir_binop_gequal:
 871       case ir_binop_less:
 872       case ir_binop_lequal:
 873       case ir_binop_equal:
 874       case ir_binop_nequal:
 875          emit(IF(op[0], op[1],
 876                  brw_conditional_for_comparison(expr->operation)));
 877          return;
 878
 879       case ir_binop_all_equal:
 880          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 881          emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
 882          return;
 883
 884       case ir_binop_any_nequal:
 885          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 886          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 887          return;
 888
 889       case ir_unop_any:
 890          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 891          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 892          return;
 893
 894       default:
 895          assert(!"not reached");
 896          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 897          return;
 898       }
 899       return;
 900    }
 901
 902    ir->condition->accept(this);
 903
 904    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
 905 }
 906
 907 dst_reg
 908 with_writemask(dst_reg const & r, int mask)
 909 {
 910    dst_reg result = r;
 911    result.writemask = mask;
 912    return result;
 913 }
 914
 915
 916 void
 917 vec4_visitor::visit(ir_variable *ir)
 918 {
 919    dst_reg *reg = NULL;
 920
 921    if (variable_storage(ir))
 922       return;
 923
 924    switch (ir->mode) {
 925    case ir_var_shader_in:
 926       reg = new(mem_ctx) dst_reg(ATTR, ir->location);
 927       break;
 928
 929    case ir_var_shader_out:
 930       reg = new(mem_ctx) dst_reg(this, ir->type);
 931
 932       for (int i = 0; i < type_size(ir->type); i++) {
 933          output_reg[ir->location + i] = *reg;
 934          output_reg[ir->location + i].reg_offset = i;
 935          output_reg[ir->location + i].type =
 936             brw_type_for_base_type(ir->type->get_scalar_type());
 937          output_reg_annotation[ir->location + i] = ir->name;
 938       }
 939       break;
 940
 941    case ir_var_auto:
 942    case ir_var_temporary:
 943       reg = new(mem_ctx) dst_reg(this, ir->type);
 944       break;
 945
 946    case ir_var_uniform:
 947       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
 948
 949       /* Thanks to the lower_ubo_reference pass, we will see only
 950        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
 951        * variables, so no need for them to be in variable_ht.
 952        */
 953       if (ir->is_in_uniform_block())
 954          return;
 955
 956       /* Track how big the whole uniform variable is, in case we need to put a
 957        * copy of its data into pull constants for array access.
 958        */
 959       this->uniform_size[this->uniforms] = type_size(ir->type);
 960
 961       if (!strncmp(ir->name, "gl_", 3)) {
 962          setup_builtin_uniform_values(ir);
 963       } else {
 964          setup_uniform_values(ir);
 965       }
 966       break;
 967
 968    case ir_var_system_value:
 969       reg = make_reg_for_system_value(ir);
 970       break;
 971
 972    default:
 973       assert(!"not reached");
 974    }
 975
 976    reg->type = brw_type_for_base_type(ir->type);
 977    hash_table_insert(this->variable_ht, reg, ir);
 978 }
 979
 980 void
 981 vec4_visitor::visit(ir_loop *ir)
 982 {
 983    dst_reg counter;
 984
 985    /* We don't want debugging output to print the whole body of the
 986     * loop as the annotation.
 987     */
 988    this->base_ir = NULL;
 989
 990    if (ir->counter != NULL) {
 991       this->base_ir = ir->counter;
 992       ir->counter->accept(this);
 993       counter = *(variable_storage(ir->counter));
 994
 995       if (ir->from != NULL) {
 996          this->base_ir = ir->from;
 997          ir->from->accept(this);
 998
 999          emit(MOV(counter, this->result));
1000       }
1001    }
1002
1003    emit(BRW_OPCODE_DO);
1004
1005    if (ir->to) {
1006       this->base_ir = ir->to;
1007       ir->to->accept(this);
1008
1009       emit(CMP(dst_null_d(), src_reg(counter), this->result,
1010                brw_conditional_for_comparison(ir->cmp)));
1011
1012       vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1013       inst->predicate = BRW_PREDICATE_NORMAL;
1014    }
1015
1016    visit_instructions(&ir->body_instructions);
1017
1018
1019    if (ir->increment) {
1020       this->base_ir = ir->increment;
1021       ir->increment->accept(this);
1022       emit(ADD(counter, src_reg(counter), this->result));
1023    }
1024
1025    emit(BRW_OPCODE_WHILE);
1026 }
1027
1028 void
1029 vec4_visitor::visit(ir_loop_jump *ir)
1030 {
1031    switch (ir->mode) {
1032    case ir_loop_jump::jump_break:
1033       emit(BRW_OPCODE_BREAK);
1034       break;
1035    case ir_loop_jump::jump_continue:
1036       emit(BRW_OPCODE_CONTINUE);
1037       break;
1038    }
1039 }
1040
1041
1042 void
1043 vec4_visitor::visit(ir_function_signature *ir)
1044 {
1045    assert(0);
1046    (void)ir;
1047 }
1048
1049 void
1050 vec4_visitor::visit(ir_function *ir)
1051 {
1052    /* Ignore function bodies other than main() -- we shouldn't see calls to
1053     * them since they should all be inlined.
1054     */
1055    if (strcmp(ir->name, "main") == 0) {
1056       const ir_function_signature *sig;
1057       exec_list empty;
1058
1059       sig = ir->matching_signature(NULL, &empty);
1060
1061       assert(sig);
1062
1063       visit_instructions(&sig->body);
1064    }
1065 }
1066
1067 bool
1068 vec4_visitor::try_emit_sat(ir_expression *ir)
1069 {
1070    ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1071    if (!sat_src)
1072       return false;
1073
1074    sat_src->accept(this);
1075    src_reg src = this->result;
1076
1077    this->result = src_reg(this, ir->type);
1078    vec4_instruction *inst;
1079    inst = emit(MOV(dst_reg(this->result), src));
1080    inst->saturate = true;
1081
1082    return true;
1083 }
1084
1085 bool
1086 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1087 {
1088    /* 3-src instructions were introduced in gen6. */
1089    if (brw->gen < 6)
1090       return false;
1091
1092    /* MAD can only handle floating-point data. */
1093    if (ir->type->base_type != GLSL_TYPE_FLOAT)
1094       return false;
1095
1096    ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1097    ir_expression *mul = ir->operands[mul_arg]->as_expression();
1098
1099    if (!mul || mul->operation != ir_binop_mul)
1100       return false;
1101
1102    nonmul->accept(this);
1103    src_reg src0 = fix_3src_operand(this->result);
1104
1105    mul->operands[0]->accept(this);
1106    src_reg src1 = fix_3src_operand(this->result);
1107
1108    mul->operands[1]->accept(this);
1109    src_reg src2 = fix_3src_operand(this->result);
1110
1111    this->result = src_reg(this, ir->type);
1112    emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1113
1114    return true;
1115 }
1116
1117 void
1118 vec4_visitor::emit_bool_comparison(unsigned int op,
1119                                  dst_reg dst, src_reg src0, src_reg src1)
1120 {
1121    /* original gen4 does destination conversion before comparison. */
1122    if (brw->gen < 5)
1123       dst.type = src0.type;
1124
1125    emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1126
1127    dst.type = BRW_REGISTER_TYPE_D;
1128    emit(AND(dst, src_reg(dst), src_reg(0x1)));
1129 }
1130
1131 void
1132 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1133                           src_reg src0, src_reg src1)
1134 {
1135    vec4_instruction *inst;
1136
1137    if (brw->gen >= 6) {
1138       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1139       inst->conditional_mod = conditionalmod;
1140    } else {
1141       emit(CMP(dst, src0, src1, conditionalmod));
1142
1143       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1144       inst->predicate = BRW_PREDICATE_NORMAL;
1145    }
1146 }
1147
1148 static bool
1149 is_16bit_constant(ir_rvalue *rvalue)
1150 {
1151    ir_constant *constant = rvalue->as_constant();
1152    if (!constant)
1153       return false;
1154
1155    if (constant->type != glsl_type::int_type &&
1156        constant->type != glsl_type::uint_type)
1157       return false;
1158
1159    return constant->value.u[0] < (1 << 16);
1160 }
1161
1162 void
1163 vec4_visitor::visit(ir_expression *ir)
1164 {
1165    unsigned int operand;
1166    src_reg op[Elements(ir->operands)];
1167    src_reg result_src;
1168    dst_reg result_dst;
1169    vec4_instruction *inst;
1170
1171    if (try_emit_sat(ir))
1172       return;
1173
1174    if (ir->operation == ir_binop_add) {
1175       if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1176          return;
1177    }
1178
1179    for (operand = 0; operand < ir->get_num_operands(); operand++) {
1180       this->result.file = BAD_FILE;
1181       ir->operands[operand]->accept(this);
1182       if (this->result.file == BAD_FILE) {
1183          printf("Failed to get tree for expression operand:\n");
1184          ir->operands[operand]->print();
1185          exit(1);
1186       }
1187       op[operand] = this->result;
1188
1189       /* Matrix expression operands should have been broken down to vector
1190        * operations already.
1191        */
1192       assert(!ir->operands[operand]->type->is_matrix());
1193    }
1194
1195    int vector_elements = ir->operands[0]->type->vector_elements;
1196    if (ir->operands[1]) {
1197       vector_elements = MAX2(vector_elements,
1198                              ir->operands[1]->type->vector_elements);
1199    }
1200
1201    this->result.file = BAD_FILE;
1202
1203    /* Storage for our result.  Ideally for an assignment we'd be using
1204     * the actual storage for the result here, instead.
1205     */
1206    result_src = src_reg(this, ir->type);
1207    /* convenience for the emit functions below. */
1208    result_dst = dst_reg(result_src);
1209    /* If nothing special happens, this is the result. */
1210    this->result = result_src;
1211    /* Limit writes to the channels that will be used by result_src later.
1212     * This does limit this temp's use as a temporary for multi-instruction
1213     * sequences.
1214     */
1215    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1216
1217    switch (ir->operation) {
1218    case ir_unop_logic_not:
1219       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1220        * ones complement of the whole register, not just bit 0.
1221        */
1222       emit(XOR(result_dst, op[0], src_reg(1)));
1223       break;
1224    case ir_unop_neg:
1225       op[0].negate = !op[0].negate;
1226       emit(MOV(result_dst, op[0]));
1227       break;
1228    case ir_unop_abs:
1229       op[0].abs = true;
1230       op[0].negate = false;
1231       emit(MOV(result_dst, op[0]));
1232       break;
1233
1234    case ir_unop_sign:
1235       emit(MOV(result_dst, src_reg(0.0f)));
1236
1237       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1238       inst = emit(MOV(result_dst, src_reg(1.0f)));
1239       inst->predicate = BRW_PREDICATE_NORMAL;
1240
1241       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1242       inst = emit(MOV(result_dst, src_reg(-1.0f)));
1243       inst->predicate = BRW_PREDICATE_NORMAL;
1244
1245       break;
1246
1247    case ir_unop_rcp:
1248       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1249       break;
1250
1251    case ir_unop_exp2:
1252       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1253       break;
1254    case ir_unop_log2:
1255       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1256       break;
1257    case ir_unop_exp:
1258    case ir_unop_log:
1259       assert(!"not reached: should be handled by ir_explog_to_explog2");
1260       break;
1261    case ir_unop_sin:
1262    case ir_unop_sin_reduced:
1263       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1264       break;
1265    case ir_unop_cos:
1266    case ir_unop_cos_reduced:
1267       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1268       break;
1269
1270    case ir_unop_dFdx:
1271    case ir_unop_dFdy:
1272       assert(!"derivatives not valid in vertex shader");
1273       break;
1274
1275    case ir_unop_bitfield_reverse:
1276       emit(BFREV(result_dst, op[0]));
1277       break;
1278    case ir_unop_bit_count:
1279       emit(CBIT(result_dst, op[0]));
1280       break;
1281    case ir_unop_find_msb: {
1282       src_reg temp = src_reg(this, glsl_type::uint_type);
1283
1284       inst = emit(FBH(dst_reg(temp), op[0]));
1285       inst->dst.writemask = WRITEMASK_XYZW;
1286
1287       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1288        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1289        * subtract the result from 31 to convert the MSB count into an LSB count.
1290        */
1291
1292       /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1293       temp.swizzle = BRW_SWIZZLE_NOOP;
1294       emit(MOV(result_dst, temp));
1295
1296       src_reg src_tmp = src_reg(result_dst);
1297       emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1298
1299       src_tmp.negate = true;
1300       inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1301       inst->predicate = BRW_PREDICATE_NORMAL;
1302       break;
1303    }
1304    case ir_unop_find_lsb:
1305       emit(FBL(result_dst, op[0]));
1306       break;
1307
1308    case ir_unop_noise:
1309       assert(!"not reached: should be handled by lower_noise");
1310       break;
1311
1312    case ir_binop_add:
1313       emit(ADD(result_dst, op[0], op[1]));
1314       break;
1315    case ir_binop_sub:
1316       assert(!"not reached: should be handled by ir_sub_to_add_neg");
1317       break;
1318
1319    case ir_binop_mul:
1320       if (ir->type->is_integer()) {
1321          /* For integer multiplication, the MUL uses the low 16 bits of one of
1322           * the operands (src0 through SNB, src1 on IVB and later).  The MACH
1323           * accumulates in the contribution of the upper 16 bits of that
1324           * operand.  If we can determine that one of the args is in the low
1325           * 16 bits, though, we can just emit a single MUL.
1326           */
1327          if (is_16bit_constant(ir->operands[0])) {
1328             if (brw->gen < 7)
1329                emit(MUL(result_dst, op[0], op[1]));
1330             else
1331                emit(MUL(result_dst, op[1], op[0]));
1332          } else if (is_16bit_constant(ir->operands[1])) {
1333             if (brw->gen < 7)
1334                emit(MUL(result_dst, op[1], op[0]));
1335             else
1336                emit(MUL(result_dst, op[0], op[1]));
1337          } else {
1338             struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1339
1340             emit(MUL(acc, op[0], op[1]));
1341             emit(MACH(dst_null_d(), op[0], op[1]));
1342             emit(MOV(result_dst, src_reg(acc)));
1343          }
1344       } else {
1345          emit(MUL(result_dst, op[0], op[1]));
1346       }
1347       break;
1348    case ir_binop_div:
1349       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1350       assert(ir->type->is_integer());
1351       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1352       break;
1353    case ir_binop_mod:
1354       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1355       assert(ir->type->is_integer());
1356       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1357       break;
1358
1359    case ir_binop_less:
1360    case ir_binop_greater:
1361    case ir_binop_lequal:
1362    case ir_binop_gequal:
1363    case ir_binop_equal:
1364    case ir_binop_nequal: {
1365       emit(CMP(result_dst, op[0], op[1],
1366                brw_conditional_for_comparison(ir->operation)));
1367       emit(AND(result_dst, result_src, src_reg(0x1)));
1368       break;
1369    }
1370
1371    case ir_binop_all_equal:
1372       /* "==" operator producing a scalar boolean. */
1373       if (ir->operands[0]->type->is_vector() ||
1374           ir->operands[1]->type->is_vector()) {
1375          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1376          emit(MOV(result_dst, src_reg(0)));
1377          inst = emit(MOV(result_dst, src_reg(1)));
1378          inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1379       } else {
1380          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1381          emit(AND(result_dst, result_src, src_reg(0x1)));
1382       }
1383       break;
1384    case ir_binop_any_nequal:
1385       /* "!=" operator producing a scalar boolean. */
1386       if (ir->operands[0]->type->is_vector() ||
1387           ir->operands[1]->type->is_vector()) {
1388          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1389
1390          emit(MOV(result_dst, src_reg(0)));
1391          inst = emit(MOV(result_dst, src_reg(1)));
1392          inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1393       } else {
1394          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1395          emit(AND(result_dst, result_src, src_reg(0x1)));
1396       }
1397       break;
1398
1399    case ir_unop_any:
1400       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1401       emit(MOV(result_dst, src_reg(0)));
1402
1403       inst = emit(MOV(result_dst, src_reg(1)));
1404       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1405       break;
1406
1407    case ir_binop_logic_xor:
1408       emit(XOR(result_dst, op[0], op[1]));
1409       break;
1410
1411    case ir_binop_logic_or:
1412       emit(OR(result_dst, op[0], op[1]));
1413       break;
1414
1415    case ir_binop_logic_and:
1416       emit(AND(result_dst, op[0], op[1]));
1417       break;
1418
1419    case ir_binop_dot:
1420       assert(ir->operands[0]->type->is_vector());
1421       assert(ir->operands[0]->type == ir->operands[1]->type);
1422       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1423       break;
1424
1425    case ir_unop_sqrt:
1426       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1427       break;
1428    case ir_unop_rsq:
1429       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1430       break;
1431
1432    case ir_unop_bitcast_i2f:
1433    case ir_unop_bitcast_u2f:
1434       this->result = op[0];
1435       this->result.type = BRW_REGISTER_TYPE_F;
1436       break;
1437
1438    case ir_unop_bitcast_f2i:
1439       this->result = op[0];
1440       this->result.type = BRW_REGISTER_TYPE_D;
1441       break;
1442
1443    case ir_unop_bitcast_f2u:
1444       this->result = op[0];
1445       this->result.type = BRW_REGISTER_TYPE_UD;
1446       break;
1447
1448    case ir_unop_i2f:
1449    case ir_unop_i2u:
1450    case ir_unop_u2i:
1451    case ir_unop_u2f:
1452    case ir_unop_b2f:
1453    case ir_unop_b2i:
1454    case ir_unop_f2i:
1455    case ir_unop_f2u:
1456       emit(MOV(result_dst, op[0]));
1457       break;
1458    case ir_unop_f2b:
1459    case ir_unop_i2b: {
1460       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1461       emit(AND(result_dst, result_src, src_reg(1)));
1462       break;
1463    }
1464
1465    case ir_unop_trunc:
1466       emit(RNDZ(result_dst, op[0]));
1467       break;
1468    case ir_unop_ceil:
1469       op[0].negate = !op[0].negate;
1470       inst = emit(RNDD(result_dst, op[0]));
1471       this->result.negate = true;
1472       break;
1473    case ir_unop_floor:
1474       inst = emit(RNDD(result_dst, op[0]));
1475       break;
1476    case ir_unop_fract:
1477       inst = emit(FRC(result_dst, op[0]));
1478       break;
1479    case ir_unop_round_even:
1480       emit(RNDE(result_dst, op[0]));
1481       break;
1482
1483    case ir_binop_min:
1484       emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1485       break;
1486    case ir_binop_max:
1487       emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1488       break;
1489
1490    case ir_binop_pow:
1491       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1492       break;
1493
1494    case ir_unop_bit_not:
1495       inst = emit(NOT(result_dst, op[0]));
1496       break;
1497    case ir_binop_bit_and:
1498       inst = emit(AND(result_dst, op[0], op[1]));
1499       break;
1500    case ir_binop_bit_xor:
1501       inst = emit(XOR(result_dst, op[0], op[1]));
1502       break;
1503    case ir_binop_bit_or:
1504       inst = emit(OR(result_dst, op[0], op[1]));
1505       break;
1506
1507    case ir_binop_lshift:
1508       inst = emit(SHL(result_dst, op[0], op[1]));
1509       break;
1510
1511    case ir_binop_rshift:
1512       if (ir->type->base_type == GLSL_TYPE_INT)
1513          inst = emit(ASR(result_dst, op[0], op[1]));
1514       else
1515          inst = emit(SHR(result_dst, op[0], op[1]));
1516       break;
1517
1518    case ir_binop_bfm:
1519       emit(BFI1(result_dst, op[0], op[1]));
1520       break;
1521
1522    case ir_binop_ubo_load: {
1523       ir_constant *uniform_block = ir->operands[0]->as_constant();
1524       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1525       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1526       src_reg offset = op[1];
1527
1528       /* Now, load the vector from that offset. */
1529       assert(ir->type->is_vector() || ir->type->is_scalar());
1530
1531       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1532       packed_consts.type = result.type;
1533       src_reg surf_index =
1534          src_reg(SURF_INDEX_VEC4_UBO(uniform_block->value.u[0]));
1535       if (const_offset_ir) {
1536          offset = src_reg(const_offset / 16);
1537       } else {
1538          emit(SHR(dst_reg(offset), offset, src_reg(4)));
1539       }
1540
1541       vec4_instruction *pull =
1542          emit(new(mem_ctx) vec4_instruction(this,
1543                                             VS_OPCODE_PULL_CONSTANT_LOAD,
1544                                             dst_reg(packed_consts),
1545                                             surf_index,
1546                                             offset));
1547       pull->base_mrf = 14;
1548       pull->mlen = 1;
1549
1550       packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1551       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1552                                             const_offset % 16 / 4,
1553                                             const_offset % 16 / 4,
1554                                             const_offset % 16 / 4);
1555
1556       /* UBO bools are any nonzero int.  We store bools as either 0 or 1. */
1557       if (ir->type->base_type == GLSL_TYPE_BOOL) {
1558          emit(CMP(result_dst, packed_consts, src_reg(0u),
1559                   BRW_CONDITIONAL_NZ));
1560          emit(AND(result_dst, result, src_reg(0x1)));
1561       } else {
1562          emit(MOV(result_dst, packed_consts));
1563       }
1564       break;
1565    }
1566
1567    case ir_binop_vector_extract:
1568       assert(!"should have been lowered by vec_index_to_cond_assign");
1569       break;
1570
1571    case ir_triop_fma:
1572       op[0] = fix_3src_operand(op[0]);
1573       op[1] = fix_3src_operand(op[1]);
1574       op[2] = fix_3src_operand(op[2]);
1575       /* Note that the instruction's argument order is reversed from GLSL
1576        * and the IR.
1577        */
1578       emit(MAD(result_dst, op[2], op[1], op[0]));
1579       break;
1580
1581    case ir_triop_lrp:
1582       op[0] = fix_3src_operand(op[0]);
1583       op[1] = fix_3src_operand(op[1]);
1584       op[2] = fix_3src_operand(op[2]);
1585       /* Note that the instruction's argument order is reversed from GLSL
1586        * and the IR.
1587        */
1588       emit(LRP(result_dst, op[2], op[1], op[0]));
1589       break;
1590
1591    case ir_triop_bfi:
1592       op[0] = fix_3src_operand(op[0]);
1593       op[1] = fix_3src_operand(op[1]);
1594       op[2] = fix_3src_operand(op[2]);
1595       emit(BFI2(result_dst, op[0], op[1], op[2]));
1596       break;
1597
1598    case ir_triop_bitfield_extract:
1599       op[0] = fix_3src_operand(op[0]);
1600       op[1] = fix_3src_operand(op[1]);
1601       op[2] = fix_3src_operand(op[2]);
1602       /* Note that the instruction's argument order is reversed from GLSL
1603        * and the IR.
1604        */
1605       emit(BFE(result_dst, op[2], op[1], op[0]));
1606       break;
1607
1608    case ir_triop_vector_insert:
1609       assert(!"should have been lowered by lower_vector_insert");
1610       break;
1611
1612    case ir_quadop_bitfield_insert:
1613       assert(!"not reached: should be handled by "
1614               "bitfield_insert_to_bfm_bfi\n");
1615       break;
1616
1617    case ir_quadop_vector:
1618       assert(!"not reached: should be handled by lower_quadop_vector");
1619       break;
1620
1621    case ir_unop_pack_half_2x16:
1622       emit_pack_half_2x16(result_dst, op[0]);
1623       break;
1624    case ir_unop_unpack_half_2x16:
1625       emit_unpack_half_2x16(result_dst, op[0]);
1626       break;
1627    case ir_unop_pack_snorm_2x16:
1628    case ir_unop_pack_snorm_4x8:
1629    case ir_unop_pack_unorm_2x16:
1630    case ir_unop_pack_unorm_4x8:
1631    case ir_unop_unpack_snorm_2x16:
1632    case ir_unop_unpack_snorm_4x8:
1633    case ir_unop_unpack_unorm_2x16:
1634    case ir_unop_unpack_unorm_4x8:
1635       assert(!"not reached: should be handled by lower_packing_builtins");
1636       break;
1637    case ir_unop_unpack_half_2x16_split_x:
1638    case ir_unop_unpack_half_2x16_split_y:
1639    case ir_binop_pack_half_2x16_split:
1640       assert(!"not reached: should not occur in vertex shader");
1641       break;
1642    }
1643 }
1644
1645
1646 void
1647 vec4_visitor::visit(ir_swizzle *ir)
1648 {
1649    src_reg src;
1650    int i = 0;
1651    int swizzle[4];
1652
1653    /* Note that this is only swizzles in expressions, not those on the left
1654     * hand side of an assignment, which do write masking.  See ir_assignment
1655     * for that.
1656     */
1657
1658    ir->val->accept(this);
1659    src = this->result;
1660    assert(src.file != BAD_FILE);
1661
1662    for (i = 0; i < ir->type->vector_elements; i++) {
1663       switch (i) {
1664       case 0:
1665          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1666          break;
1667       case 1:
1668          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1669          break;
1670       case 2:
1671          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1672          break;
1673       case 3:
1674          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1675             break;
1676       }
1677    }
1678    for (; i < 4; i++) {
1679       /* Replicate the last channel out. */
1680       swizzle[i] = swizzle[ir->type->vector_elements - 1];
1681    }
1682
1683    src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1684
1685    this->result = src;
1686 }
1687
1688 void
1689 vec4_visitor::visit(ir_dereference_variable *ir)
1690 {
1691    const struct glsl_type *type = ir->type;
1692    dst_reg *reg = variable_storage(ir->var);
1693
1694    if (!reg) {
1695       fail("Failed to find variable storage for %s\n", ir->var->name);
1696       this->result = src_reg(brw_null_reg());
1697       return;
1698    }
1699
1700    this->result = src_reg(*reg);
1701
1702    /* System values get their swizzle from the dst_reg writemask */
1703    if (ir->var->mode == ir_var_system_value)
1704       return;
1705
1706    if (type->is_scalar() || type->is_vector() || type->is_matrix())
1707       this->result.swizzle = swizzle_for_size(type->vector_elements);
1708 }
1709
1710
1711 int
1712 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1713 {
1714    /* Under normal circumstances array elements are stored consecutively, so
1715     * the stride is equal to the size of the array element.
1716     */
1717    return type_size(ir->type);
1718 }
1719
1720
1721 void
1722 vec4_visitor::visit(ir_dereference_array *ir)
1723 {
1724    ir_constant *constant_index;
1725    src_reg src;
1726    int array_stride = compute_array_stride(ir);
1727
1728    constant_index = ir->array_index->constant_expression_value();
1729
1730    ir->array->accept(this);
1731    src = this->result;
1732
1733    if (constant_index) {
1734       src.reg_offset += constant_index->value.i[0] * array_stride;
1735    } else {
1736       /* Variable index array dereference.  It eats the "vec4" of the
1737        * base of the array and an index that offsets the Mesa register
1738        * index.
1739        */
1740       ir->array_index->accept(this);
1741
1742       src_reg index_reg;
1743
1744       if (array_stride == 1) {
1745          index_reg = this->result;
1746       } else {
1747          index_reg = src_reg(this, glsl_type::int_type);
1748
1749          emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1750       }
1751
1752       if (src.reladdr) {
1753          src_reg temp = src_reg(this, glsl_type::int_type);
1754
1755          emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1756
1757          index_reg = temp;
1758       }
1759
1760       src.reladdr = ralloc(mem_ctx, src_reg);
1761       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1762    }
1763
1764    /* If the type is smaller than a vec4, replicate the last channel out. */
1765    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1766       src.swizzle = swizzle_for_size(ir->type->vector_elements);
1767    else
1768       src.swizzle = BRW_SWIZZLE_NOOP;
1769    src.type = brw_type_for_base_type(ir->type);
1770
1771    this->result = src;
1772 }
1773
1774 void
1775 vec4_visitor::visit(ir_dereference_record *ir)
1776 {
1777    unsigned int i;
1778    const glsl_type *struct_type = ir->record->type;
1779    int offset = 0;
1780
1781    ir->record->accept(this);
1782
1783    for (i = 0; i < struct_type->length; i++) {
1784       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1785          break;
1786       offset += type_size(struct_type->fields.structure[i].type);
1787    }
1788
1789    /* If the type is smaller than a vec4, replicate the last channel out. */
1790    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1791       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1792    else
1793       this->result.swizzle = BRW_SWIZZLE_NOOP;
1794    this->result.type = brw_type_for_base_type(ir->type);
1795
1796    this->result.reg_offset += offset;
1797 }
1798
1799 /**
1800  * We want to be careful in assignment setup to hit the actual storage
1801  * instead of potentially using a temporary like we might with the
1802  * ir_dereference handler.
1803  */
1804 static dst_reg
1805 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1806 {
1807    /* The LHS must be a dereference.  If the LHS is a variable indexed array
1808     * access of a vector, it must be separated into a series conditional moves
1809     * before reaching this point (see ir_vec_index_to_cond_assign).
1810     */
1811    assert(ir->as_dereference());
1812    ir_dereference_array *deref_array = ir->as_dereference_array();
1813    if (deref_array) {
1814       assert(!deref_array->array->type->is_vector());
1815    }
1816
1817    /* Use the rvalue deref handler for the most part.  We'll ignore
1818     * swizzles in it and write swizzles using writemask, though.
1819     */
1820    ir->accept(v);
1821    return dst_reg(v->result);
1822 }
1823
1824 void
1825 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1826                               const struct glsl_type *type, uint32_t predicate)
1827 {
1828    if (type->base_type == GLSL_TYPE_STRUCT) {
1829       for (unsigned int i = 0; i < type->length; i++) {
1830          emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1831       }
1832       return;
1833    }
1834
1835    if (type->is_array()) {
1836       for (unsigned int i = 0; i < type->length; i++) {
1837          emit_block_move(dst, src, type->fields.array, predicate);
1838       }
1839       return;
1840    }
1841
1842    if (type->is_matrix()) {
1843       const struct glsl_type *vec_type;
1844
1845       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1846                                          type->vector_elements, 1);
1847
1848       for (int i = 0; i < type->matrix_columns; i++) {
1849          emit_block_move(dst, src, vec_type, predicate);
1850       }
1851       return;
1852    }
1853
1854    assert(type->is_scalar() || type->is_vector());
1855
1856    dst->type = brw_type_for_base_type(type);
1857    src->type = dst->type;
1858
1859    dst->writemask = (1 << type->vector_elements) - 1;
1860
1861    src->swizzle = swizzle_for_size(type->vector_elements);
1862
1863    vec4_instruction *inst = emit(MOV(*dst, *src));
1864    inst->predicate = predicate;
1865
1866    dst->reg_offset++;
1867    src->reg_offset++;
1868 }
1869
1870
1871 /* If the RHS processing resulted in an instruction generating a
1872  * temporary value, and it would be easy to rewrite the instruction to
1873  * generate its result right into the LHS instead, do so.  This ends
1874  * up reliably removing instructions where it can be tricky to do so
1875  * later without real UD chain information.
1876  */
1877 bool
1878 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1879                                      dst_reg dst,
1880                                      src_reg src,
1881                                      vec4_instruction *pre_rhs_inst,
1882                                      vec4_instruction *last_rhs_inst)
1883 {
1884    /* This could be supported, but it would take more smarts. */
1885    if (ir->condition)
1886       return false;
1887
1888    if (pre_rhs_inst == last_rhs_inst)
1889       return false; /* No instructions generated to work with. */
1890
1891    /* Make sure the last instruction generated our source reg. */
1892    if (src.file != GRF ||
1893        src.file != last_rhs_inst->dst.file ||
1894        src.reg != last_rhs_inst->dst.reg ||
1895        src.reg_offset != last_rhs_inst->dst.reg_offset ||
1896        src.reladdr ||
1897        src.abs ||
1898        src.negate ||
1899        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1900       return false;
1901
1902    /* Check that that last instruction fully initialized the channels
1903     * we want to use, in the order we want to use them.  We could
1904     * potentially reswizzle the operands of many instructions so that
1905     * we could handle out of order channels, but don't yet.
1906     */
1907
1908    for (unsigned i = 0; i < 4; i++) {
1909       if (dst.writemask & (1 << i)) {
1910          if (!(last_rhs_inst->dst.writemask & (1 << i)))
1911             return false;
1912
1913          if (BRW_GET_SWZ(src.swizzle, i) != i)
1914             return false;
1915       }
1916    }
1917
1918    /* Success!  Rewrite the instruction. */
1919    last_rhs_inst->dst.file = dst.file;
1920    last_rhs_inst->dst.reg = dst.reg;
1921    last_rhs_inst->dst.reg_offset = dst.reg_offset;
1922    last_rhs_inst->dst.reladdr = dst.reladdr;
1923    last_rhs_inst->dst.writemask &= dst.writemask;
1924
1925    return true;
1926 }
1927
1928 void
1929 vec4_visitor::visit(ir_assignment *ir)
1930 {
1931    dst_reg dst = get_assignment_lhs(ir->lhs, this);
1932    uint32_t predicate = BRW_PREDICATE_NONE;
1933
1934    if (!ir->lhs->type->is_scalar() &&
1935        !ir->lhs->type->is_vector()) {
1936       ir->rhs->accept(this);
1937       src_reg src = this->result;
1938
1939       if (ir->condition) {
1940          emit_bool_to_cond_code(ir->condition, &predicate);
1941       }
1942
1943       /* emit_block_move doesn't account for swizzles in the source register.
1944        * This should be ok, since the source register is a structure or an
1945        * array, and those can't be swizzled.  But double-check to be sure.
1946        */
1947       assert(src.swizzle ==
1948              (ir->rhs->type->is_matrix()
1949               ? swizzle_for_size(ir->rhs->type->vector_elements)
1950               : BRW_SWIZZLE_NOOP));
1951
1952       emit_block_move(&dst, &src, ir->rhs->type, predicate);
1953       return;
1954    }
1955
1956    /* Now we're down to just a scalar/vector with writemasks. */
1957    int i;
1958
1959    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1960    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1961
1962    ir->rhs->accept(this);
1963
1964    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1965
1966    src_reg src = this->result;
1967
1968    int swizzles[4];
1969    int first_enabled_chan = 0;
1970    int src_chan = 0;
1971
1972    assert(ir->lhs->type->is_vector() ||
1973           ir->lhs->type->is_scalar());
1974    dst.writemask = ir->write_mask;
1975
1976    for (int i = 0; i < 4; i++) {
1977       if (dst.writemask & (1 << i)) {
1978          first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1979          break;
1980       }
1981    }
1982
1983    /* Swizzle a small RHS vector into the channels being written.
1984     *
1985     * glsl ir treats write_mask as dictating how many channels are
1986     * present on the RHS while in our instructions we need to make
1987     * those channels appear in the slots of the vec4 they're written to.
1988     */
1989    for (int i = 0; i < 4; i++) {
1990       if (dst.writemask & (1 << i))
1991          swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1992       else
1993          swizzles[i] = first_enabled_chan;
1994    }
1995    src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1996                               swizzles[2], swizzles[3]);
1997
1998    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1999       return;
2000    }
2001
2002    if (ir->condition) {
2003       emit_bool_to_cond_code(ir->condition, &predicate);
2004    }
2005
2006    for (i = 0; i < type_size(ir->lhs->type); i++) {
2007       vec4_instruction *inst = emit(MOV(dst, src));
2008       inst->predicate = predicate;
2009
2010       dst.reg_offset++;
2011       src.reg_offset++;
2012    }
2013 }
2014
2015 void
2016 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2017 {
2018    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2019       foreach_list(node, &ir->components) {
2020          ir_constant *field_value = (ir_constant *)node;
2021
2022          emit_constant_values(dst, field_value);
2023       }
2024       return;
2025    }
2026
2027    if (ir->type->is_array()) {
2028       for (unsigned int i = 0; i < ir->type->length; i++) {
2029          emit_constant_values(dst, ir->array_elements[i]);
2030       }
2031       return;
2032    }
2033
2034    if (ir->type->is_matrix()) {
2035       for (int i = 0; i < ir->type->matrix_columns; i++) {
2036          float *vec = &ir->value.f[i * ir->type->vector_elements];
2037
2038          for (int j = 0; j < ir->type->vector_elements; j++) {
2039             dst->writemask = 1 << j;
2040             dst->type = BRW_REGISTER_TYPE_F;
2041
2042             emit(MOV(*dst, src_reg(vec[j])));
2043          }
2044          dst->reg_offset++;
2045       }
2046       return;
2047    }
2048
2049    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2050
2051    for (int i = 0; i < ir->type->vector_elements; i++) {
2052       if (!(remaining_writemask & (1 << i)))
2053          continue;
2054
2055       dst->writemask = 1 << i;
2056       dst->type = brw_type_for_base_type(ir->type);
2057
2058       /* Find other components that match the one we're about to
2059        * write.  Emits fewer instructions for things like vec4(0.5,
2060        * 1.5, 1.5, 1.5).
2061        */
2062       for (int j = i + 1; j < ir->type->vector_elements; j++) {
2063          if (ir->type->base_type == GLSL_TYPE_BOOL) {
2064             if (ir->value.b[i] == ir->value.b[j])
2065                dst->writemask |= (1 << j);
2066          } else {
2067             /* u, i, and f storage all line up, so no need for a
2068              * switch case for comparing each type.
2069              */
2070             if (ir->value.u[i] == ir->value.u[j])
2071                dst->writemask |= (1 << j);
2072          }
2073       }
2074
2075       switch (ir->type->base_type) {
2076       case GLSL_TYPE_FLOAT:
2077          emit(MOV(*dst, src_reg(ir->value.f[i])));
2078          break;
2079       case GLSL_TYPE_INT:
2080          emit(MOV(*dst, src_reg(ir->value.i[i])));
2081          break;
2082       case GLSL_TYPE_UINT:
2083          emit(MOV(*dst, src_reg(ir->value.u[i])));
2084          break;
2085       case GLSL_TYPE_BOOL:
2086          emit(MOV(*dst, src_reg(ir->value.b[i])));
2087          break;
2088       default:
2089          assert(!"Non-float/uint/int/bool constant");
2090          break;
2091       }
2092
2093       remaining_writemask &= ~dst->writemask;
2094    }
2095    dst->reg_offset++;
2096 }
2097
2098 void
2099 vec4_visitor::visit(ir_constant *ir)
2100 {
2101    dst_reg dst = dst_reg(this, ir->type);
2102    this->result = src_reg(dst);
2103
2104    emit_constant_values(&dst, ir);
2105 }
2106
2107 void
2108 vec4_visitor::visit(ir_call *ir)
2109 {
2110    assert(!"not reached");
2111 }
2112
2113 void
2114 vec4_visitor::visit(ir_texture *ir)
2115 {
2116    int sampler =
2117       _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2118
2119    /* Should be lowered by do_lower_texture_projection */
2120    assert(!ir->projector);
2121
2122    /* Generate code to compute all the subexpression trees.  This has to be
2123     * done before loading any values into MRFs for the sampler message since
2124     * generating these values may involve SEND messages that need the MRFs.
2125     */
2126    src_reg coordinate;
2127    if (ir->coordinate) {
2128       ir->coordinate->accept(this);
2129       coordinate = this->result;
2130    }
2131
2132    src_reg shadow_comparitor;
2133    if (ir->shadow_comparitor) {
2134       ir->shadow_comparitor->accept(this);
2135       shadow_comparitor = this->result;
2136    }
2137
2138    const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2139    src_reg lod, dPdx, dPdy, sample_index;
2140    switch (ir->op) {
2141    case ir_tex:
2142       lod = src_reg(0.0f);
2143       lod_type = glsl_type::float_type;
2144       break;
2145    case ir_txf:
2146    case ir_txl:
2147    case ir_txs:
2148       ir->lod_info.lod->accept(this);
2149       lod = this->result;
2150       lod_type = ir->lod_info.lod->type;
2151       break;
2152    case ir_txf_ms:
2153       ir->lod_info.sample_index->accept(this);
2154       sample_index = this->result;
2155       sample_index_type = ir->lod_info.sample_index->type;
2156       break;
2157    case ir_txd:
2158       ir->lod_info.grad.dPdx->accept(this);
2159       dPdx = this->result;
2160
2161       ir->lod_info.grad.dPdy->accept(this);
2162       dPdy = this->result;
2163
2164       lod_type = ir->lod_info.grad.dPdx->type;
2165       break;
2166    case ir_txb:
2167    case ir_lod:
2168       break;
2169    }
2170
2171    vec4_instruction *inst = NULL;
2172    switch (ir->op) {
2173    case ir_tex:
2174    case ir_txl:
2175       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2176       break;
2177    case ir_txd:
2178       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2179       break;
2180    case ir_txf:
2181       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2182       break;
2183    case ir_txf_ms:
2184       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2185       break;
2186    case ir_txs:
2187       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2188       break;
2189    case ir_txb:
2190       assert(!"TXB is not valid for vertex shaders.");
2191       break;
2192    case ir_lod:
2193       assert(!"LOD is not valid for vertex shaders.");
2194       break;
2195    }
2196
2197    bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2198
2199    /* Texel offsets go in the message header; Gen4 also requires headers. */
2200    inst->header_present = use_texture_offset || brw->gen < 5;
2201    inst->base_mrf = 2;
2202    inst->mlen = inst->header_present + 1; /* always at least one */
2203    inst->sampler = sampler;
2204    inst->dst = dst_reg(this, ir->type);
2205    inst->dst.writemask = WRITEMASK_XYZW;
2206    inst->shadow_compare = ir->shadow_comparitor != NULL;
2207
2208    if (use_texture_offset)
2209       inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2210
2211    /* MRF for the first parameter */
2212    int param_base = inst->base_mrf + inst->header_present;
2213
2214    if (ir->op == ir_txs) {
2215       int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2216       emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2217    } else {
2218       int i, coord_mask = 0, zero_mask = 0;
2219       /* Load the coordinate */
2220       /* FINISHME: gl_clamp_mask and saturate */
2221       for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2222          coord_mask |= (1 << i);
2223       for (; i < 4; i++)
2224          zero_mask |= (1 << i);
2225
2226       if (ir->offset && ir->op == ir_txf) {
2227          /* It appears that the ld instruction used for txf does its
2228           * address bounds check before adding in the offset.  To work
2229           * around this, just add the integer offset to the integer
2230           * texel coordinate, and don't put the offset in the header.
2231           */
2232          ir_constant *offset = ir->offset->as_constant();
2233          assert(offset);
2234
2235          for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2236             src_reg src = coordinate;
2237             src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2238                                        BRW_GET_SWZ(src.swizzle, j),
2239                                        BRW_GET_SWZ(src.swizzle, j),
2240                                        BRW_GET_SWZ(src.swizzle, j));
2241             emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2242                      src, offset->value.i[j]));
2243          }
2244       } else {
2245          emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2246                   coordinate));
2247       }
2248       if (zero_mask != 0) {
2249          emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2250                   src_reg(0)));
2251       }
2252       /* Load the shadow comparitor */
2253       if (ir->shadow_comparitor && ir->op != ir_txd) {
2254          emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2255                           WRITEMASK_X),
2256                   shadow_comparitor));
2257          inst->mlen++;
2258       }
2259
2260       /* Load the LOD info */
2261       if (ir->op == ir_tex || ir->op == ir_txl) {
2262          int mrf, writemask;
2263          if (brw->gen >= 5) {
2264             mrf = param_base + 1;
2265             if (ir->shadow_comparitor) {
2266                writemask = WRITEMASK_Y;
2267                /* mlen already incremented */
2268             } else {
2269                writemask = WRITEMASK_X;
2270                inst->mlen++;
2271             }
2272          } else /* brw->gen == 4 */ {
2273             mrf = param_base;
2274             writemask = WRITEMASK_W;
2275          }
2276          emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2277       } else if (ir->op == ir_txf) {
2278          emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2279       } else if (ir->op == ir_txf_ms) {
2280          emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2281                   sample_index));
2282          inst->mlen++;
2283
2284          /* on Gen7, there is an additional MCS parameter here after SI,
2285           * but we don't bother to emit it since it's always zero. If
2286           * we start supporting texturing from CMS surfaces, this will have
2287           * to change
2288           */
2289       } else if (ir->op == ir_txd) {
2290          const glsl_type *type = lod_type;
2291
2292          if (brw->gen >= 5) {
2293             dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2294             dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2295             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2296             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2297             inst->mlen++;
2298
2299             if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2300                dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2301                dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2302                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2303                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2304                inst->mlen++;
2305
2306                if (ir->shadow_comparitor) {
2307                   emit(MOV(dst_reg(MRF, param_base + 2,
2308                                    ir->shadow_comparitor->type, WRITEMASK_Z),
2309                            shadow_comparitor));
2310                }
2311             }
2312          } else /* brw->gen == 4 */ {
2313             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2314             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2315             inst->mlen += 2;
2316          }
2317       }
2318    }
2319
2320    emit(inst);
2321
2322    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2323     * spec requires layers.
2324     */
2325    if (ir->op == ir_txs) {
2326       glsl_type const *type = ir->sampler->type;
2327       if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2328           type->sampler_array) {
2329          emit_math(SHADER_OPCODE_INT_QUOTIENT,
2330                    with_writemask(inst->dst, WRITEMASK_Z),
2331                    src_reg(inst->dst), src_reg(6));
2332       }
2333    }
2334
2335    swizzle_result(ir, src_reg(inst->dst), sampler);
2336 }
2337
2338 void
2339 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2340 {
2341    int s = key->tex.swizzles[sampler];
2342
2343    this->result = src_reg(this, ir->type);
2344    dst_reg swizzled_result(this->result);
2345
2346    if (ir->op == ir_txs || ir->type == glsl_type::float_type
2347                         || s == SWIZZLE_NOOP) {
2348       emit(MOV(swizzled_result, orig_val));
2349       return;
2350    }
2351
2352    int zero_mask = 0, one_mask = 0, copy_mask = 0;
2353    int swizzle[4] = {0};
2354
2355    for (int i = 0; i < 4; i++) {
2356       switch (GET_SWZ(s, i)) {
2357       case SWIZZLE_ZERO:
2358          zero_mask |= (1 << i);
2359          break;
2360       case SWIZZLE_ONE:
2361          one_mask |= (1 << i);
2362          break;
2363       default:
2364          copy_mask |= (1 << i);
2365          swizzle[i] = GET_SWZ(s, i);
2366          break;
2367       }
2368    }
2369
2370    if (copy_mask) {
2371       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2372       swizzled_result.writemask = copy_mask;
2373       emit(MOV(swizzled_result, orig_val));
2374    }
2375
2376    if (zero_mask) {
2377       swizzled_result.writemask = zero_mask;
2378       emit(MOV(swizzled_result, src_reg(0.0f)));
2379    }
2380
2381    if (one_mask) {
2382       swizzled_result.writemask = one_mask;
2383       emit(MOV(swizzled_result, src_reg(1.0f)));
2384    }
2385 }
2386
2387 void
2388 vec4_visitor::visit(ir_return *ir)
2389 {
2390    assert(!"not reached");
2391 }
2392
2393 void
2394 vec4_visitor::visit(ir_discard *ir)
2395 {
2396    assert(!"not reached");
2397 }
2398
2399 void
2400 vec4_visitor::visit(ir_if *ir)
2401 {
2402    /* Don't point the annotation at the if statement, because then it plus
2403     * the then and else blocks get printed.
2404     */
2405    this->base_ir = ir->condition;
2406
2407    if (brw->gen == 6) {
2408       emit_if_gen6(ir);
2409    } else {
2410       uint32_t predicate;
2411       emit_bool_to_cond_code(ir->condition, &predicate);
2412       emit(IF(predicate));
2413    }
2414
2415    visit_instructions(&ir->then_instructions);
2416
2417    if (!ir->else_instructions.is_empty()) {
2418       this->base_ir = ir->condition;
2419       emit(BRW_OPCODE_ELSE);
2420
2421       visit_instructions(&ir->else_instructions);
2422    }
2423
2424    this->base_ir = ir->condition;
2425    emit(BRW_OPCODE_ENDIF);
2426 }
2427
2428 void
2429 vec4_visitor::visit(ir_emit_vertex *)
2430 {
2431    assert(!"not reached");
2432 }
2433
2434 void
2435 vec4_visitor::visit(ir_end_primitive *)
2436 {
2437    assert(!"not reached");
2438 }
2439
2440 void
2441 vec4_visitor::emit_ndc_computation()
2442 {
2443    /* Get the position */
2444    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2445
2446    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2447    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2448    output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2449
2450    current_annotation = "NDC";
2451    dst_reg ndc_w = ndc;
2452    ndc_w.writemask = WRITEMASK_W;
2453    src_reg pos_w = pos;
2454    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2455    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2456
2457    dst_reg ndc_xyz = ndc;
2458    ndc_xyz.writemask = WRITEMASK_XYZ;
2459
2460    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2461 }
2462
2463 void
2464 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2465 {
2466    if (brw->gen < 6 &&
2467        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2468         key->userclip_active || brw->has_negative_rhw_bug)) {
2469       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2470       dst_reg header1_w = header1;
2471       header1_w.writemask = WRITEMASK_W;
2472
2473       emit(MOV(header1, 0u));
2474
2475       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2476          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2477
2478          current_annotation = "Point size";
2479          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2480          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2481       }
2482
2483       if (key->userclip_active) {
2484          current_annotation = "Clipping flags";
2485          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2486          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2487
2488          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2489          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2490          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2491
2492          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2493          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2494          emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2495          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2496       }
2497
2498       /* i965 clipping workaround:
2499        * 1) Test for -ve rhw
2500        * 2) If set,
2501        *      set ndc = (0,0,0,0)
2502        *      set ucp[6] = 1
2503        *
2504        * Later, clipping will detect ucp[6] and ensure the primitive is
2505        * clipped against all fixed planes.
2506        */
2507       if (brw->has_negative_rhw_bug) {
2508          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2509          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2510          emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2511          vec4_instruction *inst;
2512          inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2513          inst->predicate = BRW_PREDICATE_NORMAL;
2514          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2515          inst->predicate = BRW_PREDICATE_NORMAL;
2516       }
2517
2518       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2519    } else if (brw->gen < 6) {
2520       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2521    } else {
2522       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2523       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2524          emit(MOV(brw_writemask(reg, WRITEMASK_W),
2525                   src_reg(output_reg[VARYING_SLOT_PSIZ])));
2526       }
2527       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2528          emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2529                   src_reg(output_reg[VARYING_SLOT_LAYER])));
2530       }
2531    }
2532 }
2533
2534 void
2535 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2536 {
2537    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2538     *
2539     *     "If a linked set of shaders forming the vertex stage contains no
2540     *     static write to gl_ClipVertex or gl_ClipDistance, but the
2541     *     application has requested clipping against user clip planes through
2542     *     the API, then the coordinate written to gl_Position is used for
2543     *     comparison against the user clip planes."
2544     *
2545     * This function is only called if the shader didn't write to
2546     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
2547     * if the user wrote to it; otherwise we use gl_Position.
2548     */
2549    gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2550    if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2551       clip_vertex = VARYING_SLOT_POS;
2552    }
2553
2554    for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2555         ++i) {
2556       reg.writemask = 1 << i;
2557       emit(DP4(reg,
2558                src_reg(output_reg[clip_vertex]),
2559                src_reg(this->userplane[i + offset])));
2560    }
2561 }
2562
2563 void
2564 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2565 {
2566    assert (varying < VARYING_SLOT_MAX);
2567    reg.type = output_reg[varying].type;
2568    current_annotation = output_reg_annotation[varying];
2569    /* Copy the register, saturating if necessary */
2570    vec4_instruction *inst = emit(MOV(reg,
2571                                      src_reg(output_reg[varying])));
2572    if ((varying == VARYING_SLOT_COL0 ||
2573         varying == VARYING_SLOT_COL1 ||
2574         varying == VARYING_SLOT_BFC0 ||
2575         varying == VARYING_SLOT_BFC1) &&
2576        key->clamp_vertex_color) {
2577       inst->saturate = true;
2578    }
2579 }
2580
2581 void
2582 vec4_visitor::emit_urb_slot(int mrf, int varying)
2583 {
2584    struct brw_reg hw_reg = brw_message_reg(mrf);
2585    dst_reg reg = dst_reg(MRF, mrf);
2586    reg.type = BRW_REGISTER_TYPE_F;
2587
2588    switch (varying) {
2589    case VARYING_SLOT_PSIZ:
2590       /* PSIZ is always in slot 0, and is coupled with other flags. */
2591       current_annotation = "indices, point width, clip flags";
2592       emit_psiz_and_flags(hw_reg);
2593       break;
2594    case BRW_VARYING_SLOT_NDC:
2595       current_annotation = "NDC";
2596       emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2597       break;
2598    case VARYING_SLOT_POS:
2599       current_annotation = "gl_Position";
2600       emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2601       break;
2602    case VARYING_SLOT_EDGE:
2603       /* This is present when doing unfilled polygons.  We're supposed to copy
2604        * the edge flag from the user-provided vertex array
2605        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2606        * of that attribute (starts as 1.0f).  This is then used in clipping to
2607        * determine which edges should be drawn as wireframe.
2608        */
2609       current_annotation = "edge flag";
2610       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2611                                     glsl_type::float_type, WRITEMASK_XYZW))));
2612       break;
2613    case BRW_VARYING_SLOT_PAD:
2614       /* No need to write to this slot */
2615       break;
2616    default:
2617       emit_generic_urb_slot(reg, varying);
2618       break;
2619    }
2620 }
2621
2622 static int
2623 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2624 {
2625    if (brw->gen >= 6) {
2626       /* URB data written (does not include the message header reg) must
2627        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
2628        * section 5.4.3.2.2: URB_INTERLEAVED.
2629        *
2630        * URB entries are allocated on a multiple of 1024 bits, so an
2631        * extra 128 bits written here to make the end align to 256 is
2632        * no problem.
2633        */
2634       if ((mlen % 2) != 1)
2635          mlen++;
2636    }
2637
2638    return mlen;
2639 }
2640
2641
2642 /**
2643  * Generates the VUE payload plus the necessary URB write instructions to
2644  * output it.
2645  *
2646  * The VUE layout is documented in Volume 2a.
2647  */
2648 void
2649 vec4_visitor::emit_vertex()
2650 {
2651    /* MRF 0 is reserved for the debugger, so start with message header
2652     * in MRF 1.
2653     */
2654    int base_mrf = 1;
2655    int mrf = base_mrf;
2656    /* In the process of generating our URB write message contents, we
2657     * may need to unspill a register or load from an array.  Those
2658     * reads would use MRFs 14-15.
2659     */
2660    int max_usable_mrf = 13;
2661
2662    /* The following assertion verifies that max_usable_mrf causes an
2663     * even-numbered amount of URB write data, which will meet gen6's
2664     * requirements for length alignment.
2665     */
2666    assert ((max_usable_mrf - base_mrf) % 2 == 0);
2667
2668    /* First mrf is the g0-based message header containing URB handles and
2669     * such.
2670     */
2671    emit_urb_write_header(mrf++);
2672
2673    if (brw->gen < 6) {
2674       emit_ndc_computation();
2675    }
2676
2677    /* Lower legacy ff and ClipVertex clipping to clip distances */
2678    if (key->userclip_active && !key->uses_clip_distance) {
2679       current_annotation = "user clip distances";
2680
2681       output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2682       output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2683
2684       emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2685       emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2686    }
2687
2688    /* Set up the VUE data for the first URB write */
2689    int slot;
2690    for (slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
2691       emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2692
2693       /* If this was max_usable_mrf, we can't fit anything more into this URB
2694        * WRITE.
2695        */
2696       if (mrf > max_usable_mrf) {
2697          slot++;
2698          break;
2699       }
2700    }
2701
2702    bool complete = slot >= prog_data->vue_map.num_slots;
2703    current_annotation = "URB write";
2704    vec4_instruction *inst = emit_urb_write_opcode(complete);
2705    inst->base_mrf = base_mrf;
2706    inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2707
2708    /* Optional second URB write */
2709    if (!complete) {
2710       mrf = base_mrf + 1;
2711
2712       for (; slot < prog_data->vue_map.num_slots; ++slot) {
2713          assert(mrf < max_usable_mrf);
2714
2715          emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2716       }
2717
2718       current_annotation = "URB write";
2719       inst = emit_urb_write_opcode(true /* complete */);
2720       inst->base_mrf = base_mrf;
2721       inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2722       /* URB destination offset.  In the previous write, we got MRFs
2723        * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
2724        * URB row increments, and each of our MRFs is half of one of
2725        * those, since we're doing interleaved writes.
2726        */
2727       inst->offset = (max_usable_mrf - base_mrf) / 2;
2728    }
2729 }
2730
2731
2732 src_reg
2733 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2734                                  src_reg *reladdr, int reg_offset)
2735 {
2736    /* Because we store the values to scratch interleaved like our
2737     * vertex data, we need to scale the vec4 index by 2.
2738     */
2739    int message_header_scale = 2;
2740
2741    /* Pre-gen6, the message header uses byte offsets instead of vec4
2742     * (16-byte) offset units.
2743     */
2744    if (brw->gen < 6)
2745       message_header_scale *= 16;
2746
2747    if (reladdr) {
2748       src_reg index = src_reg(this, glsl_type::int_type);
2749
2750       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2751       emit_before(inst, MUL(dst_reg(index),
2752                             index, src_reg(message_header_scale)));
2753
2754       return index;
2755    } else {
2756       return src_reg(reg_offset * message_header_scale);
2757    }
2758 }
2759
2760 src_reg
2761 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2762                                        src_reg *reladdr, int reg_offset)
2763 {
2764    if (reladdr) {
2765       src_reg index = src_reg(this, glsl_type::int_type);
2766
2767       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2768
2769       /* Pre-gen6, the message header uses byte offsets instead of vec4
2770        * (16-byte) offset units.
2771        */
2772       if (brw->gen < 6) {
2773          emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2774       }
2775
2776       return index;
2777    } else {
2778       int message_header_scale = brw->gen < 6 ? 16 : 1;
2779       return src_reg(reg_offset * message_header_scale);
2780    }
2781 }
2782
2783 /**
2784  * Emits an instruction before @inst to load the value named by @orig_src
2785  * from scratch space at @base_offset to @temp.
2786  *
2787  * @base_offset is measured in 32-byte units (the size of a register).
2788  */
2789 void
2790 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2791                                 dst_reg temp, src_reg orig_src,
2792                                 int base_offset)
2793 {
2794    int reg_offset = base_offset + orig_src.reg_offset;
2795    src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2796
2797    emit_before(inst, SCRATCH_READ(temp, index));
2798 }
2799
2800 /**
2801  * Emits an instruction after @inst to store the value to be written
2802  * to @orig_dst to scratch space at @base_offset, from @temp.
2803  *
2804  * @base_offset is measured in 32-byte units (the size of a register).
2805  */
2806 void
2807 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2808 {
2809    int reg_offset = base_offset + inst->dst.reg_offset;
2810    src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2811
2812    /* Create a temporary register to store *inst's result in.
2813     *
2814     * We have to be careful in MOVing from our temporary result register in
2815     * the scratch write.  If we swizzle from channels of the temporary that
2816     * weren't initialized, it will confuse live interval analysis, which will
2817     * make spilling fail to make progress.
2818     */
2819    src_reg temp = src_reg(this, glsl_type::vec4_type);
2820    temp.type = inst->dst.type;
2821    int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2822    int swizzles[4];
2823    for (int i = 0; i < 4; i++)
2824       if (inst->dst.writemask & (1 << i))
2825          swizzles[i] = i;
2826       else
2827          swizzles[i] = first_writemask_chan;
2828    temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2829                                swizzles[2], swizzles[3]);
2830
2831    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2832                                        inst->dst.writemask));
2833    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2834    write->predicate = inst->predicate;
2835    write->ir = inst->ir;
2836    write->annotation = inst->annotation;
2837    inst->insert_after(write);
2838
2839    inst->dst.file = temp.file;
2840    inst->dst.reg = temp.reg;
2841    inst->dst.reg_offset = temp.reg_offset;
2842    inst->dst.reladdr = NULL;
2843 }
2844
2845 /**
2846  * We can't generally support array access in GRF space, because a
2847  * single instruction's destination can only span 2 contiguous
2848  * registers.  So, we send all GRF arrays that get variable index
2849  * access to scratch space.
2850  */
2851 void
2852 vec4_visitor::move_grf_array_access_to_scratch()
2853 {
2854    int scratch_loc[this->virtual_grf_count];
2855
2856    for (int i = 0; i < this->virtual_grf_count; i++) {
2857       scratch_loc[i] = -1;
2858    }
2859
2860    /* First, calculate the set of virtual GRFs that need to be punted
2861     * to scratch due to having any array access on them, and where in
2862     * scratch.
2863     */
2864    foreach_list(node, &this->instructions) {
2865       vec4_instruction *inst = (vec4_instruction *)node;
2866
2867       if (inst->dst.file == GRF && inst->dst.reladdr &&
2868           scratch_loc[inst->dst.reg] == -1) {
2869          scratch_loc[inst->dst.reg] = c->last_scratch;
2870          c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2871       }
2872
2873       for (int i = 0 ; i < 3; i++) {
2874          src_reg *src = &inst->src[i];
2875
2876          if (src->file == GRF && src->reladdr &&
2877              scratch_loc[src->reg] == -1) {
2878             scratch_loc[src->reg] = c->last_scratch;
2879             c->last_scratch += this->virtual_grf_sizes[src->reg];
2880          }
2881       }
2882    }
2883
2884    /* Now, for anything that will be accessed through scratch, rewrite
2885     * it to load/store.  Note that this is a _safe list walk, because
2886     * we may generate a new scratch_write instruction after the one
2887     * we're processing.
2888     */
2889    foreach_list_safe(node, &this->instructions) {
2890       vec4_instruction *inst = (vec4_instruction *)node;
2891
2892       /* Set up the annotation tracking for new generated instructions. */
2893       base_ir = inst->ir;
2894       current_annotation = inst->annotation;
2895
2896       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2897          emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2898       }
2899
2900       for (int i = 0 ; i < 3; i++) {
2901          if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2902             continue;
2903
2904          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2905
2906          emit_scratch_read(inst, temp, inst->src[i],
2907                            scratch_loc[inst->src[i].reg]);
2908
2909          inst->src[i].file = temp.file;
2910          inst->src[i].reg = temp.reg;
2911          inst->src[i].reg_offset = temp.reg_offset;
2912          inst->src[i].reladdr = NULL;
2913       }
2914    }
2915 }
2916
2917 /**
2918  * Emits an instruction before @inst to load the value named by @orig_src
2919  * from the pull constant buffer (surface) at @base_offset to @temp.
2920  */
2921 void
2922 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2923                                       dst_reg temp, src_reg orig_src,
2924                                       int base_offset)
2925 {
2926    int reg_offset = base_offset + orig_src.reg_offset;
2927    src_reg index = src_reg((unsigned)SURF_INDEX_VEC4_CONST_BUFFER);
2928    src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2929    vec4_instruction *load;
2930
2931    if (brw->gen >= 7) {
2932       dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
2933       grf_offset.type = offset.type;
2934       emit_before(inst, MOV(grf_offset, offset));
2935
2936       load = new(mem_ctx) vec4_instruction(this,
2937                                            VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
2938                                            temp, index, src_reg(grf_offset));
2939    } else {
2940       load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2941                                            temp, index, offset);
2942       load->base_mrf = 14;
2943       load->mlen = 1;
2944    }
2945    emit_before(inst, load);
2946 }
2947
2948 /**
2949  * Implements array access of uniforms by inserting a
2950  * PULL_CONSTANT_LOAD instruction.
2951  *
2952  * Unlike temporary GRF array access (where we don't support it due to
2953  * the difficulty of doing relative addressing on instruction
2954  * destinations), we could potentially do array access of uniforms
2955  * that were loaded in GRF space as push constants.  In real-world
2956  * usage we've seen, though, the arrays being used are always larger
2957  * than we could load as push constants, so just always move all
2958  * uniform array access out to a pull constant buffer.
2959  */
2960 void
2961 vec4_visitor::move_uniform_array_access_to_pull_constants()
2962 {
2963    int pull_constant_loc[this->uniforms];
2964
2965    for (int i = 0; i < this->uniforms; i++) {
2966       pull_constant_loc[i] = -1;
2967    }
2968
2969    /* Walk through and find array access of uniforms.  Put a copy of that
2970     * uniform in the pull constant buffer.
2971     *
2972     * Note that we don't move constant-indexed accesses to arrays.  No
2973     * testing has been done of the performance impact of this choice.
2974     */
2975    foreach_list_safe(node, &this->instructions) {
2976       vec4_instruction *inst = (vec4_instruction *)node;
2977
2978       for (int i = 0 ; i < 3; i++) {
2979          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2980             continue;
2981
2982          int uniform = inst->src[i].reg;
2983
2984          /* If this array isn't already present in the pull constant buffer,
2985           * add it.
2986           */
2987          if (pull_constant_loc[uniform] == -1) {
2988             const float **values = &prog_data->param[uniform * 4];
2989
2990             pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2991
2992             for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2993                prog_data->pull_param[prog_data->nr_pull_params++]
2994                   = values[j];
2995             }
2996          }
2997
2998          /* Set up the annotation tracking for new generated instructions. */
2999          base_ir = inst->ir;
3000          current_annotation = inst->annotation;
3001
3002          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3003
3004          emit_pull_constant_load(inst, temp, inst->src[i],
3005                                  pull_constant_loc[uniform]);
3006
3007          inst->src[i].file = temp.file;
3008          inst->src[i].reg = temp.reg;
3009          inst->src[i].reg_offset = temp.reg_offset;
3010          inst->src[i].reladdr = NULL;
3011       }
3012    }
3013
3014    /* Now there are no accesses of the UNIFORM file with a reladdr, so
3015     * no need to track them as larger-than-vec4 objects.  This will be
3016     * relied on in cutting out unused uniform vectors from push
3017     * constants.
3018     */
3019    split_uniform_registers();
3020 }
3021
3022 void
3023 vec4_visitor::resolve_ud_negate(src_reg *reg)
3024 {
3025    if (reg->type != BRW_REGISTER_TYPE_UD ||
3026        !reg->negate)
3027       return;
3028
3029    src_reg temp = src_reg(this, glsl_type::uvec4_type);
3030    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3031    *reg = temp;
3032 }
3033
3034 vec4_visitor::vec4_visitor(struct brw_context *brw,
3035                            struct brw_vec4_compile *c,
3036                            struct gl_program *prog,
3037                            const struct brw_vec4_prog_key *key,
3038                            struct brw_vec4_prog_data *prog_data,
3039                            struct gl_shader_program *shader_prog,
3040                            struct brw_shader *shader,
3041                            void *mem_ctx,
3042                            bool debug_flag)
3043    : debug_flag(debug_flag)
3044 {
3045    this->brw = brw;
3046    this->ctx = &brw->ctx;
3047    this->shader_prog = shader_prog;
3048    this->shader = shader;
3049
3050    this->mem_ctx = mem_ctx;
3051    this->failed = false;
3052
3053    this->base_ir = NULL;
3054    this->current_annotation = NULL;
3055    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3056
3057    this->c = c;
3058    this->prog = prog;
3059    this->key = key;
3060    this->prog_data = prog_data;
3061
3062    this->variable_ht = hash_table_ctor(0,
3063                                        hash_table_pointer_hash,
3064                                        hash_table_pointer_compare);
3065
3066    this->virtual_grf_start = NULL;
3067    this->virtual_grf_end = NULL;
3068    this->virtual_grf_sizes = NULL;
3069    this->virtual_grf_count = 0;
3070    this->virtual_grf_reg_map = NULL;
3071    this->virtual_grf_reg_count = 0;
3072    this->virtual_grf_array_size = 0;
3073    this->live_intervals_valid = false;
3074
3075    this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3076
3077    this->uniforms = 0;
3078 }
3079
3080 vec4_visitor::~vec4_visitor()
3081 {
3082    hash_table_dtor(this->variable_ht);
3083 }
3084
3085
3086 void
3087 vec4_visitor::fail(const char *format, ...)
3088 {
3089    va_list va;
3090    char *msg;
3091
3092    if (failed)
3093       return;
3094
3095    failed = true;
3096
3097    va_start(va, format);
3098    msg = ralloc_vasprintf(mem_ctx, format, va);
3099    va_end(va);
3100    msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3101
3102    this->fail_msg = msg;
3103
3104    if (debug_flag) {
3105       fprintf(stderr, "%s",  msg);
3106    }
3107 }
3108
3109 } /* namespace brw */