i965/vs/gen7: Emit code for GLSL ES 3.00 pack/unpack operations (v3)
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "main/context.h"
28 #include "main/macros.h"
29 #include "program/prog_parameter.h"
30 #include "program/sampler.h"
31 }
32
33 namespace brw {
34
35 vec4_instruction::vec4_instruction(vec4_visitor *v,
36 enum opcode opcode, dst_reg dst,
37 src_reg src0, src_reg src1, src_reg src2)
38 {
39 this->opcode = opcode;
40 this->dst = dst;
41 this->src[0] = src0;
42 this->src[1] = src1;
43 this->src[2] = src2;
44 this->ir = v->base_ir;
45 this->annotation = v->current_annotation;
46 }
47
48 vec4_instruction *
49 vec4_visitor::emit(vec4_instruction *inst)
50 {
51 this->instructions.push_tail(inst);
52
53 return inst;
54 }
55
56 vec4_instruction *
57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
58 {
59 new_inst->ir = inst->ir;
60 new_inst->annotation = inst->annotation;
61
62 inst->insert_before(new_inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
69 src_reg src0, src_reg src1, src_reg src2)
70 {
71 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
72 src0, src1, src2));
73 }
74
75
76 vec4_instruction *
77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
78 {
79 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
80 }
81
82 vec4_instruction *
83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
92 }
93
94 #define ALU1(op) \
95 vec4_instruction * \
96 vec4_visitor::op(dst_reg dst, src_reg src0) \
97 { \
98 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
99 src0); \
100 }
101
102 #define ALU2(op) \
103 vec4_instruction * \
104 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
105 { \
106 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
107 src0, src1); \
108 }
109
110 ALU1(NOT)
111 ALU1(MOV)
112 ALU1(FRC)
113 ALU1(RNDD)
114 ALU1(RNDE)
115 ALU1(RNDZ)
116 ALU1(F32TO16)
117 ALU1(F16TO32)
118 ALU2(ADD)
119 ALU2(MUL)
120 ALU2(MACH)
121 ALU2(AND)
122 ALU2(OR)
123 ALU2(XOR)
124 ALU2(DP3)
125 ALU2(DP4)
126 ALU2(DPH)
127 ALU2(SHL)
128 ALU2(SHR)
129 ALU2(ASR)
130
131 /** Gen4 predicated IF. */
132 vec4_instruction *
133 vec4_visitor::IF(uint32_t predicate)
134 {
135 vec4_instruction *inst;
136
137 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
138 inst->predicate = predicate;
139
140 return inst;
141 }
142
143 /** Gen6+ IF with embedded comparison. */
144 vec4_instruction *
145 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
146 {
147 assert(intel->gen >= 6);
148
149 vec4_instruction *inst;
150
151 resolve_ud_negate(&src0);
152 resolve_ud_negate(&src1);
153
154 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
155 src0, src1);
156 inst->conditional_mod = condition;
157
158 return inst;
159 }
160
161 /**
162 * CMP: Sets the low bit of the destination channels with the result
163 * of the comparison, while the upper bits are undefined, and updates
164 * the flag register with the packed 16 bits of the result.
165 */
166 vec4_instruction *
167 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
168 {
169 vec4_instruction *inst;
170
171 /* original gen4 does type conversion to the destination type
172 * before comparison, producing garbage results for floating
173 * point comparisons.
174 */
175 if (intel->gen == 4) {
176 dst.type = src0.type;
177 if (dst.file == HW_REG)
178 dst.fixed_hw_reg.type = dst.type;
179 }
180
181 resolve_ud_negate(&src0);
182 resolve_ud_negate(&src1);
183
184 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
185 inst->conditional_mod = condition;
186
187 return inst;
188 }
189
190 vec4_instruction *
191 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
192 {
193 vec4_instruction *inst;
194
195 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
196 dst, index);
197 inst->base_mrf = 14;
198 inst->mlen = 2;
199
200 return inst;
201 }
202
203 vec4_instruction *
204 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
205 {
206 vec4_instruction *inst;
207
208 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
209 dst, src, index);
210 inst->base_mrf = 13;
211 inst->mlen = 3;
212
213 return inst;
214 }
215
216 void
217 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
218 {
219 static enum opcode dot_opcodes[] = {
220 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
221 };
222
223 emit(dot_opcodes[elements - 2], dst, src0, src1);
224 }
225
226 src_reg
227 vec4_visitor::fix_math_operand(src_reg src)
228 {
229 /* The gen6 math instruction ignores the source modifiers --
230 * swizzle, abs, negate, and at least some parts of the register
231 * region description.
232 *
233 * Rather than trying to enumerate all these cases, *always* expand the
234 * operand to a temp GRF for gen6.
235 *
236 * For gen7, keep the operand as-is, except if immediate, which gen7 still
237 * can't use.
238 */
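/* E.g. (illustrative): on gen7 a POW with an immediate 2.0f exponent gets
 * that immediate copied into a temporary GRF first; on gen6 every operand is
 * copied, since a negate or abs modifier would otherwise be silently ignored.
 */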
239
240 if (intel->gen == 7 && src.file != IMM)
241 return src;
242
243 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
244 expanded.type = src.type;
245 emit(MOV(expanded, src));
246 return src_reg(expanded);
247 }
248
249 void
250 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
251 {
252 src = fix_math_operand(src);
253
254 if (dst.writemask != WRITEMASK_XYZW) {
255 /* The gen6 math instruction must be align1, so we can't do
256 * writemasks.
257 */
258 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
259
260 emit(opcode, temp_dst, src);
261
262 emit(MOV(dst, src_reg(temp_dst)));
263 } else {
264 emit(opcode, dst, src);
265 }
266 }
267
268 void
269 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
270 {
271 vec4_instruction *inst = emit(opcode, dst, src);
272 inst->base_mrf = 1;
273 inst->mlen = 1;
274 }
275
276 void
277 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
278 {
279 switch (opcode) {
280 case SHADER_OPCODE_RCP:
281 case SHADER_OPCODE_RSQ:
282 case SHADER_OPCODE_SQRT:
283 case SHADER_OPCODE_EXP2:
284 case SHADER_OPCODE_LOG2:
285 case SHADER_OPCODE_SIN:
286 case SHADER_OPCODE_COS:
287 break;
288 default:
289 assert(!"not reached: bad math opcode");
290 return;
291 }
292
293 if (intel->gen >= 6) {
294 return emit_math1_gen6(opcode, dst, src);
295 } else {
296 return emit_math1_gen4(opcode, dst, src);
297 }
298 }
299
300 void
301 vec4_visitor::emit_math2_gen6(enum opcode opcode,
302 dst_reg dst, src_reg src0, src_reg src1)
303 {
304 src0 = fix_math_operand(src0);
305 src1 = fix_math_operand(src1);
306
307 if (dst.writemask != WRITEMASK_XYZW) {
308 /* The gen6 math instruction must be align1, so we can't do
309 * writemasks.
310 */
311 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
312 temp_dst.type = dst.type;
313
314 emit(opcode, temp_dst, src0, src1);
315
316 emit(MOV(dst, src_reg(temp_dst)));
317 } else {
318 emit(opcode, dst, src0, src1);
319 }
320 }
321
322 void
323 vec4_visitor::emit_math2_gen4(enum opcode opcode,
324 dst_reg dst, src_reg src0, src_reg src1)
325 {
326 vec4_instruction *inst = emit(opcode, dst, src0, src1);
327 inst->base_mrf = 1;
328 inst->mlen = 2;
329 }
330
331 void
332 vec4_visitor::emit_math(enum opcode opcode,
333 dst_reg dst, src_reg src0, src_reg src1)
334 {
335 switch (opcode) {
336 case SHADER_OPCODE_POW:
337 case SHADER_OPCODE_INT_QUOTIENT:
338 case SHADER_OPCODE_INT_REMAINDER:
339 break;
340 default:
341 assert(!"not reached: unsupported binary math opcode");
342 return;
343 }
344
345 if (intel->gen >= 6) {
346 return emit_math2_gen6(opcode, dst, src0, src1);
347 } else {
348 return emit_math2_gen4(opcode, dst, src0, src1);
349 }
350 }
351
352 void
353 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
354 {
355 if (intel->gen < 7)
356 assert(!"ir_unop_pack_half_2x16 should be lowered");
357
358 assert(dst.type == BRW_REGISTER_TYPE_UD);
359 assert(src0.type == BRW_REGISTER_TYPE_F);
360
361 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
362 *
363 * Because this instruction does not have a 16-bit floating-point type,
364 * the destination data type must be Word (W).
365 *
366 * The destination must be DWord-aligned and specify a horizontal stride
367 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
368 * each destination channel and the upper word is not modified.
369 *
370 * The above restriction implies that the f32to16 instruction must use
371 * align1 mode, because only in align1 mode is it possible to specify
372 * horizontal stride. We choose here to defy the hardware docs and emit
373 * align16 instructions.
374 *
375 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
376 * instructions. I was partially successful in that the code passed all
377 * tests. However, the code was dubiously correct and fragile, and the
378 * tests were not harsh enough to probe that frailty. Not trusting the
379 * code, I chose instead to remain in align16 mode in defiance of the hw
380 * docs).
381 *
382 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
383 * simulator, emitting a f32to16 in align16 mode with UD as destination
384 * data type is safe. The behavior differs from that specified in the PRM
385 * in that the upper word of each destination channel is cleared to 0.
386 */
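/* Illustrative example (not from the original source): packHalf2x16 places
 * the half-float encoding of .x in the low word and of .y in the high word.
 * E.g. vec2(1.0, -2.0) encodes as 0x3C00 and 0xC000 respectively, so the
 * packed result built below is 0xC0003C00.
 */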
387
388 dst_reg tmp_dst(this, glsl_type::uvec2_type);
389 src_reg tmp_src(tmp_dst);
390
391 #if 0
392 /* Verify the undocumented behavior on which the following instructions
393 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
394 * then the result of the bit-or instruction below will be incorrect.
395 *
396 * You should inspect the disasm output in order to verify that the MOV is
397 * not optimized away.
398 */
399 emit(MOV(tmp_dst, src_reg(0x12345678u)));
400 #endif
401
402 /* Give tmp the form below, where "." means untouched.
403 *
404 *  w z     y          x      w z     y          x
405 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
406 *
407 * That the upper word of each write-channel be 0 is required for the
408 * following bit-shift and bit-or instructions to work. Note that this
409 * relies on the undocumented hardware behavior mentioned above.
410 */
411 tmp_dst.writemask = WRITEMASK_XY;
412 emit(F32TO16(tmp_dst, src0));
413
414 /* Give the write-channels of dst the form:
415 * 0xhhhh0000
416 */
417 tmp_src.swizzle = SWIZZLE_Y;
418 emit(SHL(dst, tmp_src, src_reg(16u)));
419
420 /* Finally, give the write-channels of dst the form of packHalf2x16's
421 * output:
422 * 0xhhhhllll
423 */
424 tmp_src.swizzle = SWIZZLE_X;
425 emit(OR(dst, src_reg(dst), tmp_src));
426 }
427
428 void
429 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
430 {
431 if (intel->gen < 7)
432 assert(!"ir_unop_unpack_half_2x16 should be lowered");
433
434 assert(dst.type == BRW_REGISTER_TYPE_F);
435 assert(src0.type == BRW_REGISTER_TYPE_UD);
436
437 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
438 *
439 * Because this instruction does not have a 16-bit floating-point type,
440 * the source data type must be Word (W). The destination type must be
441 * F (Float).
442 *
443 * To use W as the source data type, we must adjust horizontal strides,
444 * which is only possible in align1 mode. All my [chadv] attempts at
445 * emitting align1 instructions for unpackHalf2x16 failed to pass the
446 * Piglit tests, so I gave up.
447 *
448 * I've verified that, on gen7 hardware and the simulator, it is safe to
449 * emit f16to32 in align16 mode with UD as source data type.
450 */
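/* Illustrative example: unpackHalf2x16 inverts the packing above, so
 * unpackHalf2x16(0xC0003C00u) yields vec2(1.0, -2.0). The AND below isolates
 * the low half (0x3C00 -> 1.0) and the SHR the high half (0xC000 -> -2.0)
 * before the F16TO32 conversion.
 */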
451
452 dst_reg tmp_dst(this, glsl_type::uvec2_type);
453 src_reg tmp_src(tmp_dst);
454
455 tmp_dst.writemask = WRITEMASK_X;
456 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
457
458 tmp_dst.writemask = WRITEMASK_Y;
459 emit(SHR(tmp_dst, src0, src_reg(16u)));
460
461 dst.writemask = WRITEMASK_XY;
462 emit(F16TO32(dst, tmp_src));
463 }
464
465 void
466 vec4_visitor::visit_instructions(const exec_list *list)
467 {
468 foreach_list(node, list) {
469 ir_instruction *ir = (ir_instruction *)node;
470
471 base_ir = ir;
472 ir->accept(this);
473 }
474 }
475
476
477 static int
478 type_size(const struct glsl_type *type)
479 {
480 unsigned int i;
481 int size;
482
483 switch (type->base_type) {
484 case GLSL_TYPE_UINT:
485 case GLSL_TYPE_INT:
486 case GLSL_TYPE_FLOAT:
487 case GLSL_TYPE_BOOL:
488 if (type->is_matrix()) {
489 return type->matrix_columns;
490 } else {
491 /* Regardless of the size of the vector, it gets a vec4. This is bad
492 * packing for things like floats, but otherwise arrays become a
493 * mess. Hopefully a later pass over the code can pack scalars
494 * down if appropriate.
495 */
496 return 1;
497 }
498 case GLSL_TYPE_ARRAY:
499 assert(type->length > 0);
500 return type_size(type->fields.array) * type->length;
501 case GLSL_TYPE_STRUCT:
502 size = 0;
503 for (i = 0; i < type->length; i++) {
504 size += type_size(type->fields.structure[i].type);
505 }
506 return size;
507 case GLSL_TYPE_SAMPLER:
508 /* Samplers take up one slot in UNIFORMS[], but they're baked in
509 * at link time.
510 */
511 return 1;
512 default:
513 assert(0);
514 return 0;
515 }
516 }
517
518 int
519 vec4_visitor::virtual_grf_alloc(int size)
520 {
521 if (virtual_grf_array_size <= virtual_grf_count) {
522 if (virtual_grf_array_size == 0)
523 virtual_grf_array_size = 16;
524 else
525 virtual_grf_array_size *= 2;
526 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
527 virtual_grf_array_size);
528 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
529 virtual_grf_array_size);
530 }
531 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
532 virtual_grf_reg_count += size;
533 virtual_grf_sizes[virtual_grf_count] = size;
534 return virtual_grf_count++;
535 }
536
537 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
538 {
539 init();
540
541 this->file = GRF;
542 this->reg = v->virtual_grf_alloc(type_size(type));
543
544 if (type->is_array() || type->is_record()) {
545 this->swizzle = BRW_SWIZZLE_NOOP;
546 } else {
547 this->swizzle = swizzle_for_size(type->vector_elements);
548 }
549
550 this->type = brw_type_for_base_type(type);
551 }
552
553 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
554 {
555 init();
556
557 this->file = GRF;
558 this->reg = v->virtual_grf_alloc(type_size(type));
559
560 if (type->is_array() || type->is_record()) {
561 this->writemask = WRITEMASK_XYZW;
562 } else {
563 this->writemask = (1 << type->vector_elements) - 1;
564 }
565
566 this->type = brw_type_for_base_type(type);
567 }
568
569 /* Our support for uniforms is piggy-backed on the struct
570 * gl_fragment_program, because that's where the values actually
571 * get stored, rather than in some global gl_shader_program uniform
572 * store.
573 */
574 void
575 vec4_visitor::setup_uniform_values(ir_variable *ir)
576 {
577 int namelen = strlen(ir->name);
578
579 /* The data for our (non-builtin) uniforms is stored in a series of
580 * gl_uniform_driver_storage structs for each subcomponent that
581 * glGetUniformLocation() could name. We know it's been set up in the same
582 * order we'd walk the type, so walk the list of storage and find anything
583 * with our name, or the prefix of a component that starts with our name.
584 */
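/* Illustrative example with a hypothetical uniform: for "uniform mat3 m[2]"
 * there is one storage entry named "m" with array_elements == 2 and
 * matrix_columns == 3, so the loop below uploads 6 vec3 columns and pads the
 * unused fourth component of each with zero.
 */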
585 for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
586 struct gl_uniform_storage *storage = &prog->UniformStorage[u];
587
588 if (strncmp(ir->name, storage->name, namelen) != 0 ||
589 (storage->name[namelen] != 0 &&
590 storage->name[namelen] != '.' &&
591 storage->name[namelen] != '[')) {
592 continue;
593 }
594
595 gl_constant_value *components = storage->storage;
596 unsigned vector_count = (MAX2(storage->array_elements, 1) *
597 storage->type->matrix_columns);
598
599 for (unsigned s = 0; s < vector_count; s++) {
600 uniform_vector_size[uniforms] = storage->type->vector_elements;
601
602 int i;
603 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
604 c->prog_data.param[uniforms * 4 + i] = &components->f;
605 components++;
606 }
607 for (; i < 4; i++) {
608 static float zero = 0;
609 c->prog_data.param[uniforms * 4 + i] = &zero;
610 }
611
612 uniforms++;
613 }
614 }
615 }
616
617 void
618 vec4_visitor::setup_uniform_clipplane_values()
619 {
620 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
621
622 if (intel->gen < 6) {
623 /* Pre-Gen6, we compact clip planes. For example, if the user
624 * enables just clip planes 0, 1, and 3, we will enable clip planes
625 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
626 * plane 2. This simplifies the implementation of the clip
627 * thread.
628 */
629 int compacted_clipplane_index = 0;
630 for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
631 if (!(c->key.userclip_planes_enabled_gen_4_5 & (1 << i)))
632 continue;
633
634 this->uniform_vector_size[this->uniforms] = 4;
635 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
636 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
637 for (int j = 0; j < 4; ++j) {
638 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
639 }
640 ++compacted_clipplane_index;
641 ++this->uniforms;
642 }
643 } else {
644 /* In Gen6 and later, we don't compact clip planes, because this
645 * simplifies the implementation of gl_ClipDistance.
646 */
647 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
648 this->uniform_vector_size[this->uniforms] = 4;
649 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
650 this->userplane[i].type = BRW_REGISTER_TYPE_F;
651 for (int j = 0; j < 4; ++j) {
652 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
653 }
654 ++this->uniforms;
655 }
656 }
657 }
658
659 /* Our support for builtin uniforms is even scarier than non-builtin.
660 * It sits on top of the PROG_STATE_VAR parameters that are
661 * automatically updated from GL context state.
662 */
663 void
664 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
665 {
666 const ir_state_slot *const slots = ir->state_slots;
667 assert(ir->state_slots != NULL);
668
669 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
670 /* This state reference has already been setup by ir_to_mesa,
671 * but we'll get the same index back here. We can reference
672 * ParameterValues directly, since unlike brw_fs.cpp, we never
673 * add new state references during compile.
674 */
675 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
676 (gl_state_index *)slots[i].tokens);
677 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
678
679 this->uniform_vector_size[this->uniforms] = 0;
680 /* Add each of the unique swizzled channels of the element.
681 * This will end up matching the size of the glsl_type of this field.
682 */
683 int last_swiz = -1;
684 for (unsigned int j = 0; j < 4; j++) {
685 int swiz = GET_SWZ(slots[i].swizzle, j);
686 last_swiz = swiz;
687
688 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
689 if (swiz <= last_swiz)
690 this->uniform_vector_size[this->uniforms]++;
691 }
692 this->uniforms++;
693 }
694 }
695
696 dst_reg *
697 vec4_visitor::variable_storage(ir_variable *var)
698 {
699 return (dst_reg *)hash_table_find(this->variable_ht, var);
700 }
701
702 void
703 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
704 {
705 ir_expression *expr = ir->as_expression();
706
707 *predicate = BRW_PREDICATE_NORMAL;
708
709 if (expr) {
710 src_reg op[2];
711 vec4_instruction *inst;
712
713 assert(expr->get_num_operands() <= 2);
714 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
715 expr->operands[i]->accept(this);
716 op[i] = this->result;
717
718 resolve_ud_negate(&op[i]);
719 }
720
721 switch (expr->operation) {
722 case ir_unop_logic_not:
723 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
724 inst->conditional_mod = BRW_CONDITIONAL_Z;
725 break;
726
727 case ir_binop_logic_xor:
728 inst = emit(XOR(dst_null_d(), op[0], op[1]));
729 inst->conditional_mod = BRW_CONDITIONAL_NZ;
730 break;
731
732 case ir_binop_logic_or:
733 inst = emit(OR(dst_null_d(), op[0], op[1]));
734 inst->conditional_mod = BRW_CONDITIONAL_NZ;
735 break;
736
737 case ir_binop_logic_and:
738 inst = emit(AND(dst_null_d(), op[0], op[1]));
739 inst->conditional_mod = BRW_CONDITIONAL_NZ;
740 break;
741
742 case ir_unop_f2b:
743 if (intel->gen >= 6) {
744 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
745 } else {
746 inst = emit(MOV(dst_null_f(), op[0]));
747 inst->conditional_mod = BRW_CONDITIONAL_NZ;
748 }
749 break;
750
751 case ir_unop_i2b:
752 if (intel->gen >= 6) {
753 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
754 } else {
755 inst = emit(MOV(dst_null_d(), op[0]));
756 inst->conditional_mod = BRW_CONDITIONAL_NZ;
757 }
758 break;
759
760 case ir_binop_all_equal:
761 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
762 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
763 break;
764
765 case ir_binop_any_nequal:
766 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
767 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
768 break;
769
770 case ir_unop_any:
771 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
772 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
773 break;
774
775 case ir_binop_greater:
776 case ir_binop_gequal:
777 case ir_binop_less:
778 case ir_binop_lequal:
779 case ir_binop_equal:
780 case ir_binop_nequal:
781 emit(CMP(dst_null_d(), op[0], op[1],
782 brw_conditional_for_comparison(expr->operation)));
783 break;
784
785 default:
786 assert(!"not reached");
787 break;
788 }
789 return;
790 }
791
792 ir->accept(this);
793
794 resolve_ud_negate(&this->result);
795
796 if (intel->gen >= 6) {
797 vec4_instruction *inst = emit(AND(dst_null_d(),
798 this->result, src_reg(1)));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 } else {
801 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
802 inst->conditional_mod = BRW_CONDITIONAL_NZ;
803 }
804 }
805
806 /**
807 * Emit a gen6 IF statement with the comparison folded into the IF
808 * instruction.
809 */
810 void
811 vec4_visitor::emit_if_gen6(ir_if *ir)
812 {
813 ir_expression *expr = ir->condition->as_expression();
814
815 if (expr) {
816 src_reg op[2];
817 dst_reg temp;
818
819 assert(expr->get_num_operands() <= 2);
820 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
821 expr->operands[i]->accept(this);
822 op[i] = this->result;
823 }
824
825 switch (expr->operation) {
826 case ir_unop_logic_not:
827 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
828 return;
829
830 case ir_binop_logic_xor:
831 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
832 return;
833
834 case ir_binop_logic_or:
835 temp = dst_reg(this, glsl_type::bool_type);
836 emit(OR(temp, op[0], op[1]));
837 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
838 return;
839
840 case ir_binop_logic_and:
841 temp = dst_reg(this, glsl_type::bool_type);
842 emit(AND(temp, op[0], op[1]));
843 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
844 return;
845
846 case ir_unop_f2b:
847 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
848 return;
849
850 case ir_unop_i2b:
851 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
852 return;
853
854 case ir_binop_greater:
855 case ir_binop_gequal:
856 case ir_binop_less:
857 case ir_binop_lequal:
858 case ir_binop_equal:
859 case ir_binop_nequal:
860 emit(IF(op[0], op[1],
861 brw_conditional_for_comparison(expr->operation)));
862 return;
863
864 case ir_binop_all_equal:
865 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
866 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
867 return;
868
869 case ir_binop_any_nequal:
870 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
871 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
872 return;
873
874 case ir_unop_any:
875 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
876 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
877 return;
878
879 default:
880 assert(!"not reached");
881 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
882 return;
883 }
884 return;
885 }
886
887 ir->condition->accept(this);
888
889 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
890 }
891
892 static dst_reg
893 with_writemask(dst_reg const & r, int mask)
894 {
895 dst_reg result = r;
896 result.writemask = mask;
897 return result;
898 }
899
900 void
901 vec4_visitor::emit_attribute_fixups()
902 {
903 dst_reg sign_recovery_shift;
904 dst_reg normalize_factor;
905 dst_reg es3_normalize_factor;
906
907 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
908 if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
909 uint8_t wa_flags = c->key.gl_attrib_wa_flags[i];
910 dst_reg reg(ATTR, i);
911 dst_reg reg_d = reg;
912 reg_d.type = BRW_REGISTER_TYPE_D;
913 dst_reg reg_ud = reg;
914 reg_ud.type = BRW_REGISTER_TYPE_UD;
915
916 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
917 * come in as floating point conversions of the integer values.
918 */
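/* Illustrative example: GL_FIXED is a signed 16.16 fixed-point format, so
 * the raw value 0x00018000 (98304) represents 1.5. The VF hands us 98304.0f,
 * and the multiply by 1/65536 below recovers 1.5f.
 */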
919 if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
920 dst_reg dst = reg;
921 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
922 dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
923 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
924 }
925
926 /* Do sign recovery for 2101010 formats if required. */
927 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
928 if (sign_recovery_shift.file == BAD_FILE) {
929 /* shift constant: <22,22,22,30> */
930 sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
931 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
932 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
933 }
934
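/* Illustrative note: each 10-bit field sits in bits [9:0] of its channel, so
 * shifting left by 32-10 = 22 puts its sign bit in bit 31, and the arithmetic
 * shift right by 22 then sign-extends it. The 2-bit W component uses
 * 32-2 = 30 for the same reason.
 */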
935 emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
936 emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
937 }
938
939 /* Apply BGRA swizzle if required. */
940 if (wa_flags & BRW_ATTRIB_WA_BGRA) {
941 src_reg temp = src_reg(reg);
942 temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
943 emit(MOV(reg, temp));
944 }
945
946 if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
947 /* ES 3.0 has different rules for converting signed normalized
948 * fixed-point numbers than desktop GL.
949 */
950 if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
951 /* According to equation 2.2 of the ES 3.0 specification,
952 * signed normalization conversion is done by:
953 *
954 * f = c / (2^(b-1)-1)
955 */
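/* Worked numbers (illustrative): for the 10-bit XYZ components the divisor
 * is 2^9 - 1 = 511, so c = 511 maps to 1.0 and c = -511 to -1.0; the 2-bit W
 * component divides by 2^1 - 1 = 1. The emit_minmax() below clamps the one
 * out-of-range value, c = -512, up to -1.0.
 */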
956 if (es3_normalize_factor.file == BAD_FILE) {
957 /* mul constant: 1 / (2^(b-1) - 1) */
958 es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
959 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
960 src_reg(1.0f / ((1<<9) - 1))));
961 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
962 src_reg(1.0f / ((1<<1) - 1))));
963 }
964
965 dst_reg dst = reg;
966 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
967 emit(MOV(dst, src_reg(reg_d)));
968 emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
969 emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
970 } else {
971 /* The following equations are from the OpenGL 3.2 specification:
972 *
973 * 2.1 unsigned normalization
974 * f = c/(2^n-1)
975 *
976 * 2.2 signed normalization
977 * f = (2c+1)/(2^n-1)
978 *
979 * Both of these share a common divisor, which is represented by
980 * "normalize_factor" in the code below.
981 */
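/* Worked numbers (illustrative): here n = 10 for XYZ and n = 2 for W, so the
 * divisors are 1023 and 3. Unsigned c = 1023 maps to 1.0; for the signed
 * case the 2c+1 numerator built below maps c = 511 to 1023/1023 = 1.0 and
 * c = -512 to -1023/1023 = -1.0.
 */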
982 if (normalize_factor.file == BAD_FILE) {
983 /* 1 / (2^b - 1) for b=<10,10,10,2> */
984 normalize_factor = dst_reg(this, glsl_type::vec4_type);
985 emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
986 src_reg(1.0f / ((1<<10) - 1))));
987 emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
988 src_reg(1.0f / ((1<<2) - 1))));
989 }
990
991 dst_reg dst = reg;
992 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
993 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
994
995 /* For signed normalization, we want the numerator to be 2c+1. */
996 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
997 emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
998 emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
999 }
1000
1001 emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
1002 }
1003 }
1004
1005 if (wa_flags & BRW_ATTRIB_WA_SCALE) {
1006 dst_reg dst = reg;
1007 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1008 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1009 }
1010 }
1011 }
1012 }
1013
1014 void
1015 vec4_visitor::visit(ir_variable *ir)
1016 {
1017 dst_reg *reg = NULL;
1018
1019 if (variable_storage(ir))
1020 return;
1021
1022 switch (ir->mode) {
1023 case ir_var_shader_in:
1024 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
1025 break;
1026
1027 case ir_var_shader_out:
1028 reg = new(mem_ctx) dst_reg(this, ir->type);
1029
1030 for (int i = 0; i < type_size(ir->type); i++) {
1031 output_reg[ir->location + i] = *reg;
1032 output_reg[ir->location + i].reg_offset = i;
1033 output_reg[ir->location + i].type =
1034 brw_type_for_base_type(ir->type->get_scalar_type());
1035 output_reg_annotation[ir->location + i] = ir->name;
1036 }
1037 break;
1038
1039 case ir_var_auto:
1040 case ir_var_temporary:
1041 reg = new(mem_ctx) dst_reg(this, ir->type);
1042 break;
1043
1044 case ir_var_uniform:
1045 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1046
1047 /* Thanks to the lower_ubo_reference pass, we will see only
1048 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1049 * variables, so no need for them to be in variable_ht.
1050 */
1051 if (ir->uniform_block != -1)
1052 return;
1053
1054 /* Track how big the whole uniform variable is, in case we need to put a
1055 * copy of its data into pull constants for array access.
1056 */
1057 this->uniform_size[this->uniforms] = type_size(ir->type);
1058
1059 if (!strncmp(ir->name, "gl_", 3)) {
1060 setup_builtin_uniform_values(ir);
1061 } else {
1062 setup_uniform_values(ir);
1063 }
1064 break;
1065
1066 case ir_var_system_value:
1067 /* VertexID is stored by the VF as the last vertex element, but
1068 * we don't represent it with a flag in inputs_read, so we call
1069 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
1070 */
1071 reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
1072 prog_data->uses_vertexid = true;
1073
1074 switch (ir->location) {
1075 case SYSTEM_VALUE_VERTEX_ID:
1076 reg->writemask = WRITEMASK_X;
1077 break;
1078 case SYSTEM_VALUE_INSTANCE_ID:
1079 reg->writemask = WRITEMASK_Y;
1080 break;
1081 default:
1082 assert(!"not reached");
1083 break;
1084 }
1085 break;
1086
1087 default:
1088 assert(!"not reached");
1089 }
1090
1091 reg->type = brw_type_for_base_type(ir->type);
1092 hash_table_insert(this->variable_ht, reg, ir);
1093 }
1094
1095 void
1096 vec4_visitor::visit(ir_loop *ir)
1097 {
1098 dst_reg counter;
1099
1100 /* We don't want debugging output to print the whole body of the
1101 * loop as the annotation.
1102 */
1103 this->base_ir = NULL;
1104
1105 if (ir->counter != NULL) {
1106 this->base_ir = ir->counter;
1107 ir->counter->accept(this);
1108 counter = *(variable_storage(ir->counter));
1109
1110 if (ir->from != NULL) {
1111 this->base_ir = ir->from;
1112 ir->from->accept(this);
1113
1114 emit(MOV(counter, this->result));
1115 }
1116 }
1117
1118 emit(BRW_OPCODE_DO);
1119
1120 if (ir->to) {
1121 this->base_ir = ir->to;
1122 ir->to->accept(this);
1123
1124 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1125 brw_conditional_for_comparison(ir->cmp)));
1126
1127 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1128 inst->predicate = BRW_PREDICATE_NORMAL;
1129 }
1130
1131 visit_instructions(&ir->body_instructions);
1132
1133
1134 if (ir->increment) {
1135 this->base_ir = ir->increment;
1136 ir->increment->accept(this);
1137 emit(ADD(counter, src_reg(counter), this->result));
1138 }
1139
1140 emit(BRW_OPCODE_WHILE);
1141 }
1142
1143 void
1144 vec4_visitor::visit(ir_loop_jump *ir)
1145 {
1146 switch (ir->mode) {
1147 case ir_loop_jump::jump_break:
1148 emit(BRW_OPCODE_BREAK);
1149 break;
1150 case ir_loop_jump::jump_continue:
1151 emit(BRW_OPCODE_CONTINUE);
1152 break;
1153 }
1154 }
1155
1156
1157 void
1158 vec4_visitor::visit(ir_function_signature *ir)
1159 {
1160 assert(0);
1161 (void)ir;
1162 }
1163
1164 void
1165 vec4_visitor::visit(ir_function *ir)
1166 {
1167 /* Ignore function bodies other than main() -- we shouldn't see calls to
1168 * them since they should all be inlined.
1169 */
1170 if (strcmp(ir->name, "main") == 0) {
1171 const ir_function_signature *sig;
1172 exec_list empty;
1173
1174 sig = ir->matching_signature(&empty);
1175
1176 assert(sig);
1177
1178 visit_instructions(&sig->body);
1179 }
1180 }
1181
1182 bool
1183 vec4_visitor::try_emit_sat(ir_expression *ir)
1184 {
1185 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1186 if (!sat_src)
1187 return false;
1188
1189 sat_src->accept(this);
1190 src_reg src = this->result;
1191
1192 this->result = src_reg(this, ir->type);
1193 vec4_instruction *inst;
1194 inst = emit(MOV(dst_reg(this->result), src));
1195 inst->saturate = true;
1196
1197 return true;
1198 }
1199
1200 void
1201 vec4_visitor::emit_bool_comparison(unsigned int op,
1202 dst_reg dst, src_reg src0, src_reg src1)
1203 {
1204 /* original gen4 does destination conversion before comparison. */
1205 if (intel->gen < 5)
1206 dst.type = src0.type;
1207
1208 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1209
1210 dst.type = BRW_REGISTER_TYPE_D;
1211 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1212 }
1213
1214 void
1215 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1216 src_reg src0, src_reg src1)
1217 {
1218 vec4_instruction *inst;
1219
1220 if (intel->gen >= 6) {
1221 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1222 inst->conditional_mod = conditionalmod;
1223 } else {
1224 emit(CMP(dst, src0, src1, conditionalmod));
1225
1226 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1227 inst->predicate = BRW_PREDICATE_NORMAL;
1228 }
1229 }
1230
1231 void
1232 vec4_visitor::visit(ir_expression *ir)
1233 {
1234 unsigned int operand;
1235 src_reg op[Elements(ir->operands)];
1236 src_reg result_src;
1237 dst_reg result_dst;
1238 vec4_instruction *inst;
1239
1240 if (try_emit_sat(ir))
1241 return;
1242
1243 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1244 this->result.file = BAD_FILE;
1245 ir->operands[operand]->accept(this);
1246 if (this->result.file == BAD_FILE) {
1247 printf("Failed to get tree for expression operand:\n");
1248 ir->operands[operand]->print();
1249 exit(1);
1250 }
1251 op[operand] = this->result;
1252
1253 /* Matrix expression operands should have been broken down to vector
1254 * operations already.
1255 */
1256 assert(!ir->operands[operand]->type->is_matrix());
1257 }
1258
1259 int vector_elements = ir->operands[0]->type->vector_elements;
1260 if (ir->operands[1]) {
1261 vector_elements = MAX2(vector_elements,
1262 ir->operands[1]->type->vector_elements);
1263 }
1264
1265 this->result.file = BAD_FILE;
1266
1267 /* Storage for our result. Ideally for an assignment we'd be using
1268 * the actual storage for the result here, instead.
1269 */
1270 result_src = src_reg(this, ir->type);
1271 /* convenience for the emit functions below. */
1272 result_dst = dst_reg(result_src);
1273 /* If nothing special happens, this is the result. */
1274 this->result = result_src;
1275 /* Limit writes to the channels that will be used by result_src later.
1276 * This does limit this temp's use as a temporary for multi-instruction
1277 * sequences.
1278 */
1279 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1280
1281 switch (ir->operation) {
1282 case ir_unop_logic_not:
1283 /* Note that BRW_OPCODE_NOT is not appropriate here, since it takes the
1284 * ones' complement of the whole register, not just bit 0.
1285 */
1286 emit(XOR(result_dst, op[0], src_reg(1)));
1287 break;
1288 case ir_unop_neg:
1289 op[0].negate = !op[0].negate;
1290 this->result = op[0];
1291 break;
1292 case ir_unop_abs:
1293 op[0].abs = true;
1294 op[0].negate = false;
1295 this->result = op[0];
1296 break;
1297
1298 case ir_unop_sign:
1299 emit(MOV(result_dst, src_reg(0.0f)));
1300
1301 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1302 inst = emit(MOV(result_dst, src_reg(1.0f)));
1303 inst->predicate = BRW_PREDICATE_NORMAL;
1304
1305 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1306 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1307 inst->predicate = BRW_PREDICATE_NORMAL;
1308
1309 break;
1310
1311 case ir_unop_rcp:
1312 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1313 break;
1314
1315 case ir_unop_exp2:
1316 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1317 break;
1318 case ir_unop_log2:
1319 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1320 break;
1321 case ir_unop_exp:
1322 case ir_unop_log:
1323 assert(!"not reached: should be handled by ir_explog_to_explog2");
1324 break;
1325 case ir_unop_sin:
1326 case ir_unop_sin_reduced:
1327 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1328 break;
1329 case ir_unop_cos:
1330 case ir_unop_cos_reduced:
1331 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1332 break;
1333
1334 case ir_unop_dFdx:
1335 case ir_unop_dFdy:
1336 assert(!"derivatives not valid in vertex shader");
1337 break;
1338
1339 case ir_unop_noise:
1340 assert(!"not reached: should be handled by lower_noise");
1341 break;
1342
1343 case ir_binop_add:
1344 emit(ADD(result_dst, op[0], op[1]));
1345 break;
1346 case ir_binop_sub:
1347 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1348 break;
1349
1350 case ir_binop_mul:
1351 if (ir->type->is_integer()) {
1352 /* For integer multiplication, the MUL uses the low 16 bits
1353 * of one of the operands (src0 on gen6, src1 on gen7). The
1354 * MACH accumulates in the contribution of the upper 16 bits
1355 * of that operand.
1356 *
1357 * FINISHME: Emit just the MUL if we know an operand is small
1358 * enough.
1359 */
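/* Illustrative identity (mod 2^32) behind the MUL/MACH split:
 *   a * b == a * (b & 0xffff) + ((a * (b >> 16)) << 16)
 * MUL produces the first partial product and MACH folds in the second,
 * leaving the low 32 bits of the full product in the accumulator for the
 * MOV below to pick up.
 */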
1360 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1361
1362 emit(MUL(acc, op[0], op[1]));
1363 emit(MACH(dst_null_d(), op[0], op[1]));
1364 emit(MOV(result_dst, src_reg(acc)));
1365 } else {
1366 emit(MUL(result_dst, op[0], op[1]));
1367 }
1368 break;
1369 case ir_binop_div:
1370 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1371 assert(ir->type->is_integer());
1372 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1373 break;
1374 case ir_binop_mod:
1375 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1376 assert(ir->type->is_integer());
1377 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1378 break;
1379
1380 case ir_binop_less:
1381 case ir_binop_greater:
1382 case ir_binop_lequal:
1383 case ir_binop_gequal:
1384 case ir_binop_equal:
1385 case ir_binop_nequal: {
1386 emit(CMP(result_dst, op[0], op[1],
1387 brw_conditional_for_comparison(ir->operation)));
1388 emit(AND(result_dst, result_src, src_reg(0x1)));
1389 break;
1390 }
1391
1392 case ir_binop_all_equal:
1393 /* "==" operator producing a scalar boolean. */
1394 if (ir->operands[0]->type->is_vector() ||
1395 ir->operands[1]->type->is_vector()) {
1396 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1397 emit(MOV(result_dst, src_reg(0)));
1398 inst = emit(MOV(result_dst, src_reg(1)));
1399 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1400 } else {
1401 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1402 emit(AND(result_dst, result_src, src_reg(0x1)));
1403 }
1404 break;
1405 case ir_binop_any_nequal:
1406 /* "!=" operator producing a scalar boolean. */
1407 if (ir->operands[0]->type->is_vector() ||
1408 ir->operands[1]->type->is_vector()) {
1409 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1410
1411 emit(MOV(result_dst, src_reg(0)));
1412 inst = emit(MOV(result_dst, src_reg(1)));
1413 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1414 } else {
1415 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1416 emit(AND(result_dst, result_src, src_reg(0x1)));
1417 }
1418 break;
1419
1420 case ir_unop_any:
1421 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1422 emit(MOV(result_dst, src_reg(0)));
1423
1424 inst = emit(MOV(result_dst, src_reg(1)));
1425 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1426 break;
1427
1428 case ir_binop_logic_xor:
1429 emit(XOR(result_dst, op[0], op[1]));
1430 break;
1431
1432 case ir_binop_logic_or:
1433 emit(OR(result_dst, op[0], op[1]));
1434 break;
1435
1436 case ir_binop_logic_and:
1437 emit(AND(result_dst, op[0], op[1]));
1438 break;
1439
1440 case ir_binop_dot:
1441 assert(ir->operands[0]->type->is_vector());
1442 assert(ir->operands[0]->type == ir->operands[1]->type);
1443 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1444 break;
1445
1446 case ir_unop_sqrt:
1447 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1448 break;
1449 case ir_unop_rsq:
1450 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1451 break;
1452
1453 case ir_unop_bitcast_i2f:
1454 case ir_unop_bitcast_u2f:
1455 this->result = op[0];
1456 this->result.type = BRW_REGISTER_TYPE_F;
1457 break;
1458
1459 case ir_unop_bitcast_f2i:
1460 this->result = op[0];
1461 this->result.type = BRW_REGISTER_TYPE_D;
1462 break;
1463
1464 case ir_unop_bitcast_f2u:
1465 this->result = op[0];
1466 this->result.type = BRW_REGISTER_TYPE_UD;
1467 break;
1468
1469 case ir_unop_i2f:
1470 case ir_unop_i2u:
1471 case ir_unop_u2i:
1472 case ir_unop_u2f:
1473 case ir_unop_b2f:
1474 case ir_unop_b2i:
1475 case ir_unop_f2i:
1476 case ir_unop_f2u:
1477 emit(MOV(result_dst, op[0]));
1478 break;
1479 case ir_unop_f2b:
1480 case ir_unop_i2b: {
1481 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1482 emit(AND(result_dst, result_src, src_reg(1)));
1483 break;
1484 }
1485
1486 case ir_unop_trunc:
1487 emit(RNDZ(result_dst, op[0]));
1488 break;
1489 case ir_unop_ceil:
1490 op[0].negate = !op[0].negate;
1491 inst = emit(RNDD(result_dst, op[0]));
1492 this->result.negate = true;
1493 break;
1494 case ir_unop_floor:
1495 inst = emit(RNDD(result_dst, op[0]));
1496 break;
1497 case ir_unop_fract:
1498 inst = emit(FRC(result_dst, op[0]));
1499 break;
1500 case ir_unop_round_even:
1501 emit(RNDE(result_dst, op[0]));
1502 break;
1503
1504 case ir_binop_min:
1505 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1506 break;
1507 case ir_binop_max:
1508 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1509 break;
1510
1511 case ir_binop_pow:
1512 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1513 break;
1514
1515 case ir_unop_bit_not:
1516 inst = emit(NOT(result_dst, op[0]));
1517 break;
1518 case ir_binop_bit_and:
1519 inst = emit(AND(result_dst, op[0], op[1]));
1520 break;
1521 case ir_binop_bit_xor:
1522 inst = emit(XOR(result_dst, op[0], op[1]));
1523 break;
1524 case ir_binop_bit_or:
1525 inst = emit(OR(result_dst, op[0], op[1]));
1526 break;
1527
1528 case ir_binop_lshift:
1529 inst = emit(SHL(result_dst, op[0], op[1]));
1530 break;
1531
1532 case ir_binop_rshift:
1533 if (ir->type->base_type == GLSL_TYPE_INT)
1534 inst = emit(ASR(result_dst, op[0], op[1]));
1535 else
1536 inst = emit(SHR(result_dst, op[0], op[1]));
1537 break;
1538
1539 case ir_binop_ubo_load: {
1540 ir_constant *uniform_block = ir->operands[0]->as_constant();
1541 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1542 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1543 src_reg offset = op[1];
1544
1545 /* Now, load the vector from that offset. */
1546 assert(ir->type->is_vector() || ir->type->is_scalar());
1547
1548 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1549 packed_consts.type = result.type;
1550 src_reg surf_index =
1551 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1552 if (const_offset_ir) {
1553 offset = src_reg(const_offset / 16);
1554 } else {
1555 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1556 }
1557
1558 vec4_instruction *pull =
1559 emit(new(mem_ctx) vec4_instruction(this,
1560 VS_OPCODE_PULL_CONSTANT_LOAD,
1561 dst_reg(packed_consts),
1562 surf_index,
1563 offset));
1564 pull->base_mrf = 14;
1565 pull->mlen = 1;
1566
1567 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1568 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1569 const_offset % 16 / 4,
1570 const_offset % 16 / 4,
1571 const_offset % 16 / 4);
1572
1573 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1574 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1575 emit(CMP(result_dst, packed_consts, src_reg(0u),
1576 BRW_CONDITIONAL_NZ));
1577 emit(AND(result_dst, result, src_reg(0x1)));
1578 } else {
1579 emit(MOV(result_dst, packed_consts));
1580 }
1581 break;
1582 }
1583
1584 case ir_quadop_vector:
1585 assert(!"not reached: should be handled by lower_quadop_vector");
1586 break;
1587
1588 case ir_unop_pack_half_2x16:
1589 emit_pack_half_2x16(result_dst, op[0]);
1590 break;
1591 case ir_unop_unpack_half_2x16:
1592 emit_unpack_half_2x16(result_dst, op[0]);
1593 break;
1594 case ir_unop_pack_snorm_2x16:
1595 case ir_unop_pack_unorm_2x16:
1596 case ir_unop_unpack_snorm_2x16:
1597 case ir_unop_unpack_unorm_2x16:
1598 assert(!"not reached: should be handled by lower_packing_builtins");
1599 break;
1600 case ir_unop_unpack_half_2x16_split_x:
1601 case ir_unop_unpack_half_2x16_split_y:
1602 case ir_binop_pack_half_2x16_split:
1603 assert(!"not reached: should not occur in vertex shader");
1604 break;
1605 }
1606 }
1607
1608
1609 void
1610 vec4_visitor::visit(ir_swizzle *ir)
1611 {
1612 src_reg src;
1613 int i = 0;
1614 int swizzle[4];
1615
1616 /* Note that this is only swizzles in expressions, not those on the left
1617 * hand side of an assignment, which do write masking. See ir_assignment
1618 * for that.
1619 */
1620
1621 ir->val->accept(this);
1622 src = this->result;
1623 assert(src.file != BAD_FILE);
1624
1625 for (i = 0; i < ir->type->vector_elements; i++) {
1626 switch (i) {
1627 case 0:
1628 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1629 break;
1630 case 1:
1631 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1632 break;
1633 case 2:
1634 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1635 break;
1636 case 3:
1637 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1638 break;
1639 }
1640 }
1641 for (; i < 4; i++) {
1642 /* Replicate the last channel out. */
1643 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1644 }
1645
1646 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1647
1648 this->result = src;
1649 }
1650
1651 void
1652 vec4_visitor::visit(ir_dereference_variable *ir)
1653 {
1654 const struct glsl_type *type = ir->type;
1655 dst_reg *reg = variable_storage(ir->var);
1656
1657 if (!reg) {
1658 fail("Failed to find variable storage for %s\n", ir->var->name);
1659 this->result = src_reg(brw_null_reg());
1660 return;
1661 }
1662
1663 this->result = src_reg(*reg);
1664
1665 /* System values get their swizzle from the dst_reg writemask */
1666 if (ir->var->mode == ir_var_system_value)
1667 return;
1668
1669 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1670 this->result.swizzle = swizzle_for_size(type->vector_elements);
1671 }
1672
1673 void
1674 vec4_visitor::visit(ir_dereference_array *ir)
1675 {
1676 ir_constant *constant_index;
1677 src_reg src;
1678 int element_size = type_size(ir->type);
1679
1680 constant_index = ir->array_index->constant_expression_value();
1681
1682 ir->array->accept(this);
1683 src = this->result;
1684
1685 if (constant_index) {
1686 src.reg_offset += constant_index->value.i[0] * element_size;
1687 } else {
1688 /* Variable index array dereference. It eats the "vec4" of the
1689 * base of the array and an index that offsets the Mesa register
1690 * index.
1691 */
1692 ir->array_index->accept(this);
1693
1694 src_reg index_reg;
1695
1696 if (element_size == 1) {
1697 index_reg = this->result;
1698 } else {
1699 index_reg = src_reg(this, glsl_type::int_type);
1700
1701 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1702 }
1703
1704 if (src.reladdr) {
1705 src_reg temp = src_reg(this, glsl_type::int_type);
1706
1707 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1708
1709 index_reg = temp;
1710 }
1711
1712 src.reladdr = ralloc(mem_ctx, src_reg);
1713 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1714 }
1715
1716 /* If the type is smaller than a vec4, replicate the last channel out. */
1717 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1718 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1719 else
1720 src.swizzle = BRW_SWIZZLE_NOOP;
1721 src.type = brw_type_for_base_type(ir->type);
1722
1723 this->result = src;
1724 }
1725
1726 void
1727 vec4_visitor::visit(ir_dereference_record *ir)
1728 {
1729 unsigned int i;
1730 const glsl_type *struct_type = ir->record->type;
1731 int offset = 0;
1732
1733 ir->record->accept(this);
1734
1735 for (i = 0; i < struct_type->length; i++) {
1736 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1737 break;
1738 offset += type_size(struct_type->fields.structure[i].type);
1739 }
1740
1741 /* If the type is smaller than a vec4, replicate the last channel out. */
1742 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1743 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1744 else
1745 this->result.swizzle = BRW_SWIZZLE_NOOP;
1746 this->result.type = brw_type_for_base_type(ir->type);
1747
1748 this->result.reg_offset += offset;
1749 }
1750
1751 /**
1752 * We want to be careful in assignment setup to hit the actual storage
1753 * instead of potentially using a temporary like we might with the
1754 * ir_dereference handler.
1755 */
1756 static dst_reg
1757 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1758 {
1759 /* The LHS must be a dereference. If the LHS is a variable indexed array
1760 * access of a vector, it must be separated into a series of conditional moves
1761 * before reaching this point (see ir_vec_index_to_cond_assign).
1762 */
1763 assert(ir->as_dereference());
1764 ir_dereference_array *deref_array = ir->as_dereference_array();
1765 if (deref_array) {
1766 assert(!deref_array->array->type->is_vector());
1767 }
1768
1769 /* Use the rvalue deref handler for the most part. We'll ignore
1770 * swizzles in it and write swizzles using writemask, though.
1771 */
1772 ir->accept(v);
1773 return dst_reg(v->result);
1774 }
1775
1776 void
1777 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1778 const struct glsl_type *type, uint32_t predicate)
1779 {
1780 if (type->base_type == GLSL_TYPE_STRUCT) {
1781 for (unsigned int i = 0; i < type->length; i++) {
1782 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1783 }
1784 return;
1785 }
1786
1787 if (type->is_array()) {
1788 for (unsigned int i = 0; i < type->length; i++) {
1789 emit_block_move(dst, src, type->fields.array, predicate);
1790 }
1791 return;
1792 }
1793
1794 if (type->is_matrix()) {
1795 const struct glsl_type *vec_type;
1796
1797 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1798 type->vector_elements, 1);
1799
1800 for (int i = 0; i < type->matrix_columns; i++) {
1801 emit_block_move(dst, src, vec_type, predicate);
1802 }
1803 return;
1804 }
1805
1806 assert(type->is_scalar() || type->is_vector());
1807
1808 dst->type = brw_type_for_base_type(type);
1809 src->type = dst->type;
1810
1811 dst->writemask = (1 << type->vector_elements) - 1;
1812
1813 src->swizzle = swizzle_for_size(type->vector_elements);
1814
1815 vec4_instruction *inst = emit(MOV(*dst, *src));
1816 inst->predicate = predicate;
1817
1818 dst->reg_offset++;
1819 src->reg_offset++;
1820 }
1821
1822
1823 /* If the RHS processing resulted in an instruction generating a
1824 * temporary value, and it would be easy to rewrite the instruction to
1825 * generate its result right into the LHS instead, do so. This ends
1826 * up reliably removing instructions where it can be tricky to do so
1827 * later without real UD chain information.
1828 */
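/* Illustrative example: for "v = a + b;" the RHS generates an ADD into a
 * fresh temporary; when the checks below pass, that ADD is retargeted to
 * write v's register directly and the trailing MOV is never emitted.
 */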
1829 bool
1830 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1831 dst_reg dst,
1832 src_reg src,
1833 vec4_instruction *pre_rhs_inst,
1834 vec4_instruction *last_rhs_inst)
1835 {
1836 /* This could be supported, but it would take more smarts. */
1837 if (ir->condition)
1838 return false;
1839
1840 if (pre_rhs_inst == last_rhs_inst)
1841 return false; /* No instructions generated to work with. */
1842
1843 /* Make sure the last instruction generated our source reg. */
1844 if (src.file != GRF ||
1845 src.file != last_rhs_inst->dst.file ||
1846 src.reg != last_rhs_inst->dst.reg ||
1847 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1848 src.reladdr ||
1849 src.abs ||
1850 src.negate ||
1851 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1852 return false;
1853
1854 /* Check that that last instruction fully initialized the channels
1855 * we want to use, in the order we want to use them. We could
1856 * potentially reswizzle the operands of many instructions so that
1857 * we could handle out of order channels, but don't yet.
1858 */
1859
1860 for (unsigned i = 0; i < 4; i++) {
1861 if (dst.writemask & (1 << i)) {
1862 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1863 return false;
1864
1865 if (BRW_GET_SWZ(src.swizzle, i) != i)
1866 return false;
1867 }
1868 }
1869
1870 /* Success! Rewrite the instruction. */
1871 last_rhs_inst->dst.file = dst.file;
1872 last_rhs_inst->dst.reg = dst.reg;
1873 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1874 last_rhs_inst->dst.reladdr = dst.reladdr;
1875 last_rhs_inst->dst.writemask &= dst.writemask;
1876
1877 return true;
1878 }
1879
1880 void
1881 vec4_visitor::visit(ir_assignment *ir)
1882 {
1883 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1884 uint32_t predicate = BRW_PREDICATE_NONE;
1885
1886 if (!ir->lhs->type->is_scalar() &&
1887 !ir->lhs->type->is_vector()) {
1888 ir->rhs->accept(this);
1889 src_reg src = this->result;
1890
1891 if (ir->condition) {
1892 emit_bool_to_cond_code(ir->condition, &predicate);
1893 }
1894
1895 /* emit_block_move doesn't account for swizzles in the source register.
1896 * This should be ok, since the source register is a structure or an
1897 * array, and those can't be swizzled. But double-check to be sure.
1898 */
1899 assert(src.swizzle ==
1900 (ir->rhs->type->is_matrix()
1901 ? swizzle_for_size(ir->rhs->type->vector_elements)
1902 : BRW_SWIZZLE_NOOP));
1903
1904 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1905 return;
1906 }
1907
1908 /* Now we're down to just a scalar/vector with writemasks. */
1909 int i;
1910
1911 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1912 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1913
1914 ir->rhs->accept(this);
1915
1916 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1917
1918 src_reg src = this->result;
1919
1920 int swizzles[4];
1921 int first_enabled_chan = 0;
1922 int src_chan = 0;
1923
1924 assert(ir->lhs->type->is_vector() ||
1925 ir->lhs->type->is_scalar());
1926 dst.writemask = ir->write_mask;
1927
1928 for (int i = 0; i < 4; i++) {
1929 if (dst.writemask & (1 << i)) {
1930 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1931 break;
1932 }
1933 }
1934
1935 /* Swizzle a small RHS vector into the channels being written.
1936 *
1937 * GLSL IR treats write_mask as dictating how many channels are
1938 * present on the RHS, while our instructions need those channels to
1939 * appear in the slots of the vec4 they're written to.
1940 */
1941 for (int i = 0; i < 4; i++) {
1942 if (dst.writemask & (1 << i))
1943 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1944 else
1945 swizzles[i] = first_enabled_chan;
1946 }
1947 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1948 swizzles[2], swizzles[3]);
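/* For example, `v.yz = u;` with a vec2 RHS (result swizzle .xyyy) yields
 * swizzles {y, x, y, y}: the MOV below reads u.x into v.y and u.y into v.z,
 * and the unwritten slots point at an already-initialized channel.
 */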
1949
1950 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1951 return;
1952 }
1953
1954 if (ir->condition) {
1955 emit_bool_to_cond_code(ir->condition, &predicate);
1956 }
1957
1958 for (i = 0; i < type_size(ir->lhs->type); i++) {
1959 vec4_instruction *inst = emit(MOV(dst, src));
1960 inst->predicate = predicate;
1961
1962 dst.reg_offset++;
1963 src.reg_offset++;
1964 }
1965 }
1966
1967 void
1968 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1969 {
1970 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1971 foreach_list(node, &ir->components) {
1972 ir_constant *field_value = (ir_constant *)node;
1973
1974 emit_constant_values(dst, field_value);
1975 }
1976 return;
1977 }
1978
1979 if (ir->type->is_array()) {
1980 for (unsigned int i = 0; i < ir->type->length; i++) {
1981 emit_constant_values(dst, ir->array_elements[i]);
1982 }
1983 return;
1984 }
1985
1986 if (ir->type->is_matrix()) {
1987 for (int i = 0; i < ir->type->matrix_columns; i++) {
1988 float *vec = &ir->value.f[i * ir->type->vector_elements];
1989
1990 for (int j = 0; j < ir->type->vector_elements; j++) {
1991 dst->writemask = 1 << j;
1992 dst->type = BRW_REGISTER_TYPE_F;
1993
1994 emit(MOV(*dst, src_reg(vec[j])));
1995 }
1996 dst->reg_offset++;
1997 }
1998 return;
1999 }
2000
2001 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2002
2003 for (int i = 0; i < ir->type->vector_elements; i++) {
2004 if (!(remaining_writemask & (1 << i)))
2005 continue;
2006
2007 dst->writemask = 1 << i;
2008 dst->type = brw_type_for_base_type(ir->type);
2009
2010 /* Find other components that match the one we're about to
2011 * write. Emits fewer instructions for things like vec4(0.5,
2012 * 1.5, 1.5, 1.5).
2013 */
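/* For vec4(0.5, 1.5, 1.5, 1.5) this collapses to MOV dst.x, 0.5f plus a
 * single MOV dst.yzw, 1.5f.
 */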
2014 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2015 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2016 if (ir->value.b[i] == ir->value.b[j])
2017 dst->writemask |= (1 << j);
2018 } else {
2019 /* u, i, and f storage all line up, so no need for a
2020 * switch case for comparing each type.
2021 */
2022 if (ir->value.u[i] == ir->value.u[j])
2023 dst->writemask |= (1 << j);
2024 }
2025 }
2026
2027 switch (ir->type->base_type) {
2028 case GLSL_TYPE_FLOAT:
2029 emit(MOV(*dst, src_reg(ir->value.f[i])));
2030 break;
2031 case GLSL_TYPE_INT:
2032 emit(MOV(*dst, src_reg(ir->value.i[i])));
2033 break;
2034 case GLSL_TYPE_UINT:
2035 emit(MOV(*dst, src_reg(ir->value.u[i])));
2036 break;
2037 case GLSL_TYPE_BOOL:
2038 emit(MOV(*dst, src_reg(ir->value.b[i])));
2039 break;
2040 default:
2041 assert(!"Non-float/uint/int/bool constant");
2042 break;
2043 }
2044
2045 remaining_writemask &= ~dst->writemask;
2046 }
2047 dst->reg_offset++;
2048 }
2049
2050 void
2051 vec4_visitor::visit(ir_constant *ir)
2052 {
2053 dst_reg dst = dst_reg(this, ir->type);
2054 this->result = src_reg(dst);
2055
2056 emit_constant_values(&dst, ir);
2057 }
2058
2059 void
2060 vec4_visitor::visit(ir_call *ir)
2061 {
2062 assert(!"not reached");
2063 }
2064
2065 void
2066 vec4_visitor::visit(ir_texture *ir)
2067 {
2068 int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
2069
2070 /* Should be lowered by do_lower_texture_projection */
2071 assert(!ir->projector);
2072
2073 /* Generate code to compute all the subexpression trees. This has to be
2074 * done before loading any values into MRFs for the sampler message since
2075 * generating these values may involve SEND messages that need the MRFs.
2076 */
2077 src_reg coordinate;
2078 if (ir->coordinate) {
2079 ir->coordinate->accept(this);
2080 coordinate = this->result;
2081 }
2082
2083 src_reg shadow_comparitor;
2084 if (ir->shadow_comparitor) {
2085 ir->shadow_comparitor->accept(this);
2086 shadow_comparitor = this->result;
2087 }
2088
2089 const glsl_type *lod_type;
2090 src_reg lod, dPdx, dPdy;
2091 switch (ir->op) {
2092 case ir_tex:
2093 lod = src_reg(0.0f);
2094 lod_type = glsl_type::float_type;
2095 break;
2096 case ir_txf:
2097 case ir_txl:
2098 case ir_txs:
2099 ir->lod_info.lod->accept(this);
2100 lod = this->result;
2101 lod_type = ir->lod_info.lod->type;
2102 break;
2103 case ir_txd:
2104 ir->lod_info.grad.dPdx->accept(this);
2105 dPdx = this->result;
2106
2107 ir->lod_info.grad.dPdy->accept(this);
2108 dPdy = this->result;
2109
2110 lod_type = ir->lod_info.grad.dPdx->type;
2111 break;
2112 case ir_txb:
2113 break;
2114 }
2115
2116 vec4_instruction *inst = NULL;
2117 switch (ir->op) {
2118 case ir_tex:
2119 case ir_txl:
2120 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2121 break;
2122 case ir_txd:
2123 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2124 break;
2125 case ir_txf:
2126 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2127 break;
2128 case ir_txs:
2129 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2130 break;
2131 case ir_txb:
2132 assert(!"TXB is not valid for vertex shaders.");
2133 }
2134
2135 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2136
2137 /* Texel offsets go in the message header; Gen4 also requires headers. */
2138 inst->header_present = use_texture_offset || intel->gen < 5;
2139 inst->base_mrf = 2;
2140 inst->mlen = inst->header_present + 1; /* always at least one */
2141 inst->sampler = sampler;
2142 inst->dst = dst_reg(this, ir->type);
2143 inst->dst.writemask = WRITEMASK_XYZW;
2144 inst->shadow_compare = ir->shadow_comparitor != NULL;
2145
2146 if (use_texture_offset)
2147 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2148
2149 /* MRF for the first parameter */
2150 int param_base = inst->base_mrf + inst->header_present;
2151
2152 if (ir->op == ir_txs) {
2153 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2154 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2155 } else {
2156 int i, coord_mask = 0, zero_mask = 0;
2157 /* Load the coordinate */
2158 /* FINISHME: gl_clamp_mask and saturate */
2159 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2160 coord_mask |= (1 << i);
2161 for (; i < 4; i++)
2162 zero_mask |= (1 << i);
2163
2164 if (ir->offset && ir->op == ir_txf) {
2165 /* It appears that the ld instruction used for txf does its
2166 * address bounds check before adding in the offset. To work
2167 * around this, just add the integer offset to the integer
2168 * texel coordinate, and don't put the offset in the header.
2169 */
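/* For example, a texelFetchOffset() with a constant offset of ivec2(1, 2)
 * becomes ADD m.x, coord.x, 1 and ADD m.y, coord.y, 2 below instead of
 * placing the offset in the header.
 */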
2170 ir_constant *offset = ir->offset->as_constant();
2171 assert(offset);
2172
2173 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2174 src_reg src = coordinate;
2175 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2176 BRW_GET_SWZ(src.swizzle, j),
2177 BRW_GET_SWZ(src.swizzle, j),
2178 BRW_GET_SWZ(src.swizzle, j));
2179 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2180 src, offset->value.i[j]));
2181 }
2182 } else {
2183 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2184 coordinate));
2185 }
2186 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2187 src_reg(0)));
2188 /* Load the shadow comparator */
2189 if (ir->shadow_comparitor) {
2190 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2191 WRITEMASK_X),
2192 shadow_comparitor));
2193 inst->mlen++;
2194 }
2195
2196 /* Load the LOD info */
2197 if (ir->op == ir_tex || ir->op == ir_txl) {
2198 int mrf, writemask;
2199 if (intel->gen >= 5) {
2200 mrf = param_base + 1;
2201 if (ir->shadow_comparitor) {
2202 writemask = WRITEMASK_Y;
2203 /* mlen already incremented */
2204 } else {
2205 writemask = WRITEMASK_X;
2206 inst->mlen++;
2207 }
2208 } else /* intel->gen == 4 */ {
2209 mrf = param_base;
2210 writemask = WRITEMASK_Z;
2211 }
2212 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2213 } else if (ir->op == ir_txf) {
2214 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W),
2215 lod));
2216 } else if (ir->op == ir_txd) {
2217 const glsl_type *type = lod_type;
2218
2219 if (intel->gen >= 5) {
2220 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y);
2221 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y);
2222 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2223 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2224 inst->mlen++;
2225
2226 if (ir->type->vector_elements == 3) {
2227 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2228 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2229 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2230 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2231 inst->mlen++;
2232 }
2233 } else /* intel->gen == 4 */ {
2234 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2235 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2236 inst->mlen += 2;
2237 }
2238 }
2239 }
2240
2241 emit(inst);
2242
2243 /* Fix up the number of layers (z) for cube map arrays: the hardware
2244 * returns faces * layers, but the spec requires just layers.
2245 */
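/* For example, a cube map array with 4 layer-cubes has 24 slices; the
 * hardware reports 24 in .z, and the INT_QUOTIENT by 6 below yields the 4
 * the spec requires.
 */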
2246 if (ir->op == ir_txs) {
2247 glsl_type const *type = ir->sampler->type;
2248 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2249 type->sampler_array) {
2250 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2251 with_writemask(inst->dst, WRITEMASK_Z),
2252 src_reg(inst->dst), src_reg(6));
2253 }
2254 }
2255
2256 swizzle_result(ir, src_reg(inst->dst), sampler);
2257 }
2258
2259 void
2260 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2261 {
2262 int s = c->key.tex.swizzles[sampler];
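/* s packs four 3-bit selectors (one per result channel), taken from the
 * texture's swizzle state in the program key; SWIZZLE_ZERO and SWIZZLE_ONE
 * selectors are handled below with immediate MOVs.
 */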
2263
2264 this->result = src_reg(this, ir->type);
2265 dst_reg swizzled_result(this->result);
2266
2267 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2268 || s == SWIZZLE_NOOP) {
2269 emit(MOV(swizzled_result, orig_val));
2270 return;
2271 }
2272
2273 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2274 int swizzle[4];
2275
2276 for (int i = 0; i < 4; i++) {
2277 switch (GET_SWZ(s, i)) {
2278 case SWIZZLE_ZERO:
2279 zero_mask |= (1 << i);
2280 break;
2281 case SWIZZLE_ONE:
2282 one_mask |= (1 << i);
2283 break;
2284 default:
2285 copy_mask |= (1 << i);
2286 swizzle[i] = GET_SWZ(s, i);
2287 break;
2288 }
2289 }
2290
2291 if (copy_mask) {
2292 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2293 swizzled_result.writemask = copy_mask;
2294 emit(MOV(swizzled_result, orig_val));
2295 }
2296
2297 if (zero_mask) {
2298 swizzled_result.writemask = zero_mask;
2299 emit(MOV(swizzled_result, src_reg(0.0f)));
2300 }
2301
2302 if (one_mask) {
2303 swizzled_result.writemask = one_mask;
2304 emit(MOV(swizzled_result, src_reg(1.0f)));
2305 }
2306 }
2307
2308 void
2309 vec4_visitor::visit(ir_return *ir)
2310 {
2311 assert(!"not reached");
2312 }
2313
2314 void
2315 vec4_visitor::visit(ir_discard *ir)
2316 {
2317 assert(!"not reached");
2318 }
2319
2320 void
2321 vec4_visitor::visit(ir_if *ir)
2322 {
2323 /* Don't point the annotation at the if statement, because then it plus
2324 * the then and else blocks get printed.
2325 */
2326 this->base_ir = ir->condition;
2327
2328 if (intel->gen == 6) {
2329 emit_if_gen6(ir);
2330 } else {
2331 uint32_t predicate;
2332 emit_bool_to_cond_code(ir->condition, &predicate);
2333 emit(IF(predicate));
2334 }
2335
2336 visit_instructions(&ir->then_instructions);
2337
2338 if (!ir->else_instructions.is_empty()) {
2339 this->base_ir = ir->condition;
2340 emit(BRW_OPCODE_ELSE);
2341
2342 visit_instructions(&ir->else_instructions);
2343 }
2344
2345 this->base_ir = ir->condition;
2346 emit(BRW_OPCODE_ENDIF);
2347 }
2348
2349 void
2350 vec4_visitor::emit_ndc_computation()
2351 {
2352 /* Get the position */
2353 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2354
2355 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2356 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2357 output_reg[BRW_VERT_RESULT_NDC] = ndc;
2358
2359 current_annotation = "NDC";
2360 dst_reg ndc_w = ndc;
2361 ndc_w.writemask = WRITEMASK_W;
2362 src_reg pos_w = pos;
2363 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2364 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2365
2366 dst_reg ndc_xyz = ndc;
2367 ndc_xyz.writemask = WRITEMASK_XYZ;
2368
2369 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2370 }
2371
2372 void
2373 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2374 {
2375 if (intel->gen < 6 &&
2376 ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2377 c->key.userclip_active || brw->has_negative_rhw_bug)) {
2378 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2379 dst_reg header1_w = header1;
2380 header1_w.writemask = WRITEMASK_W;
2381 GLuint i;
2382
2383 emit(MOV(header1, 0u));
2384
2385 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2386 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2387
2388 current_annotation = "Point size";
2389 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2390 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2391 }
2392
2393 current_annotation = "Clipping flags";
2394 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2395 vec4_instruction *inst;
2396
2397 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2398 src_reg(this->userplane[i])));
2399 inst->conditional_mod = BRW_CONDITIONAL_L;
2400
2401 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2402 inst->predicate = BRW_PREDICATE_NORMAL;
2403 }
2404
2405 /* i965 clipping workaround:
2406 * 1) Test for -ve rhw
2407 * 2) If set,
2408 * set ndc = (0,0,0,0)
2409 * set ucp[6] = 1
2410 *
2411 * Later, clipping will detect ucp[6] and ensure the primitive is
2412 * clipped against all fixed planes.
2413 */
2414 if (brw->has_negative_rhw_bug) {
2415 #if 0
2416 /* FINISHME */
2417 brw_CMP(p,
2418 vec8(brw_null_reg()),
2419 BRW_CONDITIONAL_L,
2420 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2421 brw_imm_f(0));
2422
2423 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2424 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2425 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2426 #endif
2427 }
2428
2429 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2430 } else if (intel->gen < 6) {
2431 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2432 } else {
2433 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2434 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2435 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2436 src_reg(output_reg[VERT_RESULT_PSIZ])));
2437 }
2438 }
2439 }
2440
2441 void
2442 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2443 {
2444 if (intel->gen < 6) {
2445 /* Clip distance slots are set aside in gen5, but they are not used. It
2446 * is not clear whether we actually need to set aside space for them,
2447 * but the performance cost is negligible.
2448 */
2449 return;
2450 }
2451
2452 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2453 *
2454 * "If a linked set of shaders forming the vertex stage contains no
2455 * static write to gl_ClipVertex or gl_ClipDistance, but the
2456 * application has requested clipping against user clip planes through
2457 * the API, then the coordinate written to gl_Position is used for
2458 * comparison against the user clip planes."
2459 *
2460 * This function is only called if the shader didn't write to
2461 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2462 * if the user wrote to it; otherwise we use gl_Position.
2463 */
2464 gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2465 if (!(c->prog_data.outputs_written
2466 & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2467 clip_vertex = VERT_RESULT_HPOS;
2468 }
2469
2470 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2471 ++i) {
2472 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2473 src_reg(output_reg[clip_vertex]),
2474 src_reg(this->userplane[i + offset])));
2475 }
2476 }
2477
2478 void
2479 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2480 {
2481 assert (vert_result < VERT_RESULT_MAX);
2482 reg.type = output_reg[vert_result].type;
2483 current_annotation = output_reg_annotation[vert_result];
2484 /* Copy the register, saturating if necessary */
2485 vec4_instruction *inst = emit(MOV(reg,
2486 src_reg(output_reg[vert_result])));
2487 if ((vert_result == VERT_RESULT_COL0 ||
2488 vert_result == VERT_RESULT_COL1 ||
2489 vert_result == VERT_RESULT_BFC0 ||
2490 vert_result == VERT_RESULT_BFC1) &&
2491 c->key.clamp_vertex_color) {
2492 inst->saturate = true;
2493 }
2494 }
2495
2496 void
2497 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2498 {
2499 struct brw_reg hw_reg = brw_message_reg(mrf);
2500 dst_reg reg = dst_reg(MRF, mrf);
2501 reg.type = BRW_REGISTER_TYPE_F;
2502
2503 switch (vert_result) {
2504 case VERT_RESULT_PSIZ:
2505 /* PSIZ is always in slot 0, and is coupled with other flags. */
2506 current_annotation = "indices, point width, clip flags";
2507 emit_psiz_and_flags(hw_reg);
2508 break;
2509 case BRW_VERT_RESULT_NDC:
2510 current_annotation = "NDC";
2511 emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2512 break;
2513 case BRW_VERT_RESULT_HPOS_DUPLICATE:
2514 case VERT_RESULT_HPOS:
2515 current_annotation = "gl_Position";
2516 emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2517 break;
2518 case VERT_RESULT_CLIP_DIST0:
2519 case VERT_RESULT_CLIP_DIST1:
2520 if (this->c->key.uses_clip_distance) {
2521 emit_generic_urb_slot(reg, vert_result);
2522 } else {
2523 current_annotation = "user clip distances";
2524 emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2525 }
2526 break;
2527 case VERT_RESULT_EDGE:
2528 /* This is present when doing unfilled polygons. We're supposed to copy
2529 * the edge flag from the user-provided vertex array
2530 * (glEdgeFlagPointer); otherwise we copy the current value of that
2531 * attribute (which starts as 1.0f). Clipping then uses this to
2532 * determine which edges should be drawn as wireframe.
2533 */
2534 current_annotation = "edge flag";
2535 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2536 glsl_type::float_type, WRITEMASK_XYZW))));
2537 break;
2538 case BRW_VERT_RESULT_PAD:
2539 /* No need to write to this slot */
2540 break;
2541 default:
2542 emit_generic_urb_slot(reg, vert_result);
2543 break;
2544 }
2545 }
2546
2547 static int
2548 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2549 {
2550 struct intel_context *intel = &brw->intel;
2551
2552 if (intel->gen >= 6) {
2553 /* URB data written (does not include the message header reg) must
2554 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2555 * section 5.4.3.2.2: URB_INTERLEAVED.
2556 *
2557 * URB entries are allocated on a multiple of 1024 bits, so an
2558 * extra 128 bits written here to make the end align to 256 is
2559 * no problem.
2560 */
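/* For example, an mlen of 8 (header plus 7 data regs) is padded to 9 so
 * the data portion becomes 8 regs.
 */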
2561 if ((mlen % 2) != 1)
2562 mlen++;
2563 }
2564
2565 return mlen;
2566 }
2567
2568 /**
2569 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2570 * complete the VS thread.
2571 *
2572 * The VUE layout is documented in Volume 2a.
2573 */
2574 void
2575 vec4_visitor::emit_urb_writes()
2576 {
2577 /* MRF 0 is reserved for the debugger, so start with message header
2578 * in MRF 1.
2579 */
2580 int base_mrf = 1;
2581 int mrf = base_mrf;
2582 /* In the process of generating our URB write message contents, we
2583 * may need to unspill a register or load from an array. Those
2584 * reads would use MRFs 14-15.
2585 */
2586 int max_usable_mrf = 13;
2587
2588 /* The following assertion verifies that max_usable_mrf yields an
2589 * even number of URB write data registers, which meets gen6's
2590 * length alignment requirements.
2591 */
2592 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2593
2594 /* First mrf is the g0-based message header containing URB handles and such,
2595 * which is implied in VS_OPCODE_URB_WRITE.
2596 */
2597 mrf++;
2598
2599 if (intel->gen < 6) {
2600 emit_ndc_computation();
2601 }
2602
2603 /* Set up the VUE data for the first URB write */
2604 int slot;
2605 for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2606 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2607
2608 /* If this was max_usable_mrf, we can't fit anything more into this URB
2609 * WRITE.
2610 */
2611 if (mrf > max_usable_mrf) {
2612 slot++;
2613 break;
2614 }
2615 }
2616
2617 current_annotation = "URB write";
2618 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2619 inst->base_mrf = base_mrf;
2620 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2621 inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2622
2623 /* Optional second URB write */
2624 if (!inst->eot) {
2625 mrf = base_mrf + 1;
2626
2627 for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2628 assert(mrf < max_usable_mrf);
2629
2630 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2631 }
2632
2633 current_annotation = "URB write";
2634 inst = emit(VS_OPCODE_URB_WRITE);
2635 inst->base_mrf = base_mrf;
2636 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2637 inst->eot = true;
2638 /* URB destination offset. The previous write used MRFs 1-13; minus
2639 * the one header MRF, that's 12 data regs. The URB offset is in
2640 * URB row increments, and each of our MRFs is half of one of
2641 * those, since we're doing interleaved writes.
2642 */
2643 inst->offset = (max_usable_mrf - base_mrf) / 2;
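/* With base_mrf 1 and max_usable_mrf 13, that is (13 - 1) / 2 = 6 URB rows. */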
2644 }
2645 }
2646
2647 src_reg
2648 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2649 src_reg *reladdr, int reg_offset)
2650 {
2651 /* Because we store the values to scratch interleaved like our
2652 * vertex data, we need to scale the vec4 index by 2.
2653 */
2654 int message_header_scale = 2;
2655
2656 /* Pre-gen6, the message header uses byte offsets instead of vec4
2657 * (16-byte) offset units.
2658 */
2659 if (intel->gen < 6)
2660 message_header_scale *= 16;
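/* For example, reg_offset 3 becomes an offset of 6 on gen6+, or 96 bytes
 * (3 * 32) before gen6.
 */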
2661
2662 if (reladdr) {
2663 src_reg index = src_reg(this, glsl_type::int_type);
2664
2665 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2666 emit_before(inst, MUL(dst_reg(index),
2667 index, src_reg(message_header_scale)));
2668
2669 return index;
2670 } else {
2671 return src_reg(reg_offset * message_header_scale);
2672 }
2673 }
2674
2675 src_reg
2676 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2677 src_reg *reladdr, int reg_offset)
2678 {
2679 if (reladdr) {
2680 src_reg index = src_reg(this, glsl_type::int_type);
2681
2682 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2683
2684 /* Pre-gen6, the message header uses byte offsets instead of vec4
2685 * (16-byte) offset units.
2686 */
2687 if (intel->gen < 6) {
2688 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2689 }
2690
2691 return index;
2692 } else {
2693 int message_header_scale = intel->gen < 6 ? 16 : 1;
2694 return src_reg(reg_offset * message_header_scale);
2695 }
2696 }
2697
2698 /**
2699 * Emits an instruction before @inst to load the value named by @orig_src
2700 * from scratch space at @base_offset to @temp.
2701 *
2702 * @base_offset is measured in 32-byte units (the size of a register).
2703 */
2704 void
2705 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2706 dst_reg temp, src_reg orig_src,
2707 int base_offset)
2708 {
2709 int reg_offset = base_offset + orig_src.reg_offset;
2710 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2711
2712 emit_before(inst, SCRATCH_READ(temp, index));
2713 }
2714
2715 /**
2716 * Emits an instruction after @inst to store the value to be written
2717 * to @orig_dst to scratch space at @base_offset, from @temp.
2718 *
2719 * @base_offset is measured in 32-byte units (the size of a register).
2720 */
2721 void
2722 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2723 {
2724 int reg_offset = base_offset + inst->dst.reg_offset;
2725 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2726
2727 /* Create a temporary register to store *inst's result in.
2728 *
2729 * We have to be careful in MOVing from our temporary result register in
2730 * the scratch write. If we swizzle from channels of the temporary that
2731 * weren't initialized, it will confuse live interval analysis, which will
2732 * make spilling fail to make progress.
2733 */
2734 src_reg temp = src_reg(this, glsl_type::vec4_type);
2735 temp.type = inst->dst.type;
2736 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2737 int swizzles[4];
2738 for (int i = 0; i < 4; i++)
2739 if (inst->dst.writemask & (1 << i))
2740 swizzles[i] = i;
2741 else
2742 swizzles[i] = first_writemask_chan;
2743 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2744 swizzles[2], swizzles[3]);
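/* For example, an instruction writing only .xz reads temp with swizzle
 * .xxzx here, so no uninitialized channel is ever referenced.
 */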
2745
2746 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2747 inst->dst.writemask));
2748 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2749 write->predicate = inst->predicate;
2750 write->ir = inst->ir;
2751 write->annotation = inst->annotation;
2752 inst->insert_after(write);
2753
2754 inst->dst.file = temp.file;
2755 inst->dst.reg = temp.reg;
2756 inst->dst.reg_offset = temp.reg_offset;
2757 inst->dst.reladdr = NULL;
2758 }
2759
2760 /**
2761 * We can't generally support array access in GRF space, because a
2762 * single instruction's destination can only span 2 contiguous
2763 * registers. So, we send all GRF arrays that get variable index
2764 * access to scratch space.
2765 */
2766 void
2767 vec4_visitor::move_grf_array_access_to_scratch()
2768 {
2769 int scratch_loc[this->virtual_grf_count];
2770
2771 for (int i = 0; i < this->virtual_grf_count; i++) {
2772 scratch_loc[i] = -1;
2773 }
2774
2775 /* First, calculate the set of virtual GRFs that need to be punted
2776 * to scratch due to having any array access on them, and where in
2777 * scratch.
2778 */
2779 foreach_list(node, &this->instructions) {
2780 vec4_instruction *inst = (vec4_instruction *)node;
2781
2782 if (inst->dst.file == GRF && inst->dst.reladdr &&
2783 scratch_loc[inst->dst.reg] == -1) {
2784 scratch_loc[inst->dst.reg] = c->last_scratch;
2785 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2786 }
2787
2788 for (int i = 0 ; i < 3; i++) {
2789 src_reg *src = &inst->src[i];
2790
2791 if (src->file == GRF && src->reladdr &&
2792 scratch_loc[src->reg] == -1) {
2793 scratch_loc[src->reg] = c->last_scratch;
2794 c->last_scratch += this->virtual_grf_sizes[src->reg];
2795 }
2796 }
2797 }
2798
2799 /* Now, for anything that will be accessed through scratch, rewrite
2800 * it to load/store. Note that this is a _safe list walk, because
2801 * we may generate a new scratch_write instruction after the one
2802 * we're processing.
2803 */
2804 foreach_list_safe(node, &this->instructions) {
2805 vec4_instruction *inst = (vec4_instruction *)node;
2806
2807 /* Set up the annotation tracking for new generated instructions. */
2808 base_ir = inst->ir;
2809 current_annotation = inst->annotation;
2810
2811 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2812 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2813 }
2814
2815 for (int i = 0 ; i < 3; i++) {
2816 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2817 continue;
2818
2819 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2820
2821 emit_scratch_read(inst, temp, inst->src[i],
2822 scratch_loc[inst->src[i].reg]);
2823
2824 inst->src[i].file = temp.file;
2825 inst->src[i].reg = temp.reg;
2826 inst->src[i].reg_offset = temp.reg_offset;
2827 inst->src[i].reladdr = NULL;
2828 }
2829 }
2830 }
2831
2832 /**
2833 * Emits an instruction before @inst to load the value named by @orig_src
2834 * from the pull constant buffer (surface) at @base_offset to @temp.
2835 */
2836 void
2837 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2838 dst_reg temp, src_reg orig_src,
2839 int base_offset)
2840 {
2841 int reg_offset = base_offset + orig_src.reg_offset;
2842 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2843 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2844 vec4_instruction *load;
2845
2846 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2847 temp, index, offset);
2848 load->base_mrf = 14;
2849 load->mlen = 1;
2850 emit_before(inst, load);
2851 }
2852
2853 /**
2854 * Implements array access of uniforms by inserting a
2855 * PULL_CONSTANT_LOAD instruction.
2856 *
2857 * Unlike temporary GRF array access (which we don't support, due to
2858 * the difficulty of doing relative addressing on instruction
2859 * destinations), we could potentially do array access of uniforms
2860 * that were loaded in GRF space as push constants. In real-world
2861 * usage we've seen, though, the arrays being used are always larger
2862 * than we could load as push constants, so just always move all
2863 * uniform array access out to a pull constant buffer.
2864 */
2865 void
2866 vec4_visitor::move_uniform_array_access_to_pull_constants()
2867 {
2868 int pull_constant_loc[this->uniforms];
2869
2870 for (int i = 0; i < this->uniforms; i++) {
2871 pull_constant_loc[i] = -1;
2872 }
2873
2874 /* Walk through and find array access of uniforms. Put a copy of that
2875 * uniform in the pull constant buffer.
2876 *
2877 * Note that we don't move constant-indexed accesses to arrays. No
2878 * testing has been done of the performance impact of this choice.
2879 */
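/* For example, indexing `uniform vec4 colors[64]` with a variable copies the
 * whole array into prog_data->pull_param and replaces each reladdr access
 * with a VS_OPCODE_PULL_CONSTANT_LOAD from the constant surface.
 */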
2880 foreach_list_safe(node, &this->instructions) {
2881 vec4_instruction *inst = (vec4_instruction *)node;
2882
2883 for (int i = 0 ; i < 3; i++) {
2884 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2885 continue;
2886
2887 int uniform = inst->src[i].reg;
2888
2889 /* If this array isn't already present in the pull constant buffer,
2890 * add it.
2891 */
2892 if (pull_constant_loc[uniform] == -1) {
2893 const float **values = &prog_data->param[uniform * 4];
2894
2895 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2896
2897 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2898 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2899 }
2900 }
2901
2902 /* Set up the annotation tracking for new generated instructions. */
2903 base_ir = inst->ir;
2904 current_annotation = inst->annotation;
2905
2906 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2907
2908 emit_pull_constant_load(inst, temp, inst->src[i],
2909 pull_constant_loc[uniform]);
2910
2911 inst->src[i].file = temp.file;
2912 inst->src[i].reg = temp.reg;
2913 inst->src[i].reg_offset = temp.reg_offset;
2914 inst->src[i].reladdr = NULL;
2915 }
2916 }
2917
2918 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2919 * no need to track them as larger-than-vec4 objects. This will be
2920 * relied on in cutting out unused uniform vectors from push
2921 * constants.
2922 */
2923 split_uniform_registers();
2924 }
2925
2926 void
2927 vec4_visitor::resolve_ud_negate(src_reg *reg)
2928 {
2929 if (reg->type != BRW_REGISTER_TYPE_UD ||
2930 !reg->negate)
2931 return;
2932
2933 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2934 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2935 *reg = temp;
2936 }
2937
2938 vec4_visitor::vec4_visitor(struct brw_context *brw,
2939 struct brw_vs_compile *c,
2940 struct gl_shader_program *prog,
2941 struct brw_shader *shader,
2942 void *mem_ctx)
2943 {
2944 this->c = c;
2945 this->brw = brw;
2946 this->intel = &brw->intel;
2947 this->ctx = &intel->ctx;
2948 this->prog = prog;
2949 this->shader = shader;
2950
2951 this->mem_ctx = mem_ctx;
2952 this->failed = false;
2953
2954 this->base_ir = NULL;
2955 this->current_annotation = NULL;
2956 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
2957
2958 this->c = c;
2959 this->vp = &c->vp->program;
2960 this->prog_data = &c->prog_data;
2961
2962 this->variable_ht = hash_table_ctor(0,
2963 hash_table_pointer_hash,
2964 hash_table_pointer_compare);
2965
2966 this->virtual_grf_def = NULL;
2967 this->virtual_grf_use = NULL;
2968 this->virtual_grf_sizes = NULL;
2969 this->virtual_grf_count = 0;
2970 this->virtual_grf_reg_map = NULL;
2971 this->virtual_grf_reg_count = 0;
2972 this->virtual_grf_array_size = 0;
2973 this->live_intervals_valid = false;
2974
2975 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2976
2977 this->uniforms = 0;
2978 }
2979
2980 vec4_visitor::~vec4_visitor()
2981 {
2982 hash_table_dtor(this->variable_ht);
2983 }
2984
2985
2986 void
2987 vec4_visitor::fail(const char *format, ...)
2988 {
2989 va_list va;
2990 char *msg;
2991
2992 if (failed)
2993 return;
2994
2995 failed = true;
2996
2997 va_start(va, format);
2998 msg = ralloc_vasprintf(mem_ctx, format, va);
2999 va_end(va);
3000 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3001
3002 this->fail_msg = msg;
3003
3004 if (INTEL_DEBUG & DEBUG_VS) {
3005 fprintf(stderr, "%s", msg);
3006 }
3007 }
3008
3009 } /* namespace brw */