16a188fd30e068f09960d10b18fcc131422cc8e6
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->sampler = 0;
47 this->texture_offset = 0;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = v->base_ir;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_present = false;
53 this->mlen = 0;
54 this->base_mrf = 0;
55 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(dst_reg dst, src_reg src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
123 { \
124 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
125 src0, src1); \
126 }
127
128 #define ALU3(op) \
129 vec4_instruction * \
130 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
131 { \
132 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
133 src0, src1, src2); \
134 }
135
136 ALU1(NOT)
137 ALU1(MOV)
138 ALU1(FRC)
139 ALU1(RNDD)
140 ALU1(RNDE)
141 ALU1(RNDZ)
142 ALU1(F32TO16)
143 ALU1(F16TO32)
144 ALU2(ADD)
145 ALU2(MUL)
146 ALU2(MACH)
147 ALU2(AND)
148 ALU2(OR)
149 ALU2(XOR)
150 ALU2(DP3)
151 ALU2(DP4)
152 ALU2(DPH)
153 ALU2(SHL)
154 ALU2(SHR)
155 ALU2(ASR)
156 ALU3(LRP)
157 ALU1(BFREV)
158 ALU3(BFE)
159 ALU2(BFI1)
160 ALU3(BFI2)
161 ALU1(FBH)
162 ALU1(FBL)
163 ALU1(CBIT)
164 ALU3(MAD)
165 ALU2(ADDC)
166 ALU2(SUBB)
167
168 /** Gen4 predicated IF. */
169 vec4_instruction *
170 vec4_visitor::IF(uint32_t predicate)
171 {
172 vec4_instruction *inst;
173
174 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
175 inst->predicate = predicate;
176
177 return inst;
178 }
179
180 /** Gen6+ IF with embedded comparison. */
181 vec4_instruction *
182 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
183 {
184 assert(brw->gen >= 6);
185
186 vec4_instruction *inst;
187
188 resolve_ud_negate(&src0);
189 resolve_ud_negate(&src1);
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
192 src0, src1);
193 inst->conditional_mod = condition;
194
195 return inst;
196 }
197
198 /**
199 * CMP: Sets the low bit of the destination channels with the result
200 * of the comparison, while the upper bits are undefined, and updates
201 * the flag register with the packed 16 bits of the result.
202 */
203 vec4_instruction *
204 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
205 {
206 vec4_instruction *inst;
207
208 /* original gen4 does type conversion to the destination type
209 * before comparison, producing garbage results for floating
210 * point comparisons.
211 */
212 if (brw->gen == 4) {
213 dst.type = src0.type;
214 if (dst.file == HW_REG)
215 dst.fixed_hw_reg.type = dst.type;
216 }
217
218 resolve_ud_negate(&src0);
219 resolve_ud_negate(&src1);
220
221 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
222 inst->conditional_mod = condition;
223
224 return inst;
225 }
226
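/* Helpers for the gen4 scratch-space access messages. base_mrf and mlen
 * describe the fixed block of message registers each send consumes: two
 * MRFs starting at m14 for reads, three starting at m13 for writes.
 */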
227 vec4_instruction *
228 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
229 {
230 vec4_instruction *inst;
231
232 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
233 dst, index);
234 inst->base_mrf = 14;
235 inst->mlen = 2;
236
237 return inst;
238 }
239
240 vec4_instruction *
241 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
242 {
243 vec4_instruction *inst;
244
245 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
246 dst, src, index);
247 inst->base_mrf = 13;
248 inst->mlen = 3;
249
250 return inst;
251 }
252
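/* Emit a DP2, DP3, or DP4 depending on how many components participate;
 * "elements" must be in the range [2, 4].
 */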
253 void
254 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
255 {
256 static enum opcode dot_opcodes[] = {
257 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
258 };
259
260 emit(dot_opcodes[elements - 2], dst, src0, src1);
261 }
262
263 src_reg
264 vec4_visitor::fix_3src_operand(src_reg src)
265 {
266 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
267 * able to use vertical stride of zero to replicate the vec4 uniform, like
268 *
269 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
270 *
271 * But you can't, since vertical stride is always four in three-source
272 * instructions. Instead, insert a MOV instruction to do the replication so
273 * that the three-source instruction can consume it.
274 */
275
276 /* The MOV is only needed if the source is a uniform or immediate. */
277 if (src.file != UNIFORM && src.file != IMM)
278 return src;
279
280 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
281 expanded.type = src.type;
282 emit(MOV(expanded, src));
283 return src_reg(expanded);
284 }
285
286 src_reg
287 vec4_visitor::fix_math_operand(src_reg src)
288 {
289 /* The gen6 math instruction ignores the source modifiers --
290 * swizzle, abs, negate, and at least some parts of the register
291 * region description.
292 *
293 * Rather than trying to enumerate all these cases, *always* expand the
294 * operand to a temp GRF for gen6.
295 *
296 * For gen7, keep the operand as-is, except if immediate, which gen7 still
297 * can't use.
298 */
299
300 if (brw->gen == 7 && src.file != IMM)
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(MOV(expanded, src));
306 return src_reg(expanded);
307 }
308
309 void
310 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
311 {
312 src = fix_math_operand(src);
313
314 if (dst.writemask != WRITEMASK_XYZW) {
315 /* The gen6 math instruction must be align1, so we can't do
316 * writemasks.
317 */
318 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
319
320 emit(opcode, temp_dst, src);
321
322 emit(MOV(dst, src_reg(temp_dst)));
323 } else {
324 emit(opcode, dst, src);
325 }
326 }
327
328 void
329 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
330 {
331 vec4_instruction *inst = emit(opcode, dst, src);
332 inst->base_mrf = 1;
333 inst->mlen = 1;
334 }
335
336 void
337 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
338 {
339 switch (opcode) {
340 case SHADER_OPCODE_RCP:
341 case SHADER_OPCODE_RSQ:
342 case SHADER_OPCODE_SQRT:
343 case SHADER_OPCODE_EXP2:
344 case SHADER_OPCODE_LOG2:
345 case SHADER_OPCODE_SIN:
346 case SHADER_OPCODE_COS:
347 break;
348 default:
349 assert(!"not reached: bad math opcode");
350 return;
351 }
352
353 if (brw->gen >= 6) {
354 return emit_math1_gen6(opcode, dst, src);
355 } else {
356 return emit_math1_gen4(opcode, dst, src);
357 }
358 }
359
360 void
361 vec4_visitor::emit_math2_gen6(enum opcode opcode,
362 dst_reg dst, src_reg src0, src_reg src1)
363 {
364 src0 = fix_math_operand(src0);
365 src1 = fix_math_operand(src1);
366
367 if (dst.writemask != WRITEMASK_XYZW) {
368 /* The gen6 math instruction must be align1, so we can't do
369 * writemasks.
370 */
371 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
372 temp_dst.type = dst.type;
373
374 emit(opcode, temp_dst, src0, src1);
375
376 emit(MOV(dst, src_reg(temp_dst)));
377 } else {
378 emit(opcode, dst, src0, src1);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen4(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 vec4_instruction *inst = emit(opcode, dst, src0, src1);
387 inst->base_mrf = 1;
388 inst->mlen = 2;
389 }
390
391 void
392 vec4_visitor::emit_math(enum opcode opcode,
393 dst_reg dst, src_reg src0, src_reg src1)
394 {
395 switch (opcode) {
396 case SHADER_OPCODE_POW:
397 case SHADER_OPCODE_INT_QUOTIENT:
398 case SHADER_OPCODE_INT_REMAINDER:
399 break;
400 default:
401 assert(!"not reached: unsupported binary math opcode");
402 return;
403 }
404
405 if (brw->gen >= 6) {
406 return emit_math2_gen6(opcode, dst, src0, src1);
407 } else {
408 return emit_math2_gen4(opcode, dst, src0, src1);
409 }
410 }
411
412 void
413 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
414 {
415 if (brw->gen < 7)
416 assert(!"ir_unop_pack_half_2x16 should be lowered");
417
418 assert(dst.type == BRW_REGISTER_TYPE_UD);
419 assert(src0.type == BRW_REGISTER_TYPE_F);
420
421 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
422 *
423 * Because this instruction does not have a 16-bit floating-point type,
424 * the destination data type must be Word (W).
425 *
426 * The destination must be DWord-aligned and specify a horizontal stride
427 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
428 * each destination channel and the upper word is not modified.
429 *
430 * The above restriction implies that the f32to16 instruction must use
431 * align1 mode, because only in align1 mode is it possible to specify
432 * horizontal stride. We choose here to defy the hardware docs and emit
433 * align16 instructions.
434 *
435 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
436 * instructions. I was partially successful in that the code passed all
437 * tests. However, the code was dubiously correct and fragile, and the
438 * tests were not harsh enough to probe that frailty. Not trusting the
439 * code, I chose instead to remain in align16 mode in defiance of the hw
440 * docs).
441 *
442 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
443 * simulator, emitting a f32to16 in align16 mode with UD as destination
444 * data type is safe. The behavior differs from that specified in the PRM
445 * in that the upper word of each destination channel is cleared to 0.
446 */
447
448 dst_reg tmp_dst(this, glsl_type::uvec2_type);
449 src_reg tmp_src(tmp_dst);
450
451 #if 0
452 /* Verify the undocumented behavior on which the following instructions
453 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
454 * then the result of the bit-or instruction below will be incorrect.
455 *
456 * You should inspect the disasm output in order to verify that the MOV is
457 * not optimized away.
458 */
459 emit(MOV(tmp_dst, src_reg(0x12345678u)));
460 #endif
461
462 /* Give tmp the form below, where "." means untouched.
463 *
464 * w z y x w z y x
465 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
466 *
467 * That the upper word of each write-channel be 0 is required for the
468 * following bit-shift and bit-or instructions to work. Note that this
469 * relies on the undocumented hardware behavior mentioned above.
470 */
471 tmp_dst.writemask = WRITEMASK_XY;
472 emit(F32TO16(tmp_dst, src0));
473
474 /* Give the write-channels of dst the form:
475 * 0xhhhh0000
476 */
477 tmp_src.swizzle = SWIZZLE_Y;
478 emit(SHL(dst, tmp_src, src_reg(16u)));
479
480 /* Finally, give the write-channels of dst the form of packHalf2x16's
481 * output:
482 * 0xhhhhllll
483 */
484 tmp_src.swizzle = SWIZZLE_X;
485 emit(OR(dst, src_reg(dst), tmp_src));
486 }
487
488 void
489 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
490 {
491 if (brw->gen < 7)
492 assert(!"ir_unop_unpack_half_2x16 should be lowered");
493
494 assert(dst.type == BRW_REGISTER_TYPE_F);
495 assert(src0.type == BRW_REGISTER_TYPE_UD);
496
497 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
498 *
499 * Because this instruction does not have a 16-bit floating-point type,
500 * the source data type must be Word (W). The destination type must be
501 * F (Float).
502 *
503 * To use W as the source data type, we must adjust horizontal strides,
504 * which is only possible in align1 mode. All my [chadv] attempts at
505 * emitting align1 instructions for unpackHalf2x16 failed to pass the
506 * Piglit tests, so I gave up.
507 *
508 * I've verified that, on gen7 hardware and the simulator, it is safe to
509 * emit f16to32 in align16 mode with UD as source data type.
510 */
511
512 dst_reg tmp_dst(this, glsl_type::uvec2_type);
513 src_reg tmp_src(tmp_dst);
514
515 tmp_dst.writemask = WRITEMASK_X;
516 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
517
518 tmp_dst.writemask = WRITEMASK_Y;
519 emit(SHR(tmp_dst, src0, src_reg(16u)));
520
521 dst.writemask = WRITEMASK_XY;
522 emit(F16TO32(dst, tmp_src));
523 }
524
525 void
526 vec4_visitor::visit_instructions(const exec_list *list)
527 {
528 foreach_list(node, list) {
529 ir_instruction *ir = (ir_instruction *)node;
530
531 base_ir = ir;
532 ir->accept(this);
533 }
534 }
535
536
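/* Return how many vec4 slots a GLSL type occupies in this backend's
 * register and uniform layout.
 */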
537 static int
538 type_size(const struct glsl_type *type)
539 {
540 unsigned int i;
541 int size;
542
543 switch (type->base_type) {
544 case GLSL_TYPE_UINT:
545 case GLSL_TYPE_INT:
546 case GLSL_TYPE_FLOAT:
547 case GLSL_TYPE_BOOL:
548 if (type->is_matrix()) {
549 return type->matrix_columns;
550 } else {
551 /* Regardless of size of vector, it gets a vec4. This is bad
552 * packing for things like floats, but otherwise arrays become a
553 * mess. Hopefully a later pass over the code can pack scalars
554 * down if appropriate.
555 */
556 return 1;
557 }
558 case GLSL_TYPE_ARRAY:
559 assert(type->length > 0);
560 return type_size(type->fields.array) * type->length;
561 case GLSL_TYPE_STRUCT:
562 size = 0;
563 for (i = 0; i < type->length; i++) {
564 size += type_size(type->fields.structure[i].type);
565 }
566 return size;
567 case GLSL_TYPE_SAMPLER:
568 /* Samplers take up one slot in UNIFORMS[], but they're baked in
569 * at link time.
570 */
571 return 1;
572 case GLSL_TYPE_ATOMIC_UINT:
573 return 0;
574 case GLSL_TYPE_VOID:
575 case GLSL_TYPE_ERROR:
576 case GLSL_TYPE_INTERFACE:
577 assert(0);
578 break;
579 }
580
581 return 0;
582 }
583
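/* Allocate a virtual GRF "size" vec4 registers long, doubling the tracking
 * arrays whenever they fill up, and return its index.
 */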
584 int
585 vec4_visitor::virtual_grf_alloc(int size)
586 {
587 if (virtual_grf_array_size <= virtual_grf_count) {
588 if (virtual_grf_array_size == 0)
589 virtual_grf_array_size = 16;
590 else
591 virtual_grf_array_size *= 2;
592 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
593 virtual_grf_array_size);
594 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
595 virtual_grf_array_size);
596 }
597 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
598 virtual_grf_reg_count += size;
599 virtual_grf_sizes[virtual_grf_count] = size;
600 return virtual_grf_count++;
601 }
602
603 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
604 {
605 init();
606
607 this->file = GRF;
608 this->reg = v->virtual_grf_alloc(type_size(type));
609
610 if (type->is_array() || type->is_record()) {
611 this->swizzle = BRW_SWIZZLE_NOOP;
612 } else {
613 this->swizzle = swizzle_for_size(type->vector_elements);
614 }
615
616 this->type = brw_type_for_base_type(type);
617 }
618
619 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
620 {
621 init();
622
623 this->file = GRF;
624 this->reg = v->virtual_grf_alloc(type_size(type));
625
626 if (type->is_array() || type->is_record()) {
627 this->writemask = WRITEMASK_XYZW;
628 } else {
629 this->writemask = (1 << type->vector_elements) - 1;
630 }
631
632 this->type = brw_type_for_base_type(type);
633 }
634
635 /* Our support for uniforms is piggy-backed on the struct
636 * gl_program, because that's where the values actually
637 * get stored, rather than in some global gl_shader_program uniform
638 * store.
639 */
640 void
641 vec4_visitor::setup_uniform_values(ir_variable *ir)
642 {
643 int namelen = strlen(ir->name);
644
645 /* The data for our (non-builtin) uniforms is stored in a series of
646 * gl_uniform_driver_storage structs for each subcomponent that
647 * glGetUniformLocation() could name. We know it's been set up in the same
648 * order we'd walk the type, so walk the list of storage and find anything
649 * with our name, or the prefix of a component that starts with our name.
650 */
651 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
652 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
653
654 if (strncmp(ir->name, storage->name, namelen) != 0 ||
655 (storage->name[namelen] != 0 &&
656 storage->name[namelen] != '.' &&
657 storage->name[namelen] != '[')) {
658 continue;
659 }
660
661 gl_constant_value *components = storage->storage;
662 unsigned vector_count = (MAX2(storage->array_elements, 1) *
663 storage->type->matrix_columns);
664
665 for (unsigned s = 0; s < vector_count; s++) {
666 uniform_vector_size[uniforms] = storage->type->vector_elements;
667
668 int i;
669 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
670 prog_data->param[uniforms * 4 + i] = &components->f;
671 components++;
672 }
673 for (; i < 4; i++) {
674 static float zero = 0;
675 prog_data->param[uniforms * 4 + i] = &zero;
676 }
677
678 uniforms++;
679 }
680 }
681 }
682
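/* Upload the active user clip planes as vec4 uniforms, recording the
 * uniform register for each plane in this->userplane[].
 */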
683 void
684 vec4_visitor::setup_uniform_clipplane_values()
685 {
686 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
687
688 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
689 this->uniform_vector_size[this->uniforms] = 4;
690 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
691 this->userplane[i].type = BRW_REGISTER_TYPE_F;
692 for (int j = 0; j < 4; ++j) {
693 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
694 }
695 ++this->uniforms;
696 }
697 }
698
699 /* Our support for builtin uniforms is even scarier than non-builtin.
700 * It sits on top of the PROG_STATE_VAR parameters that are
701 * automatically updated from GL context state.
702 */
703 void
704 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
705 {
706 const ir_state_slot *const slots = ir->state_slots;
707 assert(ir->state_slots != NULL);
708
709 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
710 /* This state reference has already been set up by ir_to_mesa,
711 * but we'll get the same index back here. We can reference
712 * ParameterValues directly, since unlike brw_fs.cpp, we never
713 * add new state references during compile.
714 */
715 int index = _mesa_add_state_reference(this->prog->Parameters,
716 (gl_state_index *)slots[i].tokens);
717 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
718
719 this->uniform_vector_size[this->uniforms] = 0;
720 /* Add each of the unique swizzled channels of the element.
721 * This will end up matching the size of the glsl_type of this field.
722 */
723 int last_swiz = -1;
724 for (unsigned int j = 0; j < 4; j++) {
725 int swiz = GET_SWZ(slots[i].swizzle, j);
726 last_swiz = swiz;
727
728 prog_data->param[this->uniforms * 4 + j] = &values[swiz];
729 if (swiz <= last_swiz)
730 this->uniform_vector_size[this->uniforms]++;
731 }
732 this->uniforms++;
733 }
734 }
735
736 dst_reg *
737 vec4_visitor::variable_storage(ir_variable *var)
738 {
739 return (dst_reg *)hash_table_find(this->variable_ht, var);
740 }
741
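/* Evaluate a boolean rvalue so that the flag register holds its value, and
 * report which predicate the caller should put on the following predicated
 * instruction.
 */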
742 void
743 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
744 {
745 ir_expression *expr = ir->as_expression();
746
747 *predicate = BRW_PREDICATE_NORMAL;
748
749 if (expr) {
750 src_reg op[2];
751 vec4_instruction *inst;
752
753 assert(expr->get_num_operands() <= 2);
754 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
755 expr->operands[i]->accept(this);
756 op[i] = this->result;
757
758 resolve_ud_negate(&op[i]);
759 }
760
761 switch (expr->operation) {
762 case ir_unop_logic_not:
763 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
764 inst->conditional_mod = BRW_CONDITIONAL_Z;
765 break;
766
767 case ir_binop_logic_xor:
768 inst = emit(XOR(dst_null_d(), op[0], op[1]));
769 inst->conditional_mod = BRW_CONDITIONAL_NZ;
770 break;
771
772 case ir_binop_logic_or:
773 inst = emit(OR(dst_null_d(), op[0], op[1]));
774 inst->conditional_mod = BRW_CONDITIONAL_NZ;
775 break;
776
777 case ir_binop_logic_and:
778 inst = emit(AND(dst_null_d(), op[0], op[1]));
779 inst->conditional_mod = BRW_CONDITIONAL_NZ;
780 break;
781
782 case ir_unop_f2b:
783 if (brw->gen >= 6) {
784 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
785 } else {
786 inst = emit(MOV(dst_null_f(), op[0]));
787 inst->conditional_mod = BRW_CONDITIONAL_NZ;
788 }
789 break;
790
791 case ir_unop_i2b:
792 if (brw->gen >= 6) {
793 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
794 } else {
795 inst = emit(MOV(dst_null_d(), op[0]));
796 inst->conditional_mod = BRW_CONDITIONAL_NZ;
797 }
798 break;
799
800 case ir_binop_all_equal:
801 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
802 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
803 break;
804
805 case ir_binop_any_nequal:
806 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
807 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
808 break;
809
810 case ir_unop_any:
811 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
812 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
813 break;
814
815 case ir_binop_greater:
816 case ir_binop_gequal:
817 case ir_binop_less:
818 case ir_binop_lequal:
819 case ir_binop_equal:
820 case ir_binop_nequal:
821 emit(CMP(dst_null_d(), op[0], op[1],
822 brw_conditional_for_comparison(expr->operation)));
823 break;
824
825 default:
826 assert(!"not reached");
827 break;
828 }
829 return;
830 }
831
832 ir->accept(this);
833
834 resolve_ud_negate(&this->result);
835
836 if (brw->gen >= 6) {
837 vec4_instruction *inst = emit(AND(dst_null_d(),
838 this->result, src_reg(1)));
839 inst->conditional_mod = BRW_CONDITIONAL_NZ;
840 } else {
841 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 }
844 }
845
846 /**
847 * Emit a gen6 IF statement with the comparison folded into the IF
848 * instruction.
849 */
850 void
851 vec4_visitor::emit_if_gen6(ir_if *ir)
852 {
853 ir_expression *expr = ir->condition->as_expression();
854
855 if (expr) {
856 src_reg op[2];
857 dst_reg temp;
858
859 assert(expr->get_num_operands() <= 2);
860 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
861 expr->operands[i]->accept(this);
862 op[i] = this->result;
863 }
864
865 switch (expr->operation) {
866 case ir_unop_logic_not:
867 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
868 return;
869
870 case ir_binop_logic_xor:
871 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
872 return;
873
874 case ir_binop_logic_or:
875 temp = dst_reg(this, glsl_type::bool_type);
876 emit(OR(temp, op[0], op[1]));
877 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
878 return;
879
880 case ir_binop_logic_and:
881 temp = dst_reg(this, glsl_type::bool_type);
882 emit(AND(temp, op[0], op[1]));
883 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
884 return;
885
886 case ir_unop_f2b:
887 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
888 return;
889
890 case ir_unop_i2b:
891 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
892 return;
893
894 case ir_binop_greater:
895 case ir_binop_gequal:
896 case ir_binop_less:
897 case ir_binop_lequal:
898 case ir_binop_equal:
899 case ir_binop_nequal:
900 emit(IF(op[0], op[1],
901 brw_conditional_for_comparison(expr->operation)));
902 return;
903
904 case ir_binop_all_equal:
905 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
906 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
907 return;
908
909 case ir_binop_any_nequal:
910 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
911 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
912 return;
913
914 case ir_unop_any:
915 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
916 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
917 return;
918
919 default:
920 assert(!"not reached");
921 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
922 return;
923 }
924 return;
925 }
926
927 ir->condition->accept(this);
928
929 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
930 }
931
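/* Convenience helper: return a copy of a destination register with only the
 * given writemask enabled.
 */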
932 dst_reg
933 with_writemask(dst_reg const & r, int mask)
934 {
935 dst_reg result = r;
936 result.writemask = mask;
937 return result;
938 }
939
940
941 void
942 vec4_visitor::visit(ir_variable *ir)
943 {
944 dst_reg *reg = NULL;
945
946 if (variable_storage(ir))
947 return;
948
949 switch (ir->mode) {
950 case ir_var_shader_in:
951 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
952 break;
953
954 case ir_var_shader_out:
955 reg = new(mem_ctx) dst_reg(this, ir->type);
956
957 for (int i = 0; i < type_size(ir->type); i++) {
958 output_reg[ir->location + i] = *reg;
959 output_reg[ir->location + i].reg_offset = i;
960 output_reg[ir->location + i].type =
961 brw_type_for_base_type(ir->type->get_scalar_type());
962 output_reg_annotation[ir->location + i] = ir->name;
963 }
964 break;
965
966 case ir_var_auto:
967 case ir_var_temporary:
968 reg = new(mem_ctx) dst_reg(this, ir->type);
969 break;
970
971 case ir_var_uniform:
972 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
973
974 /* Thanks to the lower_ubo_reference pass, we will see only
975 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
976 * variables, so no need for them to be in variable_ht.
977 */
978 if (ir->is_in_uniform_block())
979 return;
980
981 /* Track how big the whole uniform variable is, in case we need to put a
982 * copy of its data into pull constants for array access.
983 */
984 this->uniform_size[this->uniforms] = type_size(ir->type);
985
986 if (!strncmp(ir->name, "gl_", 3)) {
987 setup_builtin_uniform_values(ir);
988 } else {
989 setup_uniform_values(ir);
990 }
991 break;
992
993 case ir_var_system_value:
994 reg = make_reg_for_system_value(ir);
995 break;
996
997 default:
998 assert(!"not reached");
999 }
1000
1001 reg->type = brw_type_for_base_type(ir->type);
1002 hash_table_insert(this->variable_ht, reg, ir);
1003 }
1004
1005 void
1006 vec4_visitor::visit(ir_loop *ir)
1007 {
1008 dst_reg counter;
1009
1010 /* We don't want debugging output to print the whole body of the
1011 * loop as the annotation.
1012 */
1013 this->base_ir = NULL;
1014
1015 if (ir->counter != NULL) {
1016 this->base_ir = ir->counter;
1017 ir->counter->accept(this);
1018 counter = *(variable_storage(ir->counter));
1019
1020 if (ir->from != NULL) {
1021 this->base_ir = ir->from;
1022 ir->from->accept(this);
1023
1024 emit(MOV(counter, this->result));
1025 }
1026 }
1027
1028 emit(BRW_OPCODE_DO);
1029
1030 if (ir->to) {
1031 this->base_ir = ir->to;
1032 ir->to->accept(this);
1033
1034 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1035 brw_conditional_for_comparison(ir->cmp)));
1036
1037 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1038 inst->predicate = BRW_PREDICATE_NORMAL;
1039 }
1040
1041 visit_instructions(&ir->body_instructions);
1042
1043
1044 if (ir->increment) {
1045 this->base_ir = ir->increment;
1046 ir->increment->accept(this);
1047 emit(ADD(counter, src_reg(counter), this->result));
1048 }
1049
1050 emit(BRW_OPCODE_WHILE);
1051 }
1052
1053 void
1054 vec4_visitor::visit(ir_loop_jump *ir)
1055 {
1056 switch (ir->mode) {
1057 case ir_loop_jump::jump_break:
1058 emit(BRW_OPCODE_BREAK);
1059 break;
1060 case ir_loop_jump::jump_continue:
1061 emit(BRW_OPCODE_CONTINUE);
1062 break;
1063 }
1064 }
1065
1066
1067 void
1068 vec4_visitor::visit(ir_function_signature *ir)
1069 {
1070 assert(0);
1071 (void)ir;
1072 }
1073
1074 void
1075 vec4_visitor::visit(ir_function *ir)
1076 {
1077 /* Ignore function bodies other than main() -- we shouldn't see calls to
1078 * them since they should all be inlined.
1079 */
1080 if (strcmp(ir->name, "main") == 0) {
1081 const ir_function_signature *sig;
1082 exec_list empty;
1083
1084 sig = ir->matching_signature(NULL, &empty);
1085
1086 assert(sig);
1087
1088 visit_instructions(&sig->body);
1089 }
1090 }
1091
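/* If this expression is a saturated rvalue, emit the inner value followed by
 * a saturating MOV into a fresh register and return true so the caller can
 * skip its normal handling.
 */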
1092 bool
1093 vec4_visitor::try_emit_sat(ir_expression *ir)
1094 {
1095 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1096 if (!sat_src)
1097 return false;
1098
1099 sat_src->accept(this);
1100 src_reg src = this->result;
1101
1102 this->result = src_reg(this, ir->type);
1103 vec4_instruction *inst;
1104 inst = emit(MOV(dst_reg(this->result), src));
1105 inst->saturate = true;
1106
1107 return true;
1108 }
1109
1110 bool
1111 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1112 {
1113 /* 3-src instructions were introduced in gen6. */
1114 if (brw->gen < 6)
1115 return false;
1116
1117 /* MAD can only handle floating-point data. */
1118 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1119 return false;
1120
1121 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1122 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1123
1124 if (!mul || mul->operation != ir_binop_mul)
1125 return false;
1126
1127 nonmul->accept(this);
1128 src_reg src0 = fix_3src_operand(this->result);
1129
1130 mul->operands[0]->accept(this);
1131 src_reg src1 = fix_3src_operand(this->result);
1132
1133 mul->operands[1]->accept(this);
1134 src_reg src2 = fix_3src_operand(this->result);
1135
1136 this->result = src_reg(this, ir->type);
1137 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1138
1139 return true;
1140 }
1141
1142 void
1143 vec4_visitor::emit_bool_comparison(unsigned int op,
1144 dst_reg dst, src_reg src0, src_reg src1)
1145 {
1146 /* original gen4 does destination conversion before comparison. */
1147 if (brw->gen < 5)
1148 dst.type = src0.type;
1149
1150 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1151
1152 dst.type = BRW_REGISTER_TYPE_D;
1153 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1154 }
1155
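/* Emit a min/max. Gen6+ folds the comparison into SEL with a conditional
 * mod; gen4/5 need an explicit CMP followed by a predicated SEL.
 */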
1156 void
1157 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1158 src_reg src0, src_reg src1)
1159 {
1160 vec4_instruction *inst;
1161
1162 if (brw->gen >= 6) {
1163 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1164 inst->conditional_mod = conditionalmod;
1165 } else {
1166 emit(CMP(dst, src0, src1, conditionalmod));
1167
1168 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1169 inst->predicate = BRW_PREDICATE_NORMAL;
1170 }
1171 }
1172
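/* Check whether an rvalue is an integer constant that fits in 16 bits, in
 * which case an integer multiply can use a single MUL instead of MUL+MACH.
 */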
1173 static bool
1174 is_16bit_constant(ir_rvalue *rvalue)
1175 {
1176 ir_constant *constant = rvalue->as_constant();
1177 if (!constant)
1178 return false;
1179
1180 if (constant->type != glsl_type::int_type &&
1181 constant->type != glsl_type::uint_type)
1182 return false;
1183
1184 return constant->value.u[0] < (1 << 16);
1185 }
1186
1187 void
1188 vec4_visitor::visit(ir_expression *ir)
1189 {
1190 unsigned int operand;
1191 src_reg op[Elements(ir->operands)];
1192 src_reg result_src;
1193 dst_reg result_dst;
1194 vec4_instruction *inst;
1195
1196 if (try_emit_sat(ir))
1197 return;
1198
1199 if (ir->operation == ir_binop_add) {
1200 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1201 return;
1202 }
1203
1204 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1205 this->result.file = BAD_FILE;
1206 ir->operands[operand]->accept(this);
1207 if (this->result.file == BAD_FILE) {
1208 printf("Failed to get tree for expression operand:\n");
1209 ir->operands[operand]->print();
1210 exit(1);
1211 }
1212 op[operand] = this->result;
1213
1214 /* Matrix expression operands should have been broken down to vector
1215 * operations already.
1216 */
1217 assert(!ir->operands[operand]->type->is_matrix());
1218 }
1219
1220 int vector_elements = ir->operands[0]->type->vector_elements;
1221 if (ir->operands[1]) {
1222 vector_elements = MAX2(vector_elements,
1223 ir->operands[1]->type->vector_elements);
1224 }
1225
1226 this->result.file = BAD_FILE;
1227
1228 /* Storage for our result. Ideally for an assignment we'd be using
1229 * the actual storage for the result here, instead.
1230 */
1231 result_src = src_reg(this, ir->type);
1232 /* convenience for the emit functions below. */
1233 result_dst = dst_reg(result_src);
1234 /* If nothing special happens, this is the result. */
1235 this->result = result_src;
1236 /* Limit writes to the channels that will be used by result_src later.
1237 * This does limit this temp's use as a temporary for multi-instruction
1238 * sequences.
1239 */
1240 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1241
1242 switch (ir->operation) {
1243 case ir_unop_logic_not:
1244 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1245 * the one's complement of the whole register, not just bit 0.
1246 */
1247 emit(XOR(result_dst, op[0], src_reg(1)));
1248 break;
1249 case ir_unop_neg:
1250 op[0].negate = !op[0].negate;
1251 emit(MOV(result_dst, op[0]));
1252 break;
1253 case ir_unop_abs:
1254 op[0].abs = true;
1255 op[0].negate = false;
1256 emit(MOV(result_dst, op[0]));
1257 break;
1258
1259 case ir_unop_sign:
1260 emit(MOV(result_dst, src_reg(0.0f)));
1261
1262 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1263 inst = emit(MOV(result_dst, src_reg(1.0f)));
1264 inst->predicate = BRW_PREDICATE_NORMAL;
1265
1266 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1267 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1268 inst->predicate = BRW_PREDICATE_NORMAL;
1269
1270 break;
1271
1272 case ir_unop_rcp:
1273 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1274 break;
1275
1276 case ir_unop_exp2:
1277 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1278 break;
1279 case ir_unop_log2:
1280 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1281 break;
1282 case ir_unop_exp:
1283 case ir_unop_log:
1284 assert(!"not reached: should be handled by ir_explog_to_explog2");
1285 break;
1286 case ir_unop_sin:
1287 case ir_unop_sin_reduced:
1288 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1289 break;
1290 case ir_unop_cos:
1291 case ir_unop_cos_reduced:
1292 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1293 break;
1294
1295 case ir_unop_dFdx:
1296 case ir_unop_dFdy:
1297 assert(!"derivatives not valid in vertex shader");
1298 break;
1299
1300 case ir_unop_bitfield_reverse:
1301 emit(BFREV(result_dst, op[0]));
1302 break;
1303 case ir_unop_bit_count:
1304 emit(CBIT(result_dst, op[0]));
1305 break;
1306 case ir_unop_find_msb: {
1307 src_reg temp = src_reg(this, glsl_type::uint_type);
1308
1309 inst = emit(FBH(dst_reg(temp), op[0]));
1310 inst->dst.writemask = WRITEMASK_XYZW;
1311
1312 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1313 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1314 * subtract the result from 31 to convert the MSB count into an LSB count.
1315 */
1316
1317 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1318 temp.swizzle = BRW_SWIZZLE_NOOP;
1319 emit(MOV(result_dst, temp));
1320
1321 src_reg src_tmp = src_reg(result_dst);
1322 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1323
1324 src_tmp.negate = true;
1325 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1326 inst->predicate = BRW_PREDICATE_NORMAL;
1327 break;
1328 }
1329 case ir_unop_find_lsb:
1330 emit(FBL(result_dst, op[0]));
1331 break;
1332
1333 case ir_unop_noise:
1334 assert(!"not reached: should be handled by lower_noise");
1335 break;
1336
1337 case ir_binop_add:
1338 emit(ADD(result_dst, op[0], op[1]));
1339 break;
1340 case ir_binop_sub:
1341 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1342 break;
1343
1344 case ir_binop_mul:
1345 if (ir->type->is_integer()) {
1346 /* For integer multiplication, the MUL uses the low 16 bits of one of
1347 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1348 * accumulates the contribution of the upper 16 bits of that
1349 * operand. If we can determine that one of the args is in the low
1350 * 16 bits, though, we can just emit a single MUL.
1351 */
1352 if (is_16bit_constant(ir->operands[0])) {
1353 if (brw->gen < 7)
1354 emit(MUL(result_dst, op[0], op[1]));
1355 else
1356 emit(MUL(result_dst, op[1], op[0]));
1357 } else if (is_16bit_constant(ir->operands[1])) {
1358 if (brw->gen < 7)
1359 emit(MUL(result_dst, op[1], op[0]));
1360 else
1361 emit(MUL(result_dst, op[0], op[1]));
1362 } else {
1363 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1364
1365 emit(MUL(acc, op[0], op[1]));
1366 emit(MACH(dst_null_d(), op[0], op[1]));
1367 emit(MOV(result_dst, src_reg(acc)));
1368 }
1369 } else {
1370 emit(MUL(result_dst, op[0], op[1]));
1371 }
1372 break;
1373 case ir_binop_imul_high: {
1374 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1375
1376 emit(MUL(acc, op[0], op[1]));
1377 emit(MACH(result_dst, op[0], op[1]));
1378 break;
1379 }
1380 case ir_binop_div:
1381 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1382 assert(ir->type->is_integer());
1383 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1384 break;
1385 case ir_binop_carry: {
1386 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1387
1388 emit(ADDC(dst_null_ud(), op[0], op[1]));
1389 emit(MOV(result_dst, src_reg(acc)));
1390 break;
1391 }
1392 case ir_binop_borrow: {
1393 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1394
1395 emit(SUBB(dst_null_ud(), op[0], op[1]));
1396 emit(MOV(result_dst, src_reg(acc)));
1397 break;
1398 }
1399 case ir_binop_mod:
1400 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1401 assert(ir->type->is_integer());
1402 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1403 break;
1404
1405 case ir_binop_less:
1406 case ir_binop_greater:
1407 case ir_binop_lequal:
1408 case ir_binop_gequal:
1409 case ir_binop_equal:
1410 case ir_binop_nequal: {
1411 emit(CMP(result_dst, op[0], op[1],
1412 brw_conditional_for_comparison(ir->operation)));
1413 emit(AND(result_dst, result_src, src_reg(0x1)));
1414 break;
1415 }
1416
1417 case ir_binop_all_equal:
1418 /* "==" operator producing a scalar boolean. */
1419 if (ir->operands[0]->type->is_vector() ||
1420 ir->operands[1]->type->is_vector()) {
1421 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1422 emit(MOV(result_dst, src_reg(0)));
1423 inst = emit(MOV(result_dst, src_reg(1)));
1424 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1425 } else {
1426 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1427 emit(AND(result_dst, result_src, src_reg(0x1)));
1428 }
1429 break;
1430 case ir_binop_any_nequal:
1431 /* "!=" operator producing a scalar boolean. */
1432 if (ir->operands[0]->type->is_vector() ||
1433 ir->operands[1]->type->is_vector()) {
1434 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1435
1436 emit(MOV(result_dst, src_reg(0)));
1437 inst = emit(MOV(result_dst, src_reg(1)));
1438 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1439 } else {
1440 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1441 emit(AND(result_dst, result_src, src_reg(0x1)));
1442 }
1443 break;
1444
1445 case ir_unop_any:
1446 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1447 emit(MOV(result_dst, src_reg(0)));
1448
1449 inst = emit(MOV(result_dst, src_reg(1)));
1450 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1451 break;
1452
1453 case ir_binop_logic_xor:
1454 emit(XOR(result_dst, op[0], op[1]));
1455 break;
1456
1457 case ir_binop_logic_or:
1458 emit(OR(result_dst, op[0], op[1]));
1459 break;
1460
1461 case ir_binop_logic_and:
1462 emit(AND(result_dst, op[0], op[1]));
1463 break;
1464
1465 case ir_binop_dot:
1466 assert(ir->operands[0]->type->is_vector());
1467 assert(ir->operands[0]->type == ir->operands[1]->type);
1468 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1469 break;
1470
1471 case ir_unop_sqrt:
1472 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1473 break;
1474 case ir_unop_rsq:
1475 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1476 break;
1477
1478 case ir_unop_bitcast_i2f:
1479 case ir_unop_bitcast_u2f:
1480 this->result = op[0];
1481 this->result.type = BRW_REGISTER_TYPE_F;
1482 break;
1483
1484 case ir_unop_bitcast_f2i:
1485 this->result = op[0];
1486 this->result.type = BRW_REGISTER_TYPE_D;
1487 break;
1488
1489 case ir_unop_bitcast_f2u:
1490 this->result = op[0];
1491 this->result.type = BRW_REGISTER_TYPE_UD;
1492 break;
1493
1494 case ir_unop_i2f:
1495 case ir_unop_i2u:
1496 case ir_unop_u2i:
1497 case ir_unop_u2f:
1498 case ir_unop_b2f:
1499 case ir_unop_b2i:
1500 case ir_unop_f2i:
1501 case ir_unop_f2u:
1502 emit(MOV(result_dst, op[0]));
1503 break;
1504 case ir_unop_f2b:
1505 case ir_unop_i2b: {
1506 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1507 emit(AND(result_dst, result_src, src_reg(1)));
1508 break;
1509 }
1510
1511 case ir_unop_trunc:
1512 emit(RNDZ(result_dst, op[0]));
1513 break;
1514 case ir_unop_ceil:
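/* Implement ceil(x) as -floor(-x): negate the operand, round down with
 * RNDD, then negate the result.
 */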
1515 op[0].negate = !op[0].negate;
1516 inst = emit(RNDD(result_dst, op[0]));
1517 this->result.negate = true;
1518 break;
1519 case ir_unop_floor:
1520 inst = emit(RNDD(result_dst, op[0]));
1521 break;
1522 case ir_unop_fract:
1523 inst = emit(FRC(result_dst, op[0]));
1524 break;
1525 case ir_unop_round_even:
1526 emit(RNDE(result_dst, op[0]));
1527 break;
1528
1529 case ir_binop_min:
1530 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1531 break;
1532 case ir_binop_max:
1533 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1534 break;
1535
1536 case ir_binop_pow:
1537 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1538 break;
1539
1540 case ir_unop_bit_not:
1541 inst = emit(NOT(result_dst, op[0]));
1542 break;
1543 case ir_binop_bit_and:
1544 inst = emit(AND(result_dst, op[0], op[1]));
1545 break;
1546 case ir_binop_bit_xor:
1547 inst = emit(XOR(result_dst, op[0], op[1]));
1548 break;
1549 case ir_binop_bit_or:
1550 inst = emit(OR(result_dst, op[0], op[1]));
1551 break;
1552
1553 case ir_binop_lshift:
1554 inst = emit(SHL(result_dst, op[0], op[1]));
1555 break;
1556
1557 case ir_binop_rshift:
1558 if (ir->type->base_type == GLSL_TYPE_INT)
1559 inst = emit(ASR(result_dst, op[0], op[1]));
1560 else
1561 inst = emit(SHR(result_dst, op[0], op[1]));
1562 break;
1563
1564 case ir_binop_bfm:
1565 emit(BFI1(result_dst, op[0], op[1]));
1566 break;
1567
1568 case ir_binop_ubo_load: {
1569 ir_constant *uniform_block = ir->operands[0]->as_constant();
1570 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1571 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1572 src_reg offset = op[1];
1573
1574 /* Now, load the vector from that offset. */
1575 assert(ir->type->is_vector() || ir->type->is_scalar());
1576
1577 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1578 packed_consts.type = result.type;
1579 src_reg surf_index =
1580 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1581 if (const_offset_ir) {
1582 offset = src_reg(const_offset / 16);
1583 } else {
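/* Convert the dynamic byte offset into units of 16-byte vec4s. */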
1584 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1585 }
1586
1587 vec4_instruction *pull =
1588 emit(new(mem_ctx) vec4_instruction(this,
1589 VS_OPCODE_PULL_CONSTANT_LOAD,
1590 dst_reg(packed_consts),
1591 surf_index,
1592 offset));
1593 pull->base_mrf = 14;
1594 pull->mlen = 1;
1595
1596 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1597 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1598 const_offset % 16 / 4,
1599 const_offset % 16 / 4,
1600 const_offset % 16 / 4);
1601
1602 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1603 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1604 emit(CMP(result_dst, packed_consts, src_reg(0u),
1605 BRW_CONDITIONAL_NZ));
1606 emit(AND(result_dst, result, src_reg(0x1)));
1607 } else {
1608 emit(MOV(result_dst, packed_consts));
1609 }
1610 break;
1611 }
1612
1613 case ir_binop_vector_extract:
1614 assert(!"should have been lowered by vec_index_to_cond_assign");
1615 break;
1616
1617 case ir_triop_fma:
1618 op[0] = fix_3src_operand(op[0]);
1619 op[1] = fix_3src_operand(op[1]);
1620 op[2] = fix_3src_operand(op[2]);
1621 /* Note that the instruction's argument order is reversed from GLSL
1622 * and the IR.
1623 */
1624 emit(MAD(result_dst, op[2], op[1], op[0]));
1625 break;
1626
1627 case ir_triop_lrp:
1628 op[0] = fix_3src_operand(op[0]);
1629 op[1] = fix_3src_operand(op[1]);
1630 op[2] = fix_3src_operand(op[2]);
1631 /* Note that the instruction's argument order is reversed from GLSL
1632 * and the IR.
1633 */
1634 emit(LRP(result_dst, op[2], op[1], op[0]));
1635 break;
1636
1637 case ir_triop_csel:
1638 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1639 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1640 inst->predicate = BRW_PREDICATE_NORMAL;
1641 break;
1642
1643 case ir_triop_bfi:
1644 op[0] = fix_3src_operand(op[0]);
1645 op[1] = fix_3src_operand(op[1]);
1646 op[2] = fix_3src_operand(op[2]);
1647 emit(BFI2(result_dst, op[0], op[1], op[2]));
1648 break;
1649
1650 case ir_triop_bitfield_extract:
1651 op[0] = fix_3src_operand(op[0]);
1652 op[1] = fix_3src_operand(op[1]);
1653 op[2] = fix_3src_operand(op[2]);
1654 /* Note that the instruction's argument order is reversed from GLSL
1655 * and the IR.
1656 */
1657 emit(BFE(result_dst, op[2], op[1], op[0]));
1658 break;
1659
1660 case ir_triop_vector_insert:
1661 assert(!"should have been lowered by lower_vector_insert");
1662 break;
1663
1664 case ir_quadop_bitfield_insert:
1665 assert(!"not reached: should be handled by "
1666 "bitfield_insert_to_bfm_bfi\n");
1667 break;
1668
1669 case ir_quadop_vector:
1670 assert(!"not reached: should be handled by lower_quadop_vector");
1671 break;
1672
1673 case ir_unop_pack_half_2x16:
1674 emit_pack_half_2x16(result_dst, op[0]);
1675 break;
1676 case ir_unop_unpack_half_2x16:
1677 emit_unpack_half_2x16(result_dst, op[0]);
1678 break;
1679 case ir_unop_pack_snorm_2x16:
1680 case ir_unop_pack_snorm_4x8:
1681 case ir_unop_pack_unorm_2x16:
1682 case ir_unop_pack_unorm_4x8:
1683 case ir_unop_unpack_snorm_2x16:
1684 case ir_unop_unpack_snorm_4x8:
1685 case ir_unop_unpack_unorm_2x16:
1686 case ir_unop_unpack_unorm_4x8:
1687 assert(!"not reached: should be handled by lower_packing_builtins");
1688 break;
1689 case ir_unop_unpack_half_2x16_split_x:
1690 case ir_unop_unpack_half_2x16_split_y:
1691 case ir_binop_pack_half_2x16_split:
1692 assert(!"not reached: should not occur in vertex shader");
1693 break;
1694 case ir_binop_ldexp:
1695 assert(!"not reached: should be handled by ldexp_to_arith()");
1696 break;
1697 }
1698 }
1699
1700
1701 void
1702 vec4_visitor::visit(ir_swizzle *ir)
1703 {
1704 src_reg src;
1705 int i = 0;
1706 int swizzle[4];
1707
1708 /* Note that this is only swizzles in expressions, not those on the left
1709 * hand side of an assignment, which do write masking. See ir_assignment
1710 * for that.
1711 */
1712
1713 ir->val->accept(this);
1714 src = this->result;
1715 assert(src.file != BAD_FILE);
1716
1717 for (i = 0; i < ir->type->vector_elements; i++) {
1718 switch (i) {
1719 case 0:
1720 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1721 break;
1722 case 1:
1723 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1724 break;
1725 case 2:
1726 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1727 break;
1728 case 3:
1729 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1730 break;
1731 }
1732 }
1733 for (; i < 4; i++) {
1734 /* Replicate the last channel out. */
1735 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1736 }
1737
1738 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1739
1740 this->result = src;
1741 }
1742
1743 void
1744 vec4_visitor::visit(ir_dereference_variable *ir)
1745 {
1746 const struct glsl_type *type = ir->type;
1747 dst_reg *reg = variable_storage(ir->var);
1748
1749 if (!reg) {
1750 fail("Failed to find variable storage for %s\n", ir->var->name);
1751 this->result = src_reg(brw_null_reg());
1752 return;
1753 }
1754
1755 this->result = src_reg(*reg);
1756
1757 /* System values get their swizzle from the dst_reg writemask */
1758 if (ir->var->mode == ir_var_system_value)
1759 return;
1760
1761 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1762 this->result.swizzle = swizzle_for_size(type->vector_elements);
1763 }
1764
1765
1766 int
1767 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1768 {
1769 /* Under normal circumstances array elements are stored consecutively, so
1770 * the stride is equal to the size of the array element.
1771 */
1772 return type_size(ir->type);
1773 }
1774
1775
1776 void
1777 vec4_visitor::visit(ir_dereference_array *ir)
1778 {
1779 ir_constant *constant_index;
1780 src_reg src;
1781 int array_stride = compute_array_stride(ir);
1782
1783 constant_index = ir->array_index->constant_expression_value();
1784
1785 ir->array->accept(this);
1786 src = this->result;
1787
1788 if (constant_index) {
1789 src.reg_offset += constant_index->value.i[0] * array_stride;
1790 } else {
1791 /* Variable index array dereference. It eats the "vec4" of the
1792 * base of the array and an index that offsets the Mesa register
1793 * index.
1794 */
1795 ir->array_index->accept(this);
1796
1797 src_reg index_reg;
1798
1799 if (array_stride == 1) {
1800 index_reg = this->result;
1801 } else {
1802 index_reg = src_reg(this, glsl_type::int_type);
1803
1804 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1805 }
1806
1807 if (src.reladdr) {
1808 src_reg temp = src_reg(this, glsl_type::int_type);
1809
1810 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1811
1812 index_reg = temp;
1813 }
1814
1815 src.reladdr = ralloc(mem_ctx, src_reg);
1816 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1817 }
1818
1819 /* If the type is smaller than a vec4, replicate the last channel out. */
1820 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1821 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1822 else
1823 src.swizzle = BRW_SWIZZLE_NOOP;
1824 src.type = brw_type_for_base_type(ir->type);
1825
1826 this->result = src;
1827 }
1828
1829 void
1830 vec4_visitor::visit(ir_dereference_record *ir)
1831 {
1832 unsigned int i;
1833 const glsl_type *struct_type = ir->record->type;
1834 int offset = 0;
1835
1836 ir->record->accept(this);
1837
1838 for (i = 0; i < struct_type->length; i++) {
1839 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1840 break;
1841 offset += type_size(struct_type->fields.structure[i].type);
1842 }
1843
1844 /* If the type is smaller than a vec4, replicate the last channel out. */
1845 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1846 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1847 else
1848 this->result.swizzle = BRW_SWIZZLE_NOOP;
1849 this->result.type = brw_type_for_base_type(ir->type);
1850
1851 this->result.reg_offset += offset;
1852 }
1853
1854 /**
1855 * We want to be careful in assignment setup to hit the actual storage
1856 * instead of potentially using a temporary like we might with the
1857 * ir_dereference handler.
1858 */
1859 static dst_reg
1860 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1861 {
1862 /* The LHS must be a dereference. If the LHS is a variable indexed array
1863 * access of a vector, it must be separated into a series of conditional moves
1864 * before reaching this point (see ir_vec_index_to_cond_assign).
1865 */
1866 assert(ir->as_dereference());
1867 ir_dereference_array *deref_array = ir->as_dereference_array();
1868 if (deref_array) {
1869 assert(!deref_array->array->type->is_vector());
1870 }
1871
1872 /* Use the rvalue deref handler for the most part. We'll ignore
1873 * swizzles in it and write swizzles using writemask, though.
1874 */
1875 ir->accept(v);
1876 return dst_reg(v->result);
1877 }
1878
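/* Copy a whole aggregate value (struct, array, or matrix) by recursing down
 * to its scalar/vector pieces and emitting one predicated MOV per vec4,
 * advancing dst and src as it goes.
 */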
1879 void
1880 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1881 const struct glsl_type *type, uint32_t predicate)
1882 {
1883 if (type->base_type == GLSL_TYPE_STRUCT) {
1884 for (unsigned int i = 0; i < type->length; i++) {
1885 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1886 }
1887 return;
1888 }
1889
1890 if (type->is_array()) {
1891 for (unsigned int i = 0; i < type->length; i++) {
1892 emit_block_move(dst, src, type->fields.array, predicate);
1893 }
1894 return;
1895 }
1896
1897 if (type->is_matrix()) {
1898 const struct glsl_type *vec_type;
1899
1900 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1901 type->vector_elements, 1);
1902
1903 for (int i = 0; i < type->matrix_columns; i++) {
1904 emit_block_move(dst, src, vec_type, predicate);
1905 }
1906 return;
1907 }
1908
1909 assert(type->is_scalar() || type->is_vector());
1910
1911 dst->type = brw_type_for_base_type(type);
1912 src->type = dst->type;
1913
1914 dst->writemask = (1 << type->vector_elements) - 1;
1915
1916 src->swizzle = swizzle_for_size(type->vector_elements);
1917
1918 vec4_instruction *inst = emit(MOV(*dst, *src));
1919 inst->predicate = predicate;
1920
1921 dst->reg_offset++;
1922 src->reg_offset++;
1923 }
1924
1925
1926 /* If the RHS processing resulted in an instruction generating a
1927 * temporary value, and it would be easy to rewrite the instruction to
1928 * generate its result right into the LHS instead, do so. This ends
1929 * up reliably removing instructions where it can be tricky to do so
1930 * later without real UD chain information.
1931 */
1932 bool
1933 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1934 dst_reg dst,
1935 src_reg src,
1936 vec4_instruction *pre_rhs_inst,
1937 vec4_instruction *last_rhs_inst)
1938 {
1939 /* This could be supported, but it would take more smarts. */
1940 if (ir->condition)
1941 return false;
1942
1943 if (pre_rhs_inst == last_rhs_inst)
1944 return false; /* No instructions generated to work with. */
1945
1946 /* Make sure the last instruction generated our source reg. */
1947 if (src.file != GRF ||
1948 src.file != last_rhs_inst->dst.file ||
1949 src.reg != last_rhs_inst->dst.reg ||
1950 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1951 src.reladdr ||
1952 src.abs ||
1953 src.negate ||
1954 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1955 return false;
1956
1957 * Check that the last instruction fully initialized the channels
1958 * we want to use, in the order we want to use them. We could
1959 * potentially reswizzle the operands of many instructions so that
1960 * we could handle out of order channels, but don't yet.
1961 */
1962
1963 for (unsigned i = 0; i < 4; i++) {
1964 if (dst.writemask & (1 << i)) {
1965 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1966 return false;
1967
1968 if (BRW_GET_SWZ(src.swizzle, i) != i)
1969 return false;
1970 }
1971 }
1972
1973 /* Success! Rewrite the instruction. */
1974 last_rhs_inst->dst.file = dst.file;
1975 last_rhs_inst->dst.reg = dst.reg;
1976 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1977 last_rhs_inst->dst.reladdr = dst.reladdr;
1978 last_rhs_inst->dst.writemask &= dst.writemask;
1979
1980 return true;
1981 }
1982
1983 void
1984 vec4_visitor::visit(ir_assignment *ir)
1985 {
1986 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1987 uint32_t predicate = BRW_PREDICATE_NONE;
1988
1989 if (!ir->lhs->type->is_scalar() &&
1990 !ir->lhs->type->is_vector()) {
1991 ir->rhs->accept(this);
1992 src_reg src = this->result;
1993
1994 if (ir->condition) {
1995 emit_bool_to_cond_code(ir->condition, &predicate);
1996 }
1997
1998 /* emit_block_move doesn't account for swizzles in the source register.
1999 * This should be ok, since the source register is a structure or an
2000 * array, and those can't be swizzled. But double-check to be sure.
2001 */
2002 assert(src.swizzle ==
2003 (ir->rhs->type->is_matrix()
2004 ? swizzle_for_size(ir->rhs->type->vector_elements)
2005 : BRW_SWIZZLE_NOOP));
2006
2007 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2008 return;
2009 }
2010
2011 /* Now we're down to just a scalar/vector with writemasks. */
2012 int i;
2013
2014 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2015 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2016
2017 ir->rhs->accept(this);
2018
2019 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2020
2021 src_reg src = this->result;
2022
2023 int swizzles[4];
2024 int first_enabled_chan = 0;
2025 int src_chan = 0;
2026
2027 assert(ir->lhs->type->is_vector() ||
2028 ir->lhs->type->is_scalar());
2029 dst.writemask = ir->write_mask;
2030
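/* Note the source channel feeding the first enabled destination channel;
 * the unwritten channels below reuse it so every component of the final
 * swizzle refers to a valid RHS channel.
 */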
2031 for (int i = 0; i < 4; i++) {
2032 if (dst.writemask & (1 << i)) {
2033 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2034 break;
2035 }
2036 }
2037
2038 /* Swizzle a small RHS vector into the channels being written.
2039 *
2040 * GLSL IR treats write_mask as dictating how many channels are
2041 * present on the RHS, while in our instructions we need to make
2042 * those channels appear in the slots of the vec4 they're written to.
2043 */
2044 for (int i = 0; i < 4; i++) {
2045 if (dst.writemask & (1 << i))
2046 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2047 else
2048 swizzles[i] = first_enabled_chan;
2049 }
2050 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2051 swizzles[2], swizzles[3]);
2052
2053 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2054 return;
2055 }
2056
2057 if (ir->condition) {
2058 emit_bool_to_cond_code(ir->condition, &predicate);
2059 }
2060
2061 for (i = 0; i < type_size(ir->lhs->type); i++) {
2062 vec4_instruction *inst = emit(MOV(dst, src));
2063 inst->predicate = predicate;
2064
2065 dst.reg_offset++;
2066 src.reg_offset++;
2067 }
2068 }
2069
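/* Emits immediate MOVs for a constant value, recursing through aggregate
 * types. Within a vector, components that share a value are written by a
 * single MOV with a combined writemask, so vec4(0.5, 1.5, 1.5, 1.5) takes
 * two MOVs rather than four.
 */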
2070 void
2071 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2072 {
2073 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2074 foreach_list(node, &ir->components) {
2075 ir_constant *field_value = (ir_constant *)node;
2076
2077 emit_constant_values(dst, field_value);
2078 }
2079 return;
2080 }
2081
2082 if (ir->type->is_array()) {
2083 for (unsigned int i = 0; i < ir->type->length; i++) {
2084 emit_constant_values(dst, ir->array_elements[i]);
2085 }
2086 return;
2087 }
2088
2089 if (ir->type->is_matrix()) {
2090 for (int i = 0; i < ir->type->matrix_columns; i++) {
2091 float *vec = &ir->value.f[i * ir->type->vector_elements];
2092
2093 for (int j = 0; j < ir->type->vector_elements; j++) {
2094 dst->writemask = 1 << j;
2095 dst->type = BRW_REGISTER_TYPE_F;
2096
2097 emit(MOV(*dst, src_reg(vec[j])));
2098 }
2099 dst->reg_offset++;
2100 }
2101 return;
2102 }
2103
2104 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2105
2106 for (int i = 0; i < ir->type->vector_elements; i++) {
2107 if (!(remaining_writemask & (1 << i)))
2108 continue;
2109
2110 dst->writemask = 1 << i;
2111 dst->type = brw_type_for_base_type(ir->type);
2112
2113 /* Find other components that match the one we're about to
2114 * write. Emits fewer instructions for things like vec4(0.5,
2115 * 1.5, 1.5, 1.5).
2116 */
2117 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2118 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2119 if (ir->value.b[i] == ir->value.b[j])
2120 dst->writemask |= (1 << j);
2121 } else {
2122 /* u, i, and f storage all line up, so no need for a
2123 * switch case for comparing each type.
2124 */
2125 if (ir->value.u[i] == ir->value.u[j])
2126 dst->writemask |= (1 << j);
2127 }
2128 }
2129
2130 switch (ir->type->base_type) {
2131 case GLSL_TYPE_FLOAT:
2132 emit(MOV(*dst, src_reg(ir->value.f[i])));
2133 break;
2134 case GLSL_TYPE_INT:
2135 emit(MOV(*dst, src_reg(ir->value.i[i])));
2136 break;
2137 case GLSL_TYPE_UINT:
2138 emit(MOV(*dst, src_reg(ir->value.u[i])));
2139 break;
2140 case GLSL_TYPE_BOOL:
2141 emit(MOV(*dst, src_reg(ir->value.b[i])));
2142 break;
2143 default:
2144 assert(!"Non-float/uint/int/bool constant");
2145 break;
2146 }
2147
2148 remaining_writemask &= ~dst->writemask;
2149 }
2150 dst->reg_offset++;
2151 }
2152
2153 void
2154 vec4_visitor::visit(ir_constant *ir)
2155 {
2156 dst_reg dst = dst_reg(this, ir->type);
2157 this->result = src_reg(dst);
2158
2159 emit_constant_values(&dst, ir);
2160 }
2161
2162 void
2163 vec4_visitor::visit(ir_call *ir)
2164 {
2165 assert(!"not reached");
2166 }
2167
2168 void
2169 vec4_visitor::visit(ir_texture *ir)
2170 {
2171 int sampler =
2172 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2173
2174 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2175 * emitting anything other than setting up the constant result.
2176 */
2177 if (ir->op == ir_tg4) {
2178 ir_constant *chan = ir->lod_info.component->as_constant();
2179 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2180 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2181 dst_reg result(this, ir->type);
2182 this->result = src_reg(result);
2183 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2184 return;
2185 }
2186 }
2187
2188 /* Should be lowered by do_lower_texture_projection */
2189 assert(!ir->projector);
2190
2191 /* Should be lowered */
2192 assert(!ir->offset || !ir->offset->type->is_array());
2193
2194 /* Generate code to compute all the subexpression trees. This has to be
2195 * done before loading any values into MRFs for the sampler message since
2196 * generating these values may involve SEND messages that need the MRFs.
2197 */
2198 src_reg coordinate;
2199 if (ir->coordinate) {
2200 ir->coordinate->accept(this);
2201 coordinate = this->result;
2202 }
2203
2204 src_reg shadow_comparitor;
2205 if (ir->shadow_comparitor) {
2206 ir->shadow_comparitor->accept(this);
2207 shadow_comparitor = this->result;
2208 }
2209
2210 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2211 src_reg offset_value;
2212 if (has_nonconstant_offset) {
2213 ir->offset->accept(this);
2214 offset_value = src_reg(this->result);
2215 }
2216
2217 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2218 src_reg lod, dPdx, dPdy, sample_index;
2219 switch (ir->op) {
2220 case ir_tex:
2221 lod = src_reg(0.0f);
2222 lod_type = glsl_type::float_type;
2223 break;
2224 case ir_txf:
2225 case ir_txl:
2226 case ir_txs:
2227 ir->lod_info.lod->accept(this);
2228 lod = this->result;
2229 lod_type = ir->lod_info.lod->type;
2230 break;
2231 case ir_query_levels:
2232 lod = src_reg(0);
2233 lod_type = glsl_type::int_type;
2234 break;
2235 case ir_txf_ms:
2236 ir->lod_info.sample_index->accept(this);
2237 sample_index = this->result;
2238 sample_index_type = ir->lod_info.sample_index->type;
2239 break;
2240 case ir_txd:
2241 ir->lod_info.grad.dPdx->accept(this);
2242 dPdx = this->result;
2243
2244 ir->lod_info.grad.dPdy->accept(this);
2245 dPdy = this->result;
2246
2247 lod_type = ir->lod_info.grad.dPdx->type;
2248 break;
2249 case ir_txb:
2250 case ir_lod:
2251 case ir_tg4:
2252 break;
2253 }
2254
2255 vec4_instruction *inst = NULL;
2256 switch (ir->op) {
2257 case ir_tex:
2258 case ir_txl:
2259 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2260 break;
2261 case ir_txd:
2262 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2263 break;
2264 case ir_txf:
2265 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2266 break;
2267 case ir_txf_ms:
2268 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2269 break;
2270 case ir_txs:
2271 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2272 break;
2273 case ir_tg4:
2274 if (has_nonconstant_offset)
2275 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2276 else
2277 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2278 break;
2279 case ir_query_levels:
2280 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2281 break;
2282 case ir_txb:
2283 assert(!"TXB is not valid for vertex shaders.");
2284 break;
2285 case ir_lod:
2286 assert(!"LOD is not valid for vertex shaders.");
2287 break;
2288 default:
2289 assert(!"Unrecognized tex op");
2290 }
2291
2292 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2293
2294 /* Texel offsets and tg4 channel selects go in the message header; Gen4 also requires headers. */
2295 inst->header_present = use_texture_offset || brw->gen < 5 || ir->op == ir_tg4;
2296 inst->base_mrf = 2;
2297 inst->mlen = inst->header_present + 1; /* always at least one */
2298 inst->sampler = sampler;
2299 inst->dst = dst_reg(this, ir->type);
2300 inst->dst.writemask = WRITEMASK_XYZW;
2301 inst->shadow_compare = ir->shadow_comparitor != NULL;
2302
2303 if (use_texture_offset)
2304 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2305
2306 /* Stuff the channel select bits in the top of the texture offset */
2307 if (ir->op == ir_tg4)
2308 inst->texture_offset |= gather_channel(ir, sampler)<<16;
2309
2310 /* MRF for the first parameter */
2311 int param_base = inst->base_mrf + inst->header_present;
2312
2313 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2314 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2315 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2316 } else {
2317 /* Load the coordinate */
2318 /* FINISHME: gl_clamp_mask and saturate */
2319 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2320 int zero_mask = 0xf & ~coord_mask;
2321
2322 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2323 coordinate));
2324
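/* Zero the coordinate channels that the coordinate type doesn't supply. */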
2325 if (zero_mask != 0) {
2326 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2327 src_reg(0)));
2328 }
2329 /* Load the shadow comparitor */
2330 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2331 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2332 WRITEMASK_X),
2333 shadow_comparitor));
2334 inst->mlen++;
2335 }
2336
2337 /* Load the LOD info */
2338 if (ir->op == ir_tex || ir->op == ir_txl) {
2339 int mrf, writemask;
2340 if (brw->gen >= 5) {
2341 mrf = param_base + 1;
2342 if (ir->shadow_comparitor) {
2343 writemask = WRITEMASK_Y;
2344 /* mlen already incremented */
2345 } else {
2346 writemask = WRITEMASK_X;
2347 inst->mlen++;
2348 }
2349 } else /* brw->gen == 4 */ {
2350 mrf = param_base;
2351 writemask = WRITEMASK_W;
2352 }
2353 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2354 } else if (ir->op == ir_txf) {
2355 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2356 } else if (ir->op == ir_txf_ms) {
2357 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2358 sample_index));
2359 inst->mlen++;
2360
2361 /* On Gen7, there is an additional MCS parameter here after SI,
2362 * but we don't bother to emit it since it's always zero. If
2363 * we start supporting texturing from CMS surfaces, this will have
2364 * to change.
2365 */
2366 } else if (ir->op == ir_txd) {
2367 const glsl_type *type = lod_type;
2368
2369 if (brw->gen >= 5) {
2370 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2371 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2372 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2373 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2374 inst->mlen++;
2375
2376 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2377 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2378 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2379 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2380 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2381 inst->mlen++;
2382
2383 if (ir->shadow_comparitor) {
2384 emit(MOV(dst_reg(MRF, param_base + 2,
2385 ir->shadow_comparitor->type, WRITEMASK_Z),
2386 shadow_comparitor));
2387 }
2388 }
2389 } else /* brw->gen == 4 */ {
2390 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2391 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2392 inst->mlen += 2;
2393 }
2394 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2395 if (ir->shadow_comparitor) {
2396 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2397 shadow_comparitor));
2398 }
2399
2400 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2401 offset_value));
2402 inst->mlen++;
2403 }
2404 }
2405
2406 emit(inst);
2407
2408 /* Fix up the number of layers (Z) for cube arrays: the hardware returns
2409 * faces * layers, but the spec requires just the layer count.
2410 */
2411 if (ir->op == ir_txs) {
2412 glsl_type const *type = ir->sampler->type;
2413 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2414 type->sampler_array) {
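/* The divisor of 6 is the number of faces in a cube, turning the
 * faces * layers count from the hardware back into a layer count.
 */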
2415 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2416 with_writemask(inst->dst, WRITEMASK_Z),
2417 src_reg(inst->dst), src_reg(6));
2418 }
2419 }
2420
2421 swizzle_result(ir, src_reg(inst->dst), sampler);
2422 }
2423
2424 /**
2425 * Set up the gather channel based on the swizzle, for gather4.
2426 */
2427 uint32_t
2428 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2429 {
2430 ir_constant *chan = ir->lod_info.component->as_constant();
2431 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2432 switch (swiz) {
2433 case SWIZZLE_X: return 0;
2434 case SWIZZLE_Y:
2435 /* gather4 sampler is broken for green channel on RG32F --
2436 * we must ask for blue instead.
2437 */
2438 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2439 return 2;
2440 return 1;
2441 case SWIZZLE_Z: return 2;
2442 case SWIZZLE_W: return 3;
2443 default:
2444 assert(!"Not reached"); /* zero, one swizzles handled already */
2445 return 0;
2446 }
2447 }
2448
2449 void
2450 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2451 {
2452 int s = key->tex.swizzles[sampler];
2453
2454 this->result = src_reg(this, ir->type);
2455 dst_reg swizzled_result(this->result);
2456
2457 if (ir->op == ir_query_levels) {
2458 /* # levels is in .w */
2459 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2460 emit(MOV(swizzled_result, orig_val));
2461 return;
2462 }
2463
2464 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2465 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2466 emit(MOV(swizzled_result, orig_val));
2467 return;
2468 }
2469
2470
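/* Split the swizzle into channels copied from the sampler result, channels
 * forced to zero and channels forced to one, then emit at most one MOV per
 * group. For example, a swizzle of (R, G, ZERO, ONE) gives copy_mask 0x3,
 * zero_mask 0x4 and one_mask 0x8.
 */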
2471 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2472 int swizzle[4] = {0};
2473
2474 for (int i = 0; i < 4; i++) {
2475 switch (GET_SWZ(s, i)) {
2476 case SWIZZLE_ZERO:
2477 zero_mask |= (1 << i);
2478 break;
2479 case SWIZZLE_ONE:
2480 one_mask |= (1 << i);
2481 break;
2482 default:
2483 copy_mask |= (1 << i);
2484 swizzle[i] = GET_SWZ(s, i);
2485 break;
2486 }
2487 }
2488
2489 if (copy_mask) {
2490 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2491 swizzled_result.writemask = copy_mask;
2492 emit(MOV(swizzled_result, orig_val));
2493 }
2494
2495 if (zero_mask) {
2496 swizzled_result.writemask = zero_mask;
2497 emit(MOV(swizzled_result, src_reg(0.0f)));
2498 }
2499
2500 if (one_mask) {
2501 swizzled_result.writemask = one_mask;
2502 emit(MOV(swizzled_result, src_reg(1.0f)));
2503 }
2504 }
2505
2506 void
2507 vec4_visitor::visit(ir_return *ir)
2508 {
2509 assert(!"not reached");
2510 }
2511
2512 void
2513 vec4_visitor::visit(ir_discard *ir)
2514 {
2515 assert(!"not reached");
2516 }
2517
2518 void
2519 vec4_visitor::visit(ir_if *ir)
2520 {
2521 /* Don't point the annotation at the if statement, because then it plus
2522 * the then and else blocks get printed.
2523 */
2524 this->base_ir = ir->condition;
2525
2526 if (brw->gen == 6) {
2527 emit_if_gen6(ir);
2528 } else {
2529 uint32_t predicate;
2530 emit_bool_to_cond_code(ir->condition, &predicate);
2531 emit(IF(predicate));
2532 }
2533
2534 visit_instructions(&ir->then_instructions);
2535
2536 if (!ir->else_instructions.is_empty()) {
2537 this->base_ir = ir->condition;
2538 emit(BRW_OPCODE_ELSE);
2539
2540 visit_instructions(&ir->else_instructions);
2541 }
2542
2543 this->base_ir = ir->condition;
2544 emit(BRW_OPCODE_ENDIF);
2545 }
2546
2547 void
2548 vec4_visitor::visit(ir_emit_vertex *)
2549 {
2550 assert(!"not reached");
2551 }
2552
2553 void
2554 vec4_visitor::visit(ir_end_primitive *)
2555 {
2556 assert(!"not reached");
2557 }
2558
2559 void
2560 vec4_visitor::emit_ndc_computation()
2561 {
2562 /* Get the position */
2563 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2564
2565 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2566 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2567 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2568
2569 current_annotation = "NDC";
2570 dst_reg ndc_w = ndc;
2571 ndc_w.writemask = WRITEMASK_W;
2572 src_reg pos_w = pos;
2573 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2574 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2575
2576 dst_reg ndc_xyz = ndc;
2577 ndc_xyz.writemask = WRITEMASK_XYZ;
2578
2579 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2580 }
2581
2582 void
2583 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2584 {
2585 if (brw->gen < 6 &&
2586 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2587 key->userclip_active || brw->has_negative_rhw_bug)) {
2588 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2589 dst_reg header1_w = header1;
2590 header1_w.writemask = WRITEMASK_W;
2591
2592 emit(MOV(header1, 0u));
2593
2594 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2595 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2596
2597 current_annotation = "Point size";
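/* Scale by 2048 and keep bits 8..18, which stores the point size times 8
 * in an 11-bit field starting at bit 8 of header1.w.
 */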
2598 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2599 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2600 }
2601
2602 if (key->userclip_active) {
2603 current_annotation = "Clipping flags";
2604 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2605 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2606
2607 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2608 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2609 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2610
2611 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2612 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2613 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2614 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2615 }
2616
2617 /* i965 clipping workaround:
2618 * 1) Test for negative RHW
2619 * 2) If set,
2620 * set ndc = (0,0,0,0)
2621 * set ucp[6] = 1
2622 *
2623 * Later, clipping will detect ucp[6] and ensure the primitive is
2624 * clipped against all fixed planes.
2625 */
2626 if (brw->has_negative_rhw_bug) {
2627 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2628 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2629 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2630 vec4_instruction *inst;
2631 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2632 inst->predicate = BRW_PREDICATE_NORMAL;
2633 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2634 inst->predicate = BRW_PREDICATE_NORMAL;
2635 }
2636
2637 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2638 } else if (brw->gen < 6) {
2639 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2640 } else {
2641 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2642 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2643 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2644 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2645 }
2646 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2647 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2648 src_reg(output_reg[VARYING_SLOT_LAYER])));
2649 }
2650 }
2651 }
2652
2653 void
2654 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2655 {
2656 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2657 *
2658 * "If a linked set of shaders forming the vertex stage contains no
2659 * static write to gl_ClipVertex or gl_ClipDistance, but the
2660 * application has requested clipping against user clip planes through
2661 * the API, then the coordinate written to gl_Position is used for
2662 * comparison against the user clip planes."
2663 *
2664 * This function is only called if the shader didn't write to
2665 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2666 * if the user wrote to it; otherwise we use gl_Position.
2667 */
2668 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2669 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2670 clip_vertex = VARYING_SLOT_POS;
2671 }
2672
2673 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
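/* Each call fills one vec4 of clip distances: the DP4 of the chosen clip
 * vertex against up to four user clip planes, starting at plane 'offset'.
 */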
2674 ++i) {
2675 reg.writemask = 1 << i;
2676 emit(DP4(reg,
2677 src_reg(output_reg[clip_vertex]),
2678 src_reg(this->userplane[i + offset])));
2679 }
2680 }
2681
2682 void
2683 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2684 {
2685 assert(varying < VARYING_SLOT_MAX);
2686 reg.type = output_reg[varying].type;
2687 current_annotation = output_reg_annotation[varying];
2688 /* Copy the register, saturating if necessary */
2689 vec4_instruction *inst = emit(MOV(reg,
2690 src_reg(output_reg[varying])));
2691 if ((varying == VARYING_SLOT_COL0 ||
2692 varying == VARYING_SLOT_COL1 ||
2693 varying == VARYING_SLOT_BFC0 ||
2694 varying == VARYING_SLOT_BFC1) &&
2695 key->clamp_vertex_color) {
2696 inst->saturate = true;
2697 }
2698 }
2699
2700 void
2701 vec4_visitor::emit_urb_slot(int mrf, int varying)
2702 {
2703 struct brw_reg hw_reg = brw_message_reg(mrf);
2704 dst_reg reg = dst_reg(MRF, mrf);
2705 reg.type = BRW_REGISTER_TYPE_F;
2706
2707 switch (varying) {
2708 case VARYING_SLOT_PSIZ:
2709 /* PSIZ is always in slot 0, and is coupled with other flags. */
2710 current_annotation = "indices, point width, clip flags";
2711 emit_psiz_and_flags(hw_reg);
2712 break;
2713 case BRW_VARYING_SLOT_NDC:
2714 current_annotation = "NDC";
2715 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2716 break;
2717 case VARYING_SLOT_POS:
2718 current_annotation = "gl_Position";
2719 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2720 break;
2721 case VARYING_SLOT_EDGE:
2722 /* This is present when doing unfilled polygons. We're supposed to copy
2723 * the edge flag from the user-provided vertex array
2724 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2725 * of that attribute (starts as 1.0f). This is then used in clipping to
2726 * determine which edges should be drawn as wireframe.
2727 */
2728 current_annotation = "edge flag";
2729 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2730 glsl_type::float_type, WRITEMASK_XYZW))));
2731 break;
2732 case BRW_VARYING_SLOT_PAD:
2733 /* No need to write to this slot */
2734 break;
2735 default:
2736 emit_generic_urb_slot(reg, varying);
2737 break;
2738 }
2739 }
2740
2741 static int
2742 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2743 {
2744 if (brw->gen >= 6) {
2745 /* URB data written (does not include the message header reg) must
2746 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2747 * section 5.4.3.2.2: URB_INTERLEAVED.
2748 *
2749 * URB entries are allocated on a multiple of 1024 bits, so an
2750 * extra 128 bits written here to make the end align to 256 is
2751 * no problem.
2752 */
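/* mlen as passed in still counts the message header register, so the
 * payload is mlen - 1; rounding mlen up to an odd value keeps the payload
 * an even number of registers (e.g. header + 1 slot = mlen 2, padded to 3).
 */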
2753 if ((mlen % 2) != 1)
2754 mlen++;
2755 }
2756
2757 return mlen;
2758 }
2759
2760
2761 /**
2762 * Generates the VUE payload plus the necessary URB write instructions to
2763 * output it.
2764 *
2765 * The VUE layout is documented in Volume 2a.
2766 */
2767 void
2768 vec4_visitor::emit_vertex()
2769 {
2770 /* MRF 0 is reserved for the debugger, so start with message header
2771 * in MRF 1.
2772 */
2773 int base_mrf = 1;
2774 int mrf = base_mrf;
2775 /* In the process of generating our URB write message contents, we
2776 * may need to unspill a register or load from an array. Those
2777 * reads would use MRFs 14-15.
2778 */
2779 int max_usable_mrf = 13;
2780
2781 /* The following assertion verifies that max_usable_mrf causes an
2782 * even-numbered amount of URB write data, which will meet gen6's
2783 * requirements for length alignment.
2784 */
2785 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2786
2787 /* First mrf is the g0-based message header containing URB handles and
2788 * such.
2789 */
2790 emit_urb_write_header(mrf++);
2791
2792 if (brw->gen < 6) {
2793 emit_ndc_computation();
2794 }
2795
2796 /* Lower legacy ff and ClipVertex clipping to clip distances */
2797 if (key->userclip_active && !prog->UsesClipDistanceOut) {
2798 current_annotation = "user clip distances";
2799
2800 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2801 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2802
2803 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2804 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2805 }
2806
2807 /* We may need to split this up into several URB writes, so do them in a
2808 * loop.
2809 */
2810 int slot = 0;
2811 bool complete = false;
2812 do {
2813 /* URB offset is in URB row increments, and each of our MRFs is half of
2814 * one of those, since we're doing interleaved writes.
2815 */
2816 int offset = slot / 2;
2817
2818 mrf = base_mrf + 1;
2819 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2820 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2821
2822 /* If this was max_usable_mrf, we can't fit anything more into this
2823 * URB WRITE.
2824 */
2825 if (mrf > max_usable_mrf) {
2826 slot++;
2827 break;
2828 }
2829 }
2830
2831 complete = slot >= prog_data->vue_map.num_slots;
2832 current_annotation = "URB write";
2833 vec4_instruction *inst = emit_urb_write_opcode(complete);
2834 inst->base_mrf = base_mrf;
2835 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2836 inst->offset += offset;
2837 } while (!complete);
2838 }
2839
2840
2841 src_reg
2842 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2843 src_reg *reladdr, int reg_offset)
2844 {
2845 /* Because we store the values to scratch interleaved like our
2846 * vertex data, we need to scale the vec4 index by 2.
2847 */
2848 int message_header_scale = 2;
2849
2850 /* Pre-gen6, the message header uses byte offsets instead of vec4
2851 * (16-byte) offset units.
2852 */
2853 if (brw->gen < 6)
2854 message_header_scale *= 16;
2855
2856 if (reladdr) {
2857 src_reg index = src_reg(this, glsl_type::int_type);
2858
2859 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2860 emit_before(inst, MUL(dst_reg(index),
2861 index, src_reg(message_header_scale)));
2862
2863 return index;
2864 } else {
2865 return src_reg(reg_offset * message_header_scale);
2866 }
2867 }
2868
2869 src_reg
2870 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2871 src_reg *reladdr, int reg_offset)
2872 {
2873 if (reladdr) {
2874 src_reg index = src_reg(this, glsl_type::int_type);
2875
2876 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2877
2878 /* Pre-gen6, the message header uses byte offsets instead of vec4
2879 * (16-byte) offset units.
2880 */
2881 if (brw->gen < 6) {
2882 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2883 }
2884
2885 return index;
2886 } else {
2887 int message_header_scale = brw->gen < 6 ? 16 : 1;
2888 return src_reg(reg_offset * message_header_scale);
2889 }
2890 }
2891
2892 /**
2893 * Emits an instruction before @inst to load the value named by @orig_src
2894 * from scratch space at @base_offset to @temp.
2895 *
2896 * @base_offset is measured in 32-byte units (the size of a register).
2897 */
2898 void
2899 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2900 dst_reg temp, src_reg orig_src,
2901 int base_offset)
2902 {
2903 int reg_offset = base_offset + orig_src.reg_offset;
2904 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2905
2906 emit_before(inst, SCRATCH_READ(temp, index));
2907 }
2908
2909 /**
2910 * Emits an instruction after @inst to store the value to be written
2911 * to @orig_dst to scratch space at @base_offset, from @temp.
2912 *
2913 * @base_offset is measured in 32-byte units (the size of a register).
2914 */
2915 void
2916 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2917 {
2918 int reg_offset = base_offset + inst->dst.reg_offset;
2919 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2920
2921 /* Create a temporary register to store *inst's result in.
2922 *
2923 * We have to be careful in MOVing from our temporary result register in
2924 * the scratch write. If we swizzle from channels of the temporary that
2925 * weren't initialized, it will confuse live interval analysis, which will
2926 * make spilling fail to make progress.
2927 */
2928 src_reg temp = src_reg(this, glsl_type::vec4_type);
2929 temp.type = inst->dst.type;
2930 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2931 int swizzles[4];
2932 for (int i = 0; i < 4; i++)
2933 if (inst->dst.writemask & (1 << i))
2934 swizzles[i] = i;
2935 else
2936 swizzles[i] = first_writemask_chan;
2937 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2938 swizzles[2], swizzles[3]);
2939
2940 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2941 inst->dst.writemask));
2942 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2943 write->predicate = inst->predicate;
2944 write->ir = inst->ir;
2945 write->annotation = inst->annotation;
2946 inst->insert_after(write);
2947
2948 inst->dst.file = temp.file;
2949 inst->dst.reg = temp.reg;
2950 inst->dst.reg_offset = temp.reg_offset;
2951 inst->dst.reladdr = NULL;
2952 }
2953
2954 /**
2955 * We can't generally support array access in GRF space, because a
2956 * single instruction's destination can only span 2 contiguous
2957 * registers. So, we send all GRF arrays that get variable index
2958 * access to scratch space.
2959 */
2960 void
2961 vec4_visitor::move_grf_array_access_to_scratch()
2962 {
2963 int scratch_loc[this->virtual_grf_count];
2964
2965 for (int i = 0; i < this->virtual_grf_count; i++) {
2966 scratch_loc[i] = -1;
2967 }
2968
2969 /* First, calculate the set of virtual GRFs that need to be punted
2970 * to scratch due to having any array access on them, and where in
2971 * scratch.
2972 */
2973 foreach_list(node, &this->instructions) {
2974 vec4_instruction *inst = (vec4_instruction *)node;
2975
2976 if (inst->dst.file == GRF && inst->dst.reladdr &&
2977 scratch_loc[inst->dst.reg] == -1) {
2978 scratch_loc[inst->dst.reg] = c->last_scratch;
2979 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2980 }
2981
2982 for (int i = 0 ; i < 3; i++) {
2983 src_reg *src = &inst->src[i];
2984
2985 if (src->file == GRF && src->reladdr &&
2986 scratch_loc[src->reg] == -1) {
2987 scratch_loc[src->reg] = c->last_scratch;
2988 c->last_scratch += this->virtual_grf_sizes[src->reg];
2989 }
2990 }
2991 }
2992
2993 /* Now, for anything that will be accessed through scratch, rewrite
2994 * it to load/store. Note that this is a _safe list walk, because
2995 * we may generate a new scratch_write instruction after the one
2996 * we're processing.
2997 */
2998 foreach_list_safe(node, &this->instructions) {
2999 vec4_instruction *inst = (vec4_instruction *)node;
3000
3001 /* Set up the annotation tracking for newly generated instructions. */
3002 base_ir = inst->ir;
3003 current_annotation = inst->annotation;
3004
3005 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3006 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3007 }
3008
3009 for (int i = 0 ; i < 3; i++) {
3010 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3011 continue;
3012
3013 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3014
3015 emit_scratch_read(inst, temp, inst->src[i],
3016 scratch_loc[inst->src[i].reg]);
3017
3018 inst->src[i].file = temp.file;
3019 inst->src[i].reg = temp.reg;
3020 inst->src[i].reg_offset = temp.reg_offset;
3021 inst->src[i].reladdr = NULL;
3022 }
3023 }
3024 }
3025
3026 /**
3027 * Emits an instruction before @inst to load the value named by @orig_src
3028 * from the pull constant buffer (surface) at @base_offset to @temp.
3029 */
3030 void
3031 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3032 dst_reg temp, src_reg orig_src,
3033 int base_offset)
3034 {
3035 int reg_offset = base_offset + orig_src.reg_offset;
3036 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3037 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3038 vec4_instruction *load;
3039
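/* Gen7+ takes the offset as a plain register source (moved into grf_offset
 * below); earlier generations use an MRF-based message, so base_mrf and
 * mlen are set instead.
 */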
3040 if (brw->gen >= 7) {
3041 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3042 grf_offset.type = offset.type;
3043 emit_before(inst, MOV(grf_offset, offset));
3044
3045 load = new(mem_ctx) vec4_instruction(this,
3046 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3047 temp, index, src_reg(grf_offset));
3048 } else {
3049 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3050 temp, index, offset);
3051 load->base_mrf = 14;
3052 load->mlen = 1;
3053 }
3054 emit_before(inst, load);
3055 }
3056
3057 /**
3058 * Implements array access of uniforms by inserting a
3059 * PULL_CONSTANT_LOAD instruction.
3060 *
3061 * Unlike temporary GRF array access (which we don't support, due to
3062 * the difficulty of doing relative addressing on instruction
3063 * destinations), we could potentially do array access of uniforms
3064 * that were loaded in GRF space as push constants. In real-world
3065 * usage we've seen, though, the arrays being used are always larger
3066 * than we could load as push constants, so just always move all
3067 * uniform array access out to a pull constant buffer.
3068 */
3069 void
3070 vec4_visitor::move_uniform_array_access_to_pull_constants()
3071 {
3072 int pull_constant_loc[this->uniforms];
3073
3074 for (int i = 0; i < this->uniforms; i++) {
3075 pull_constant_loc[i] = -1;
3076 }
3077
3078 /* Walk through and find array access of uniforms. Put a copy of that
3079 * uniform in the pull constant buffer.
3080 *
3081 * Note that we don't move constant-indexed accesses to arrays. No
3082 * testing has been done of the performance impact of this choice.
3083 */
3084 foreach_list_safe(node, &this->instructions) {
3085 vec4_instruction *inst = (vec4_instruction *)node;
3086
3087 for (int i = 0 ; i < 3; i++) {
3088 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3089 continue;
3090
3091 int uniform = inst->src[i].reg;
3092
3093 /* If this array isn't already present in the pull constant buffer,
3094 * add it.
3095 */
3096 if (pull_constant_loc[uniform] == -1) {
3097 const float **values = &prog_data->param[uniform * 4];
3098
3099 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
3100
3101 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3102 prog_data->pull_param[prog_data->nr_pull_params++]
3103 = values[j];
3104 }
3105 }
3106
3107 /* Set up the annotation tracking for newly generated instructions. */
3108 base_ir = inst->ir;
3109 current_annotation = inst->annotation;
3110
3111 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3112
3113 emit_pull_constant_load(inst, temp, inst->src[i],
3114 pull_constant_loc[uniform]);
3115
3116 inst->src[i].file = temp.file;
3117 inst->src[i].reg = temp.reg;
3118 inst->src[i].reg_offset = temp.reg_offset;
3119 inst->src[i].reladdr = NULL;
3120 }
3121 }
3122
3123 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3124 * no need to track them as larger-than-vec4 objects. This will be
3125 * relied on in cutting out unused uniform vectors from push
3126 * constants.
3127 */
3128 split_uniform_registers();
3129 }
3130
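/* If an unsigned source has the negate modifier set, resolve the negation
 * through an explicit MOV into a temporary and point the source at the
 * temporary instead.
 */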
3131 void
3132 vec4_visitor::resolve_ud_negate(src_reg *reg)
3133 {
3134 if (reg->type != BRW_REGISTER_TYPE_UD ||
3135 !reg->negate)
3136 return;
3137
3138 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3139 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3140 *reg = temp;
3141 }
3142
3143 vec4_visitor::vec4_visitor(struct brw_context *brw,
3144 struct brw_vec4_compile *c,
3145 struct gl_program *prog,
3146 const struct brw_vec4_prog_key *key,
3147 struct brw_vec4_prog_data *prog_data,
3148 struct gl_shader_program *shader_prog,
3149 struct brw_shader *shader,
3150 void *mem_ctx,
3151 bool debug_flag,
3152 bool no_spills)
3153 : debug_flag(debug_flag), no_spills(no_spills)
3154 {
3155 this->brw = brw;
3156 this->ctx = &brw->ctx;
3157 this->shader_prog = shader_prog;
3158 this->shader = shader;
3159
3160 this->mem_ctx = mem_ctx;
3161 this->failed = false;
3162
3163 this->base_ir = NULL;
3164 this->current_annotation = NULL;
3165 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3166
3167 this->c = c;
3168 this->prog = prog;
3169 this->key = key;
3170 this->prog_data = prog_data;
3171 this->stage_prog_data = &prog_data->base;
3172
3173 this->variable_ht = hash_table_ctor(0,
3174 hash_table_pointer_hash,
3175 hash_table_pointer_compare);
3176
3177 this->virtual_grf_start = NULL;
3178 this->virtual_grf_end = NULL;
3179 this->virtual_grf_sizes = NULL;
3180 this->virtual_grf_count = 0;
3181 this->virtual_grf_reg_map = NULL;
3182 this->virtual_grf_reg_count = 0;
3183 this->virtual_grf_array_size = 0;
3184 this->live_intervals_valid = false;
3185
3186 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3187
3188 this->uniforms = 0;
3189 }
3190
3191 vec4_visitor::~vec4_visitor()
3192 {
3193 hash_table_dtor(this->variable_ht);
3194 }
3195
3196
3197 void
3198 vec4_visitor::fail(const char *format, ...)
3199 {
3200 va_list va;
3201 char *msg;
3202
3203 if (failed)
3204 return;
3205
3206 failed = true;
3207
3208 va_start(va, format);
3209 msg = ralloc_vasprintf(mem_ctx, format, va);
3210 va_end(va);
3211 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3212
3213 this->fail_msg = msg;
3214
3215 if (debug_flag) {
3216 fprintf(stderr, "%s", msg);
3217 }
3218 }
3219
3220 } /* namespace brw */