0caa7a50f3980e4214fc689dd6728cde55256abd
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 extern "C" {
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(vec4_visitor *v,
34 enum opcode opcode, const dst_reg &dst,
35 const src_reg &src0, const src_reg &src1,
36 const src_reg &src2)
37 {
38 this->opcode = opcode;
39 this->dst = dst;
40 this->src[0] = src0;
41 this->src[1] = src1;
42 this->src[2] = src2;
43 this->saturate = false;
44 this->force_writemask_all = false;
45 this->no_dd_clear = false;
46 this->no_dd_check = false;
47 this->writes_accumulator = false;
48 this->conditional_mod = BRW_CONDITIONAL_NONE;
49 this->texture_offset = 0;
50 this->target = 0;
51 this->shadow_compare = false;
52 this->ir = v->base_ir;
53 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
54 this->header_present = false;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = v->current_annotation;
59 }
60
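/**
 * Append an already-constructed instruction to the end of the current
 * instruction stream and return it.
 */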
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 this->instructions.push_tail(inst);
65
66 return inst;
67 }
68
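/**
 * Insert new_inst immediately before inst in the given block, inheriting
 * inst's source IR pointer and annotation for debug output.
 */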
69 vec4_instruction *
70 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
71 vec4_instruction *new_inst)
72 {
73 new_inst->ir = inst->ir;
74 new_inst->annotation = inst->annotation;
75
76 inst->insert_before(block, new_inst);
77
78 return inst;
79 }
80
81 vec4_instruction *
82 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
83 src_reg src0, src_reg src1, src_reg src2)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
86 src0, src1, src2));
87 }
88
89
90 vec4_instruction *
91 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
92 {
93 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
94 }
95
96 vec4_instruction *
97 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
98 {
99 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
100 }
101
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
104 {
105 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode)
110 {
111 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
112 }
113
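/* The ALU* macros below generate one helper per opcode.  Each helper builds
 * a vec4_instruction without adding it to the instruction stream, so callers
 * typically wrap it in emit(), e.g. emit(ADD(dst, a, b)).  The ALU2_ACC
 * variants additionally mark the instruction as writing the accumulator.
 */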
114 #define ALU1(op) \
115 vec4_instruction * \
116 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
117 { \
118 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
119 src0); \
120 }
121
122 #define ALU2(op) \
123 vec4_instruction * \
124 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
125 const src_reg &src1) \
126 { \
127 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
128 src0, src1); \
129 }
130
131 #define ALU2_ACC(op) \
132 vec4_instruction * \
133 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
134 const src_reg &src1) \
135 { \
136 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
137 BRW_OPCODE_##op, dst, src0, src1); \
138 inst->writes_accumulator = true; \
139 return inst; \
140 }
141
142 #define ALU3(op) \
143 vec4_instruction * \
144 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
145 const src_reg &src1, const src_reg &src2) \
146 { \
147 assert(brw->gen >= 6); \
148 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
149 src0, src1, src2); \
150 }
151
152 ALU1(NOT)
153 ALU1(MOV)
154 ALU1(FRC)
155 ALU1(RNDD)
156 ALU1(RNDE)
157 ALU1(RNDZ)
158 ALU1(F32TO16)
159 ALU1(F16TO32)
160 ALU2(ADD)
161 ALU2(MUL)
162 ALU2_ACC(MACH)
163 ALU2(AND)
164 ALU2(OR)
165 ALU2(XOR)
166 ALU2(DP3)
167 ALU2(DP4)
168 ALU2(DPH)
169 ALU2(SHL)
170 ALU2(SHR)
171 ALU2(ASR)
172 ALU3(LRP)
173 ALU1(BFREV)
174 ALU3(BFE)
175 ALU2(BFI1)
176 ALU3(BFI2)
177 ALU1(FBH)
178 ALU1(FBL)
179 ALU1(CBIT)
180 ALU3(MAD)
181 ALU2_ACC(ADDC)
182 ALU2_ACC(SUBB)
183 ALU2(MAC)
184
185 /** Gen4 predicated IF. */
186 vec4_instruction *
187 vec4_visitor::IF(enum brw_predicate predicate)
188 {
189 vec4_instruction *inst;
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
192 inst->predicate = predicate;
193
194 return inst;
195 }
196
197 /** Gen6 IF with embedded comparison. */
198 vec4_instruction *
199 vec4_visitor::IF(src_reg src0, src_reg src1,
200 enum brw_conditional_mod condition)
201 {
202 assert(brw->gen == 6);
203
204 vec4_instruction *inst;
205
206 resolve_ud_negate(&src0);
207 resolve_ud_negate(&src1);
208
209 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
210 src0, src1);
211 inst->conditional_mod = condition;
212
213 return inst;
214 }
215
216 /**
217 * CMP: Sets the low bit of the destination channels with the result
218 * of the comparison, while the upper bits are undefined, and updates
219 * the flag register with the packed 16 bits of the result.
220 */
221 vec4_instruction *
222 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
223 enum brw_conditional_mod condition)
224 {
225 vec4_instruction *inst;
226
227 /* Original gen4 hardware does type conversion to the destination type
228 * before comparison, producing garbage results for floating
229 * point comparisons.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
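/**
 * Build (but do not emit) a gen4-style scratch-space read; its message
 * payload is staged starting at MRF 14 with a message length of 2.
 */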
246 vec4_instruction *
247 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
248 {
249 vec4_instruction *inst;
250
251 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
252 dst, index);
253 inst->base_mrf = 14;
254 inst->mlen = 2;
255
256 return inst;
257 }
258
259 vec4_instruction *
260 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
261 const src_reg &index)
262 {
263 vec4_instruction *inst;
264
265 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
266 dst, src, index);
267 inst->base_mrf = 13;
268 inst->mlen = 3;
269
270 return inst;
271 }
272
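/** Emit a DP2, DP3 or DP4 according to the number of components (2-4). */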
273 void
274 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
275 {
276 static enum opcode dot_opcodes[] = {
277 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
278 };
279
280 emit(dot_opcodes[elements - 2], dst, src0, src1);
281 }
282
283 src_reg
284 vec4_visitor::fix_3src_operand(src_reg src)
285 {
286 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
287 * able to use a vertical stride of zero to replicate the vec4 uniform, like
288 *
289 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
290 *
291 * But you can't, since vertical stride is always four in three-source
292 * instructions. Instead, insert a MOV instruction to do the replication so
293 * that the three-source instruction can consume it.
294 */
295
296 /* The MOV is only needed if the source is a uniform or immediate. */
297 if (src.file != UNIFORM && src.file != IMM)
298 return src;
299
300 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(MOV(expanded, src));
306 return src_reg(expanded);
307 }
308
309 src_reg
310 vec4_visitor::fix_math_operand(src_reg src)
311 {
312 /* The gen6 math instruction ignores the source modifiers --
313 * swizzle, abs, negate, and at least some parts of the register
314 * region description.
315 *
316 * Rather than trying to enumerate all these cases, *always* expand the
317 * operand to a temp GRF for gen6.
318 *
319 * For gen7, keep the operand as-is, except if immediate, which gen7 still
320 * can't use.
321 */
322
323 if (brw->gen == 7 && src.file != IMM)
324 return src;
325
326 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
327 expanded.type = src.type;
328 emit(MOV(expanded, src));
329 return src_reg(expanded);
330 }
331
332 void
333 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
334 {
335 src = fix_math_operand(src);
336
337 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
338 /* The gen6 math instruction must be align1, so we can't do
339 * writemasks.
340 */
341 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
342
343 emit(opcode, temp_dst, src);
344
345 emit(MOV(dst, src_reg(temp_dst)));
346 } else {
347 emit(opcode, dst, src);
348 }
349 }
350
351 void
352 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
353 {
354 vec4_instruction *inst = emit(opcode, dst, src);
355 inst->base_mrf = 1;
356 inst->mlen = 1;
357 }
358
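/**
 * Emit a single-source math instruction, dispatching to the
 * generation-specific paths above: a plain math instruction on gen8+, the
 * gen6 align1 workaround path on gen6/7, and an MRF-based math message on
 * original gen4/5.
 */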
359 void
360 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
361 {
362 switch (opcode) {
363 case SHADER_OPCODE_RCP:
364 case SHADER_OPCODE_RSQ:
365 case SHADER_OPCODE_SQRT:
366 case SHADER_OPCODE_EXP2:
367 case SHADER_OPCODE_LOG2:
368 case SHADER_OPCODE_SIN:
369 case SHADER_OPCODE_COS:
370 break;
371 default:
372 unreachable("not reached: bad math opcode");
373 }
374
375 if (brw->gen >= 8) {
376 emit(opcode, dst, src);
377 } else if (brw->gen >= 6) {
378 emit_math1_gen6(opcode, dst, src);
379 } else {
380 emit_math1_gen4(opcode, dst, src);
381 }
382 }
383
384 void
385 vec4_visitor::emit_math2_gen6(enum opcode opcode,
386 dst_reg dst, src_reg src0, src_reg src1)
387 {
388 src0 = fix_math_operand(src0);
389 src1 = fix_math_operand(src1);
390
391 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
392 /* The gen6 math instruction must be align1, so we can't do
393 * writemasks.
394 */
395 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
396 temp_dst.type = dst.type;
397
398 emit(opcode, temp_dst, src0, src1);
399
400 emit(MOV(dst, src_reg(temp_dst)));
401 } else {
402 emit(opcode, dst, src0, src1);
403 }
404 }
405
406 void
407 vec4_visitor::emit_math2_gen4(enum opcode opcode,
408 dst_reg dst, src_reg src0, src_reg src1)
409 {
410 vec4_instruction *inst = emit(opcode, dst, src0, src1);
411 inst->base_mrf = 1;
412 inst->mlen = 2;
413 }
414
415 void
416 vec4_visitor::emit_math(enum opcode opcode,
417 dst_reg dst, src_reg src0, src_reg src1)
418 {
419 switch (opcode) {
420 case SHADER_OPCODE_POW:
421 case SHADER_OPCODE_INT_QUOTIENT:
422 case SHADER_OPCODE_INT_REMAINDER:
423 break;
424 default:
425 unreachable("not reached: unsupported binary math opcode");
426 }
427
428 if (brw->gen >= 8) {
429 emit(opcode, dst, src0, src1);
430 } else if (brw->gen >= 6) {
431 emit_math2_gen6(opcode, dst, src0, src1);
432 } else {
433 emit_math2_gen4(opcode, dst, src0, src1);
434 }
435 }
436
437 void
438 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (brw->gen < 7) {
441 unreachable("ir_unop_pack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_UD);
445 assert(src0.type == BRW_REGISTER_TYPE_F);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the destination data type must be Word (W).
451 *
452 * The destination must be DWord-aligned and specify a horizontal stride
453 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
454 * each destination channel and the upper word is not modified.
455 *
456 * The above restriction implies that the f32to16 instruction must use
457 * align1 mode, because only in align1 mode is it possible to specify
458 * horizontal stride. We choose here to defy the hardware docs and emit
459 * align16 instructions.
460 *
461 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
462 * instructions. I was partially successful in that the code passed all
463 * tests. However, the code was dubiously correct and fragile, and the
464 * tests were not harsh enough to probe that frailty. Not trusting the
465 * code, I chose instead to remain in align16 mode in defiance of the hw
466 * docs).
467 *
468 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
469 * simulator, emitting a f32to16 in align16 mode with UD as destination
470 * data type is safe. The behavior differs from that specified in the PRM
471 * in that the upper word of each destination channel is cleared to 0.
472 */
473
474 dst_reg tmp_dst(this, glsl_type::uvec2_type);
475 src_reg tmp_src(tmp_dst);
476
477 #if 0
478 /* Verify the undocumented behavior on which the following instructions
479 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
480 * then the result of the bit-or instruction below will be incorrect.
481 *
482 * You should inspect the disasm output in order to verify that the MOV is
483 * not optimized away.
484 */
485 emit(MOV(tmp_dst, src_reg(0x12345678u)));
486 #endif
487
488 /* Give tmp the form below, where "." means untouched.
489 *
490 * w z y x w z y x
491 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
492 *
493 * That the upper word of each write-channel be 0 is required for the
494 * following bit-shift and bit-or instructions to work. Note that this
495 * relies on the undocumented hardware behavior mentioned above.
496 */
497 tmp_dst.writemask = WRITEMASK_XY;
498 emit(F32TO16(tmp_dst, src0));
499
500 /* Give the write-channels of dst the form:
501 * 0xhhhh0000
502 */
503 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
504 emit(SHL(dst, tmp_src, src_reg(16u)));
505
506 /* Finally, give the write-channels of dst the form of packHalf2x16's
507 * output:
508 * 0xhhhhllll
509 */
510 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
511 emit(OR(dst, src_reg(dst), tmp_src));
512 }
513
514 void
515 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
516 {
517 if (brw->gen < 7) {
518 unreachable("ir_unop_unpack_half_2x16 should be lowered");
519 }
520
521 assert(dst.type == BRW_REGISTER_TYPE_F);
522 assert(src0.type == BRW_REGISTER_TYPE_UD);
523
524 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
525 *
526 * Because this instruction does not have a 16-bit floating-point type,
527 * the source data type must be Word (W). The destination type must be
528 * F (Float).
529 *
530 * To use W as the source data type, we must adjust horizontal strides,
531 * which is only possible in align1 mode. All my [chadv] attempts at
532 * emitting align1 instructions for unpackHalf2x16 failed to pass the
533 * Piglit tests, so I gave up.
534 *
535 * I've verified that, on gen7 hardware and the simulator, it is safe to
536 * emit f16to32 in align16 mode with UD as source data type.
537 */
538
539 dst_reg tmp_dst(this, glsl_type::uvec2_type);
540 src_reg tmp_src(tmp_dst);
541
542 tmp_dst.writemask = WRITEMASK_X;
543 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
544
545 tmp_dst.writemask = WRITEMASK_Y;
546 emit(SHR(tmp_dst, src0, src_reg(16u)));
547
548 dst.writemask = WRITEMASK_XY;
549 emit(F16TO32(dst, tmp_src));
550 }
551
552 void
553 vec4_visitor::visit_instructions(const exec_list *list)
554 {
555 foreach_in_list(ir_instruction, ir, list) {
556 base_ir = ir;
557 ir->accept(this);
558 }
559 }
560
561
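/**
 * Returns the size of the given type in vec4 units; every scalar or vector
 * occupies a full vec4 slot, while samplers and atomic counters take none.
 */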
562 static int
563 type_size(const struct glsl_type *type)
564 {
565 unsigned int i;
566 int size;
567
568 switch (type->base_type) {
569 case GLSL_TYPE_UINT:
570 case GLSL_TYPE_INT:
571 case GLSL_TYPE_FLOAT:
572 case GLSL_TYPE_BOOL:
573 if (type->is_matrix()) {
574 return type->matrix_columns;
575 } else {
576 /* Regardless of the size of the vector, it gets a vec4. This is bad
577 * packing for things like floats, but otherwise arrays become a
578 * mess. Hopefully a later pass over the code can pack scalars
579 * down if appropriate.
580 */
581 return 1;
582 }
583 case GLSL_TYPE_ARRAY:
584 assert(type->length > 0);
585 return type_size(type->fields.array) * type->length;
586 case GLSL_TYPE_STRUCT:
587 size = 0;
588 for (i = 0; i < type->length; i++) {
589 size += type_size(type->fields.structure[i].type);
590 }
591 return size;
592 case GLSL_TYPE_SAMPLER:
593 /* Samplers take up no register space, since they're baked in at
594 * link time.
595 */
596 return 0;
597 case GLSL_TYPE_ATOMIC_UINT:
598 return 0;
599 case GLSL_TYPE_IMAGE:
600 case GLSL_TYPE_VOID:
601 case GLSL_TYPE_ERROR:
602 case GLSL_TYPE_INTERFACE:
603 unreachable("not reached");
604 }
605
606 return 0;
607 }
608
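/**
 * Allocate a new virtual GRF of the given size (in vec4 registers), growing
 * the size and offset bookkeeping arrays as needed, and return its index.
 */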
609 int
610 vec4_visitor::virtual_grf_alloc(int size)
611 {
612 if (virtual_grf_array_size <= virtual_grf_count) {
613 if (virtual_grf_array_size == 0)
614 virtual_grf_array_size = 16;
615 else
616 virtual_grf_array_size *= 2;
617 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
618 virtual_grf_array_size);
619 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
620 virtual_grf_array_size);
621 }
622 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
623 virtual_grf_reg_count += size;
624 virtual_grf_sizes[virtual_grf_count] = size;
625 return virtual_grf_count++;
626 }
627
628 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
629 {
630 init();
631
632 this->file = GRF;
633 this->reg = v->virtual_grf_alloc(type_size(type));
634
635 if (type->is_array() || type->is_record()) {
636 this->swizzle = BRW_SWIZZLE_NOOP;
637 } else {
638 this->swizzle = swizzle_for_size(type->vector_elements);
639 }
640
641 this->type = brw_type_for_base_type(type);
642 }
643
644 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
645 {
646 assert(size > 0);
647
648 init();
649
650 this->file = GRF;
651 this->reg = v->virtual_grf_alloc(type_size(type) * size);
652
653 this->swizzle = BRW_SWIZZLE_NOOP;
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
659 {
660 init();
661
662 this->file = GRF;
663 this->reg = v->virtual_grf_alloc(type_size(type));
664
665 if (type->is_array() || type->is_record()) {
666 this->writemask = WRITEMASK_XYZW;
667 } else {
668 this->writemask = (1 << type->vector_elements) - 1;
669 }
670
671 this->type = brw_type_for_base_type(type);
672 }
673
674 /* Our support for uniforms is piggy-backed on the struct
675 * gl_fragment_program, because that's where the values actually
676 * get stored, rather than in some global gl_shader_program uniform
677 * store.
678 */
679 void
680 vec4_visitor::setup_uniform_values(ir_variable *ir)
681 {
682 int namelen = strlen(ir->name);
683
684 /* The data for our (non-builtin) uniforms is stored in a series of
685 * gl_uniform_driver_storage structs for each subcomponent that
686 * glGetUniformLocation() could name. We know it's been set up in the same
687 * order we'd walk the type, so walk the list of storage and find anything
688 * with our name, or the prefix of a component that starts with our name.
689 */
690 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
691 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
692
693 if (strncmp(ir->name, storage->name, namelen) != 0 ||
694 (storage->name[namelen] != 0 &&
695 storage->name[namelen] != '.' &&
696 storage->name[namelen] != '[')) {
697 continue;
698 }
699
700 gl_constant_value *components = storage->storage;
701 unsigned vector_count = (MAX2(storage->array_elements, 1) *
702 storage->type->matrix_columns);
703
704 for (unsigned s = 0; s < vector_count; s++) {
705 assert(uniforms < uniform_array_size);
706 uniform_vector_size[uniforms] = storage->type->vector_elements;
707
708 int i;
709 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
710 stage_prog_data->param[uniforms * 4 + i] = components;
711 components++;
712 }
713 for (; i < 4; i++) {
714 static gl_constant_value zero = { 0.0 };
715 stage_prog_data->param[uniforms * 4 + i] = &zero;
716 }
717
718 uniforms++;
719 }
720 }
721 }
722
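/* Set up one vec4 uniform per user clip plane enabled in the compile key,
 * pointing at the clip plane coefficients selected for this context.
 */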
723 void
724 vec4_visitor::setup_uniform_clipplane_values()
725 {
726 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
727
728 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
729 assert(this->uniforms < uniform_array_size);
730 this->uniform_vector_size[this->uniforms] = 4;
731 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
732 this->userplane[i].type = BRW_REGISTER_TYPE_F;
733 for (int j = 0; j < 4; ++j) {
734 stage_prog_data->param[this->uniforms * 4 + j] =
735 (gl_constant_value *) &clip_planes[i][j];
736 }
737 ++this->uniforms;
738 }
739 }
740
741 /* Our support for builtin uniforms is even scarier than non-builtin.
742 * It sits on top of the PROG_STATE_VAR parameters that are
743 * automatically updated from GL context state.
744 */
745 void
746 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
747 {
748 const ir_state_slot *const slots = ir->get_state_slots();
749 assert(slots != NULL);
750
751 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
752 /* This state reference has already been set up by ir_to_mesa,
753 * but we'll get the same index back here. We can reference
754 * ParameterValues directly, since unlike brw_fs.cpp, we never
755 * add new state references during compile.
756 */
757 int index = _mesa_add_state_reference(this->prog->Parameters,
758 (gl_state_index *)slots[i].tokens);
759 gl_constant_value *values =
760 &this->prog->Parameters->ParameterValues[index][0];
761
762 assert(this->uniforms < uniform_array_size);
763 this->uniform_vector_size[this->uniforms] = 0;
764 /* Add each of the unique swizzled channels of the element.
765 * This will end up matching the size of the glsl_type of this field.
766 */
767 int last_swiz = -1;
768 for (unsigned int j = 0; j < 4; j++) {
769 int swiz = GET_SWZ(slots[i].swizzle, j);
770 last_swiz = swiz;
771
772 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
773 assert(this->uniforms < uniform_array_size);
774 if (swiz <= last_swiz)
775 this->uniform_vector_size[this->uniforms]++;
776 }
777 this->uniforms++;
778 }
779 }
780
781 dst_reg *
782 vec4_visitor::variable_storage(ir_variable *var)
783 {
784 return (dst_reg *)hash_table_find(this->variable_ht, var);
785 }
786
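/**
 * Evaluate a boolean rvalue and emit instructions that leave the result in
 * the flag register, reporting through *predicate which predication mode
 * (NORMAL, ALL4H or ANY4H) a consumer of the flag should use.
 */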
787 void
788 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
789 enum brw_predicate *predicate)
790 {
791 ir_expression *expr = ir->as_expression();
792
793 *predicate = BRW_PREDICATE_NORMAL;
794
795 if (expr && expr->operation != ir_binop_ubo_load) {
796 src_reg op[3];
797 vec4_instruction *inst;
798
799 assert(expr->get_num_operands() <= 3);
800 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
801 expr->operands[i]->accept(this);
802 op[i] = this->result;
803
804 resolve_ud_negate(&op[i]);
805 }
806
807 switch (expr->operation) {
808 case ir_unop_logic_not:
809 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
810 inst->conditional_mod = BRW_CONDITIONAL_Z;
811 break;
812
813 case ir_binop_logic_xor:
814 inst = emit(XOR(dst_null_d(), op[0], op[1]));
815 inst->conditional_mod = BRW_CONDITIONAL_NZ;
816 break;
817
818 case ir_binop_logic_or:
819 inst = emit(OR(dst_null_d(), op[0], op[1]));
820 inst->conditional_mod = BRW_CONDITIONAL_NZ;
821 break;
822
823 case ir_binop_logic_and:
824 inst = emit(AND(dst_null_d(), op[0], op[1]));
825 inst->conditional_mod = BRW_CONDITIONAL_NZ;
826 break;
827
828 case ir_unop_f2b:
829 if (brw->gen >= 6) {
830 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
831 } else {
832 inst = emit(MOV(dst_null_f(), op[0]));
833 inst->conditional_mod = BRW_CONDITIONAL_NZ;
834 }
835 break;
836
837 case ir_unop_i2b:
838 if (brw->gen >= 6) {
839 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
840 } else {
841 inst = emit(MOV(dst_null_d(), op[0]));
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 }
844 break;
845
846 case ir_binop_all_equal:
847 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
848 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
849 break;
850
851 case ir_binop_any_nequal:
852 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
853 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
854 break;
855
856 case ir_unop_any:
857 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
858 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
859 break;
860
861 case ir_binop_greater:
862 case ir_binop_gequal:
863 case ir_binop_less:
864 case ir_binop_lequal:
865 case ir_binop_equal:
866 case ir_binop_nequal:
867 emit(CMP(dst_null_d(), op[0], op[1],
868 brw_conditional_for_comparison(expr->operation)));
869 break;
870
871 case ir_triop_csel: {
872 /* Expand the boolean condition into the flag register. */
873 inst = emit(MOV(dst_null_d(), op[0]));
874 inst->conditional_mod = BRW_CONDITIONAL_NZ;
875
876 /* Select which boolean to return. */
877 dst_reg temp(this, expr->operands[1]->type);
878 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
879 inst->predicate = BRW_PREDICATE_NORMAL;
880
881 /* Expand the result to a condition code. */
882 inst = emit(MOV(dst_null_d(), src_reg(temp)));
883 inst->conditional_mod = BRW_CONDITIONAL_NZ;
884 break;
885 }
886
887 default:
888 unreachable("not reached");
889 }
890 return;
891 }
892
893 ir->accept(this);
894
895 resolve_ud_negate(&this->result);
896
897 if (brw->gen >= 6) {
898 vec4_instruction *inst = emit(AND(dst_null_d(),
899 this->result, src_reg(1)));
900 inst->conditional_mod = BRW_CONDITIONAL_NZ;
901 } else {
902 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
903 inst->conditional_mod = BRW_CONDITIONAL_NZ;
904 }
905 }
906
907 /**
908 * Emit a gen6 IF statement with the comparison folded into the IF
909 * instruction.
910 */
911 void
912 vec4_visitor::emit_if_gen6(ir_if *ir)
913 {
914 ir_expression *expr = ir->condition->as_expression();
915
916 if (expr && expr->operation != ir_binop_ubo_load) {
917 src_reg op[3];
918 dst_reg temp;
919
920 assert(expr->get_num_operands() <= 3);
921 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
922 expr->operands[i]->accept(this);
923 op[i] = this->result;
924 }
925
926 switch (expr->operation) {
927 case ir_unop_logic_not:
928 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
929 return;
930
931 case ir_binop_logic_xor:
932 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
933 return;
934
935 case ir_binop_logic_or:
936 temp = dst_reg(this, glsl_type::bool_type);
937 emit(OR(temp, op[0], op[1]));
938 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
939 return;
940
941 case ir_binop_logic_and:
942 temp = dst_reg(this, glsl_type::bool_type);
943 emit(AND(temp, op[0], op[1]));
944 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
945 return;
946
947 case ir_unop_f2b:
948 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
949 return;
950
951 case ir_unop_i2b:
952 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
953 return;
954
955 case ir_binop_greater:
956 case ir_binop_gequal:
957 case ir_binop_less:
958 case ir_binop_lequal:
959 case ir_binop_equal:
960 case ir_binop_nequal:
961 emit(IF(op[0], op[1],
962 brw_conditional_for_comparison(expr->operation)));
963 return;
964
965 case ir_binop_all_equal:
966 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
967 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
968 return;
969
970 case ir_binop_any_nequal:
971 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
972 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
973 return;
974
975 case ir_unop_any:
976 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
977 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
978 return;
979
980 case ir_triop_csel: {
981 /* Expand the boolean condition into the flag register. */
982 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
983 inst->conditional_mod = BRW_CONDITIONAL_NZ;
984
985 /* Select which boolean to return. */
986 dst_reg temp(this, expr->operands[1]->type);
987 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
988 inst->predicate = BRW_PREDICATE_NORMAL;
989
990 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
991 return;
992 }
993
994 default:
995 unreachable("not reached");
996 }
997 return;
998 }
999
1000 ir->condition->accept(this);
1001
1002 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1003 }
1004
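/* Allocate backing storage for a variable declaration (input, output,
 * temporary, uniform or system value) and remember it in variable_ht.
 */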
1005 void
1006 vec4_visitor::visit(ir_variable *ir)
1007 {
1008 dst_reg *reg = NULL;
1009
1010 if (variable_storage(ir))
1011 return;
1012
1013 switch (ir->data.mode) {
1014 case ir_var_shader_in:
1015 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1016 break;
1017
1018 case ir_var_shader_out:
1019 reg = new(mem_ctx) dst_reg(this, ir->type);
1020
1021 for (int i = 0; i < type_size(ir->type); i++) {
1022 output_reg[ir->data.location + i] = *reg;
1023 output_reg[ir->data.location + i].reg_offset = i;
1024 output_reg[ir->data.location + i].type =
1025 brw_type_for_base_type(ir->type->get_scalar_type());
1026 output_reg_annotation[ir->data.location + i] = ir->name;
1027 }
1028 break;
1029
1030 case ir_var_auto:
1031 case ir_var_temporary:
1032 reg = new(mem_ctx) dst_reg(this, ir->type);
1033 break;
1034
1035 case ir_var_uniform:
1036 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1037
1038 /* Thanks to the lower_ubo_reference pass, we will see only
1039 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1040 * variables, so no need for them to be in variable_ht.
1041 *
1042 * Some uniforms, such as samplers and atomic counters, have no actual
1043 * storage, so we should ignore them.
1044 */
1045 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1046 return;
1047
1048 /* Track how big the whole uniform variable is, in case we need to put a
1049 * copy of its data into pull constants for array access.
1050 */
1051 assert(this->uniforms < uniform_array_size);
1052 this->uniform_size[this->uniforms] = type_size(ir->type);
1053
1054 if (!strncmp(ir->name, "gl_", 3)) {
1055 setup_builtin_uniform_values(ir);
1056 } else {
1057 setup_uniform_values(ir);
1058 }
1059 break;
1060
1061 case ir_var_system_value:
1062 reg = make_reg_for_system_value(ir);
1063 break;
1064
1065 default:
1066 unreachable("not reached");
1067 }
1068
1069 reg->type = brw_type_for_base_type(ir->type);
1070 hash_table_insert(this->variable_ht, reg, ir);
1071 }
1072
1073 void
1074 vec4_visitor::visit(ir_loop *ir)
1075 {
1076 /* We don't want debugging output to print the whole body of the
1077 * loop as the annotation.
1078 */
1079 this->base_ir = NULL;
1080
1081 emit(BRW_OPCODE_DO);
1082
1083 visit_instructions(&ir->body_instructions);
1084
1085 emit(BRW_OPCODE_WHILE);
1086 }
1087
1088 void
1089 vec4_visitor::visit(ir_loop_jump *ir)
1090 {
1091 switch (ir->mode) {
1092 case ir_loop_jump::jump_break:
1093 emit(BRW_OPCODE_BREAK);
1094 break;
1095 case ir_loop_jump::jump_continue:
1096 emit(BRW_OPCODE_CONTINUE);
1097 break;
1098 }
1099 }
1100
1101
1102 void
1103 vec4_visitor::visit(ir_function_signature *)
1104 {
1105 unreachable("not reached");
1106 }
1107
1108 void
1109 vec4_visitor::visit(ir_function *ir)
1110 {
1111 /* Ignore function bodies other than main() -- we shouldn't see calls to
1112 * them since they should all be inlined.
1113 */
1114 if (strcmp(ir->name, "main") == 0) {
1115 const ir_function_signature *sig;
1116 exec_list empty;
1117
1118 sig = ir->matching_signature(NULL, &empty, false);
1119
1120 assert(sig);
1121
1122 visit_instructions(&sig->body);
1123 }
1124 }
1125
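/**
 * Try to rewrite an add of a multiply as a single MAD.  Returns false
 * without emitting anything if the expression doesn't match or the
 * hardware/type combination can't use MAD.
 */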
1126 bool
1127 vec4_visitor::try_emit_mad(ir_expression *ir)
1128 {
1129 /* 3-src instructions were introduced in gen6. */
1130 if (brw->gen < 6)
1131 return false;
1132
1133 /* MAD can only handle floating-point data. */
1134 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1135 return false;
1136
1137 ir_rvalue *nonmul = ir->operands[1];
1138 ir_expression *mul = ir->operands[0]->as_expression();
1139
1140 if (!mul || mul->operation != ir_binop_mul) {
1141 nonmul = ir->operands[0];
1142 mul = ir->operands[1]->as_expression();
1143
1144 if (!mul || mul->operation != ir_binop_mul)
1145 return false;
1146 }
1147
1148 nonmul->accept(this);
1149 src_reg src0 = fix_3src_operand(this->result);
1150
1151 mul->operands[0]->accept(this);
1152 src_reg src1 = fix_3src_operand(this->result);
1153
1154 mul->operands[1]->accept(this);
1155 src_reg src2 = fix_3src_operand(this->result);
1156
1157 this->result = src_reg(this, ir->type);
1158 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1159
1160 return true;
1161 }
1162
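/**
 * Try to emit b2f(comparison) as a CMP followed by a predicated SEL of
 * 1.0f, relying on CMP writing 0 to the destination when false.
 */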
1163 bool
1164 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1165 {
1166 /* This optimization relies on CMP setting the destination to 0 when
1167 * false. Early hardware only sets the least significant bit, and
1168 * leaves the other bits undefined. So we can't use it.
1169 */
1170 if (brw->gen < 6)
1171 return false;
1172
1173 ir_expression *const cmp = ir->operands[0]->as_expression();
1174
1175 if (cmp == NULL)
1176 return false;
1177
1178 switch (cmp->operation) {
1179 case ir_binop_less:
1180 case ir_binop_greater:
1181 case ir_binop_lequal:
1182 case ir_binop_gequal:
1183 case ir_binop_equal:
1184 case ir_binop_nequal:
1185 break;
1186
1187 default:
1188 return false;
1189 }
1190
1191 cmp->operands[0]->accept(this);
1192 const src_reg cmp_src0 = this->result;
1193
1194 cmp->operands[1]->accept(this);
1195 const src_reg cmp_src1 = this->result;
1196
1197 this->result = src_reg(this, ir->type);
1198
1199 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1200 brw_conditional_for_comparison(cmp->operation)));
1201
1202 /* If the comparison is false, this->result will just happen to be zero.
1203 */
1204 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1205 this->result, src_reg(1.0f));
1206 inst->predicate = BRW_PREDICATE_NORMAL;
1207 inst->predicate_inverse = true;
1208
1209 return true;
1210 }
1211
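/**
 * Emit a MIN/MAX: a single SEL with a conditional mod on gen6+, or a CMP
 * followed by a predicated SEL on earlier hardware.
 */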
1212 void
1213 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1214 src_reg src0, src_reg src1)
1215 {
1216 vec4_instruction *inst;
1217
1218 if (brw->gen >= 6) {
1219 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1220 inst->conditional_mod = conditionalmod;
1221 } else {
1222 emit(CMP(dst, src0, src1, conditionalmod));
1223
1224 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1225 inst->predicate = BRW_PREDICATE_NORMAL;
1226 }
1227 }
1228
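/**
 * Emit a linear interpolation x*(1-a) + y*a, using the LRP instruction on
 * gen6+ and an expanded MUL/ADD sequence on older generations.
 */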
1229 void
1230 vec4_visitor::emit_lrp(const dst_reg &dst,
1231 const src_reg &x, const src_reg &y, const src_reg &a)
1232 {
1233 if (brw->gen >= 6) {
1234 /* Note that the instruction's argument order is reversed from GLSL
1235 * and the IR.
1236 */
1237 emit(LRP(dst,
1238 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1239 } else {
1240 /* Earlier generations don't support three source operations, so we
1241 * need to emit x*(1-a) + y*a.
1242 */
1243 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1244 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1245 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1246 y_times_a.writemask = dst.writemask;
1247 one_minus_a.writemask = dst.writemask;
1248 x_times_one_minus_a.writemask = dst.writemask;
1249
1250 emit(MUL(y_times_a, y, a));
1251 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1252 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1253 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1254 }
1255 }
1256
1257 void
1258 vec4_visitor::visit(ir_expression *ir)
1259 {
1260 unsigned int operand;
1261 src_reg op[Elements(ir->operands)];
1262 src_reg result_src;
1263 dst_reg result_dst;
1264 vec4_instruction *inst;
1265
1266 if (ir->operation == ir_binop_add) {
1267 if (try_emit_mad(ir))
1268 return;
1269 }
1270
1271 if (ir->operation == ir_unop_b2f) {
1272 if (try_emit_b2f_of_compare(ir))
1273 return;
1274 }
1275
1276 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1277 this->result.file = BAD_FILE;
1278 ir->operands[operand]->accept(this);
1279 if (this->result.file == BAD_FILE) {
1280 fprintf(stderr, "Failed to get tree for expression operand:\n");
1281 ir->operands[operand]->fprint(stderr);
1282 exit(1);
1283 }
1284 op[operand] = this->result;
1285
1286 /* Matrix expression operands should have been broken down to vector
1287 * operations already.
1288 */
1289 assert(!ir->operands[operand]->type->is_matrix());
1290 }
1291
1292 int vector_elements = ir->operands[0]->type->vector_elements;
1293 if (ir->operands[1]) {
1294 vector_elements = MAX2(vector_elements,
1295 ir->operands[1]->type->vector_elements);
1296 }
1297
1298 this->result.file = BAD_FILE;
1299
1300 /* Storage for our result. Ideally for an assignment we'd be using
1301 * the actual storage for the result here, instead.
1302 */
1303 result_src = src_reg(this, ir->type);
1304 /* convenience for the emit functions below. */
1305 result_dst = dst_reg(result_src);
1306 /* If nothing special happens, this is the result. */
1307 this->result = result_src;
1308 /* Limit writes to the channels that will be used by result_src later.
1309 * This does limit this temp's use as a temporary for multi-instruction
1310 * sequences.
1311 */
1312 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1313
1314 switch (ir->operation) {
1315 case ir_unop_logic_not:
1316 if (ctx->Const.UniformBooleanTrue != 1) {
1317 emit(NOT(result_dst, op[0]));
1318 } else {
1319 emit(XOR(result_dst, op[0], src_reg(1u)));
1320 }
1321 break;
1322 case ir_unop_neg:
1323 op[0].negate = !op[0].negate;
1324 emit(MOV(result_dst, op[0]));
1325 break;
1326 case ir_unop_abs:
1327 op[0].abs = true;
1328 op[0].negate = false;
1329 emit(MOV(result_dst, op[0]));
1330 break;
1331
1332 case ir_unop_sign:
1333 if (ir->type->is_float()) {
1334 /* AND(val, 0x80000000) gives the sign bit.
1335 *
1336 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1337 * zero.
1338 */
1339 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1340
1341 op[0].type = BRW_REGISTER_TYPE_UD;
1342 result_dst.type = BRW_REGISTER_TYPE_UD;
1343 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1344
1345 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1346 inst->predicate = BRW_PREDICATE_NORMAL;
1347
1348 this->result.type = BRW_REGISTER_TYPE_F;
1349 } else {
1350 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1351 * -> non-negative val generates 0x00000000.
1352 * Predicated OR sets 1 if val is positive.
1353 */
1354 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1355
1356 emit(ASR(result_dst, op[0], src_reg(31)));
1357
1358 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1359 inst->predicate = BRW_PREDICATE_NORMAL;
1360 }
1361 break;
1362
1363 case ir_unop_rcp:
1364 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1365 break;
1366
1367 case ir_unop_exp2:
1368 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1369 break;
1370 case ir_unop_log2:
1371 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1372 break;
1373 case ir_unop_exp:
1374 case ir_unop_log:
1375 unreachable("not reached: should be handled by ir_explog_to_explog2");
1376 case ir_unop_sin:
1377 case ir_unop_sin_reduced:
1378 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1379 break;
1380 case ir_unop_cos:
1381 case ir_unop_cos_reduced:
1382 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1383 break;
1384
1385 case ir_unop_dFdx:
1386 case ir_unop_dFdx_coarse:
1387 case ir_unop_dFdx_fine:
1388 case ir_unop_dFdy:
1389 case ir_unop_dFdy_coarse:
1390 case ir_unop_dFdy_fine:
1391 unreachable("derivatives not valid in vertex shader");
1392
1393 case ir_unop_bitfield_reverse:
1394 emit(BFREV(result_dst, op[0]));
1395 break;
1396 case ir_unop_bit_count:
1397 emit(CBIT(result_dst, op[0]));
1398 break;
1399 case ir_unop_find_msb: {
1400 src_reg temp = src_reg(this, glsl_type::uint_type);
1401
1402 inst = emit(FBH(dst_reg(temp), op[0]));
1403 inst->dst.writemask = WRITEMASK_XYZW;
1404
1405 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1406 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1407 * subtract the result from 31 to convert the MSB count into an LSB count.
1408 */
1409
1410 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1411 temp.swizzle = BRW_SWIZZLE_NOOP;
1412 emit(MOV(result_dst, temp));
1413
1414 src_reg src_tmp = src_reg(result_dst);
1415 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1416
1417 src_tmp.negate = true;
1418 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1419 inst->predicate = BRW_PREDICATE_NORMAL;
1420 break;
1421 }
1422 case ir_unop_find_lsb:
1423 emit(FBL(result_dst, op[0]));
1424 break;
1425 case ir_unop_saturate:
1426 inst = emit(MOV(result_dst, op[0]));
1427 inst->saturate = true;
1428 break;
1429
1430 case ir_unop_noise:
1431 unreachable("not reached: should be handled by lower_noise");
1432
1433 case ir_binop_add:
1434 emit(ADD(result_dst, op[0], op[1]));
1435 break;
1436 case ir_binop_sub:
1437 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1438
1439 case ir_binop_mul:
1440 if (brw->gen < 8 && ir->type->is_integer()) {
1441 /* For integer multiplication, the MUL uses the low 16 bits of one of
1442 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1443 * accumulates in the contribution of the upper 16 bits of that
1444 * operand. If we can determine that one of the args is in the low
1445 * 16 bits, though, we can just emit a single MUL.
1446 */
1447 if (ir->operands[0]->is_uint16_constant()) {
1448 if (brw->gen < 7)
1449 emit(MUL(result_dst, op[0], op[1]));
1450 else
1451 emit(MUL(result_dst, op[1], op[0]));
1452 } else if (ir->operands[1]->is_uint16_constant()) {
1453 if (brw->gen < 7)
1454 emit(MUL(result_dst, op[1], op[0]));
1455 else
1456 emit(MUL(result_dst, op[0], op[1]));
1457 } else {
1458 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1459
1460 emit(MUL(acc, op[0], op[1]));
1461 emit(MACH(dst_null_d(), op[0], op[1]));
1462 emit(MOV(result_dst, src_reg(acc)));
1463 }
1464 } else {
1465 emit(MUL(result_dst, op[0], op[1]));
1466 }
1467 break;
1468 case ir_binop_imul_high: {
1469 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1470
1471 emit(MUL(acc, op[0], op[1]));
1472 emit(MACH(result_dst, op[0], op[1]));
1473 break;
1474 }
1475 case ir_binop_div:
1476 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1477 assert(ir->type->is_integer());
1478 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1479 break;
1480 case ir_binop_carry: {
1481 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1482
1483 emit(ADDC(dst_null_ud(), op[0], op[1]));
1484 emit(MOV(result_dst, src_reg(acc)));
1485 break;
1486 }
1487 case ir_binop_borrow: {
1488 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1489
1490 emit(SUBB(dst_null_ud(), op[0], op[1]));
1491 emit(MOV(result_dst, src_reg(acc)));
1492 break;
1493 }
1494 case ir_binop_mod:
1495 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1496 assert(ir->type->is_integer());
1497 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1498 break;
1499
1500 case ir_binop_less:
1501 case ir_binop_greater:
1502 case ir_binop_lequal:
1503 case ir_binop_gequal:
1504 case ir_binop_equal:
1505 case ir_binop_nequal: {
1506 emit(CMP(result_dst, op[0], op[1],
1507 brw_conditional_for_comparison(ir->operation)));
1508 if (ctx->Const.UniformBooleanTrue == 1) {
1509 emit(AND(result_dst, result_src, src_reg(1u)));
1510 }
1511 break;
1512 }
1513
1514 case ir_binop_all_equal:
1515 /* "==" operator producing a scalar boolean. */
1516 if (ir->operands[0]->type->is_vector() ||
1517 ir->operands[1]->type->is_vector()) {
1518 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1519 emit(MOV(result_dst, src_reg(0)));
1520 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1521 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1522 } else {
1523 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1524 if (ctx->Const.UniformBooleanTrue == 1) {
1525 emit(AND(result_dst, result_src, src_reg(1u)));
1526 }
1527 }
1528 break;
1529 case ir_binop_any_nequal:
1530 /* "!=" operator producing a scalar boolean. */
1531 if (ir->operands[0]->type->is_vector() ||
1532 ir->operands[1]->type->is_vector()) {
1533 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1534
1535 emit(MOV(result_dst, src_reg(0)));
1536 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1537 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1538 } else {
1539 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1540 if (ctx->Const.UniformBooleanTrue == 1) {
1541 emit(AND(result_dst, result_src, src_reg(1u)));
1542 }
1543 }
1544 break;
1545
1546 case ir_unop_any:
1547 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1548 emit(MOV(result_dst, src_reg(0)));
1549
1550 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1551 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1552 break;
1553
1554 case ir_binop_logic_xor:
1555 emit(XOR(result_dst, op[0], op[1]));
1556 break;
1557
1558 case ir_binop_logic_or:
1559 emit(OR(result_dst, op[0], op[1]));
1560 break;
1561
1562 case ir_binop_logic_and:
1563 emit(AND(result_dst, op[0], op[1]));
1564 break;
1565
1566 case ir_binop_dot:
1567 assert(ir->operands[0]->type->is_vector());
1568 assert(ir->operands[0]->type == ir->operands[1]->type);
1569 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1570 break;
1571
1572 case ir_unop_sqrt:
1573 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1574 break;
1575 case ir_unop_rsq:
1576 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1577 break;
1578
1579 case ir_unop_bitcast_i2f:
1580 case ir_unop_bitcast_u2f:
1581 this->result = op[0];
1582 this->result.type = BRW_REGISTER_TYPE_F;
1583 break;
1584
1585 case ir_unop_bitcast_f2i:
1586 this->result = op[0];
1587 this->result.type = BRW_REGISTER_TYPE_D;
1588 break;
1589
1590 case ir_unop_bitcast_f2u:
1591 this->result = op[0];
1592 this->result.type = BRW_REGISTER_TYPE_UD;
1593 break;
1594
1595 case ir_unop_i2f:
1596 case ir_unop_i2u:
1597 case ir_unop_u2i:
1598 case ir_unop_u2f:
1599 case ir_unop_f2i:
1600 case ir_unop_f2u:
1601 emit(MOV(result_dst, op[0]));
1602 break;
1603 case ir_unop_b2i:
1604 if (ctx->Const.UniformBooleanTrue != 1) {
1605 emit(AND(result_dst, op[0], src_reg(1u)));
1606 } else {
1607 emit(MOV(result_dst, op[0]));
1608 }
1609 break;
1610 case ir_unop_b2f:
1611 if (ctx->Const.UniformBooleanTrue != 1) {
1612 op[0].type = BRW_REGISTER_TYPE_UD;
1613 result_dst.type = BRW_REGISTER_TYPE_UD;
1614 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1615 result_dst.type = BRW_REGISTER_TYPE_F;
1616 } else {
1617 emit(MOV(result_dst, op[0]));
1618 }
1619 break;
1620 case ir_unop_f2b:
1621 case ir_unop_i2b:
1622 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1623 if (ctx->Const.UniformBooleanTrue == 1) {
1624 emit(AND(result_dst, result_src, src_reg(1u)));
1625 }
1626 break;
1627
1628 case ir_unop_trunc:
1629 emit(RNDZ(result_dst, op[0]));
1630 break;
1631 case ir_unop_ceil:
1632 op[0].negate = !op[0].negate;
1633 inst = emit(RNDD(result_dst, op[0]));
1634 this->result.negate = true;
1635 break;
1636 case ir_unop_floor:
1637 inst = emit(RNDD(result_dst, op[0]));
1638 break;
1639 case ir_unop_fract:
1640 inst = emit(FRC(result_dst, op[0]));
1641 break;
1642 case ir_unop_round_even:
1643 emit(RNDE(result_dst, op[0]));
1644 break;
1645
1646 case ir_binop_min:
1647 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1648 break;
1649 case ir_binop_max:
1650 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1651 break;
1652
1653 case ir_binop_pow:
1654 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1655 break;
1656
1657 case ir_unop_bit_not:
1658 inst = emit(NOT(result_dst, op[0]));
1659 break;
1660 case ir_binop_bit_and:
1661 inst = emit(AND(result_dst, op[0], op[1]));
1662 break;
1663 case ir_binop_bit_xor:
1664 inst = emit(XOR(result_dst, op[0], op[1]));
1665 break;
1666 case ir_binop_bit_or:
1667 inst = emit(OR(result_dst, op[0], op[1]));
1668 break;
1669
1670 case ir_binop_lshift:
1671 inst = emit(SHL(result_dst, op[0], op[1]));
1672 break;
1673
1674 case ir_binop_rshift:
1675 if (ir->type->base_type == GLSL_TYPE_INT)
1676 inst = emit(ASR(result_dst, op[0], op[1]));
1677 else
1678 inst = emit(SHR(result_dst, op[0], op[1]));
1679 break;
1680
1681 case ir_binop_bfm:
1682 emit(BFI1(result_dst, op[0], op[1]));
1683 break;
1684
1685 case ir_binop_ubo_load: {
1686 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1687 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1688 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1689 src_reg offset;
1690
1691 /* Now, load the vector from that offset. */
1692 assert(ir->type->is_vector() || ir->type->is_scalar());
1693
1694 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1695 packed_consts.type = result.type;
1696 src_reg surf_index;
1697
1698 if (const_uniform_block) {
1699 /* The block index is a constant, so just emit the binding table entry
1700 * as an immediate.
1701 */
1702 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1703 const_uniform_block->value.u[0]);
1704 } else {
1705 /* The block index is not a constant. Evaluate the index expression
1706 * per-channel and add the base UBO index; the generator will select
1707 * a value from any live channel.
1708 */
1709 surf_index = src_reg(this, glsl_type::uint_type);
1710 emit(ADD(dst_reg(surf_index), op[0],
1711 src_reg(prog_data->base.binding_table.ubo_start)));
1712
1713 /* Assume this may touch any UBO. It would be nice to provide
1714 * a tighter bound, but the array information is already lowered away.
1715 */
1716 brw_mark_surface_used(&prog_data->base,
1717 prog_data->base.binding_table.ubo_start +
1718 shader_prog->NumUniformBlocks - 1);
1719 }
1720
1721 if (const_offset_ir) {
1722 if (brw->gen >= 8) {
1723 /* Store the offset in a GRF so we can send-from-GRF. */
1724 offset = src_reg(this, glsl_type::int_type);
1725 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1726 } else {
1727 /* Immediates are fine on older generations since they'll be moved
1728 * to a (potentially fake) MRF at the generator level.
1729 */
1730 offset = src_reg(const_offset / 16);
1731 }
1732 } else {
1733 offset = src_reg(this, glsl_type::uint_type);
1734 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1735 }
1736
1737 if (brw->gen >= 7) {
1738 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1739 grf_offset.type = offset.type;
1740
1741 emit(MOV(grf_offset, offset));
1742
1743 emit(new(mem_ctx) vec4_instruction(this,
1744 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1745 dst_reg(packed_consts),
1746 surf_index,
1747 src_reg(grf_offset)));
1748 } else {
1749 vec4_instruction *pull =
1750 emit(new(mem_ctx) vec4_instruction(this,
1751 VS_OPCODE_PULL_CONSTANT_LOAD,
1752 dst_reg(packed_consts),
1753 surf_index,
1754 offset));
1755 pull->base_mrf = 14;
1756 pull->mlen = 1;
1757 }
1758
1759 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1760 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1761 const_offset % 16 / 4,
1762 const_offset % 16 / 4,
1763 const_offset % 16 / 4);
1764
1765 /* UBO bools are any nonzero int. We need to convert them to use the
1766 * value of true stored in ctx->Const.UniformBooleanTrue.
1767 */
1768 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1769 emit(CMP(result_dst, packed_consts, src_reg(0u),
1770 BRW_CONDITIONAL_NZ));
1771 if (ctx->Const.UniformBooleanTrue == 1) {
1772 emit(AND(result_dst, result, src_reg(1u)));
1773 }
1774 } else {
1775 emit(MOV(result_dst, packed_consts));
1776 }
1777 break;
1778 }
1779
1780 case ir_binop_vector_extract:
1781 unreachable("should have been lowered by vec_index_to_cond_assign");
1782
1783 case ir_triop_fma:
1784 op[0] = fix_3src_operand(op[0]);
1785 op[1] = fix_3src_operand(op[1]);
1786 op[2] = fix_3src_operand(op[2]);
1787 /* Note that the instruction's argument order is reversed from GLSL
1788 * and the IR.
1789 */
1790 emit(MAD(result_dst, op[2], op[1], op[0]));
1791 break;
1792
1793 case ir_triop_lrp:
1794 emit_lrp(result_dst, op[0], op[1], op[2]);
1795 break;
1796
1797 case ir_triop_csel:
1798 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1799 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1800 inst->predicate = BRW_PREDICATE_NORMAL;
1801 break;
1802
1803 case ir_triop_bfi:
1804 op[0] = fix_3src_operand(op[0]);
1805 op[1] = fix_3src_operand(op[1]);
1806 op[2] = fix_3src_operand(op[2]);
1807 emit(BFI2(result_dst, op[0], op[1], op[2]));
1808 break;
1809
1810 case ir_triop_bitfield_extract:
1811 op[0] = fix_3src_operand(op[0]);
1812 op[1] = fix_3src_operand(op[1]);
1813 op[2] = fix_3src_operand(op[2]);
1814 /* Note that the instruction's argument order is reversed from GLSL
1815 * and the IR.
1816 */
1817 emit(BFE(result_dst, op[2], op[1], op[0]));
1818 break;
1819
1820 case ir_triop_vector_insert:
1821 unreachable("should have been lowered by lower_vector_insert");
1822
1823 case ir_quadop_bitfield_insert:
1824 unreachable("not reached: should be handled by "
1825 "bitfield_insert_to_bfm_bfi\n");
1826
1827 case ir_quadop_vector:
1828 unreachable("not reached: should be handled by lower_quadop_vector");
1829
1830 case ir_unop_pack_half_2x16:
1831 emit_pack_half_2x16(result_dst, op[0]);
1832 break;
1833 case ir_unop_unpack_half_2x16:
1834 emit_unpack_half_2x16(result_dst, op[0]);
1835 break;
1836 case ir_unop_pack_snorm_2x16:
1837 case ir_unop_pack_snorm_4x8:
1838 case ir_unop_pack_unorm_2x16:
1839 case ir_unop_pack_unorm_4x8:
1840 case ir_unop_unpack_snorm_2x16:
1841 case ir_unop_unpack_snorm_4x8:
1842 case ir_unop_unpack_unorm_2x16:
1843 case ir_unop_unpack_unorm_4x8:
1844 unreachable("not reached: should be handled by lower_packing_builtins");
1845 case ir_unop_unpack_half_2x16_split_x:
1846 case ir_unop_unpack_half_2x16_split_y:
1847 case ir_binop_pack_half_2x16_split:
1848 case ir_unop_interpolate_at_centroid:
1849 case ir_binop_interpolate_at_sample:
1850 case ir_binop_interpolate_at_offset:
1851 unreachable("not reached: should not occur in vertex shader");
1852 case ir_binop_ldexp:
1853 unreachable("not reached: should be handled by ldexp_to_arith()");
1854 }
1855 }
1856
1857
1858 void
1859 vec4_visitor::visit(ir_swizzle *ir)
1860 {
1861 src_reg src;
1862 int i = 0;
1863 int swizzle[4];
1864
1865 /* Note that this is only swizzles in expressions, not those on the left
1866 * hand side of an assignment, which do write masking. See ir_assignment
1867 * for that.
1868 */
1869
1870 ir->val->accept(this);
1871 src = this->result;
1872 assert(src.file != BAD_FILE);
1873
1874 for (i = 0; i < ir->type->vector_elements; i++) {
1875 switch (i) {
1876 case 0:
1877 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1878 break;
1879 case 1:
1880 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1881 break;
1882 case 2:
1883 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1884 break;
1885 case 3:
1886 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1887 break;
1888 }
1889 }
1890 for (; i < 4; i++) {
1891 /* Replicate the last channel out. */
1892 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1893 }
1894
1895 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1896
1897 this->result = src;
1898 }
1899
1900 void
1901 vec4_visitor::visit(ir_dereference_variable *ir)
1902 {
1903 const struct glsl_type *type = ir->type;
1904 dst_reg *reg = variable_storage(ir->var);
1905
1906 if (!reg) {
1907 fail("Failed to find variable storage for %s\n", ir->var->name);
1908 this->result = src_reg(brw_null_reg());
1909 return;
1910 }
1911
1912 this->result = src_reg(*reg);
1913
1914 /* System values get their swizzle from the dst_reg writemask */
1915 if (ir->var->data.mode == ir_var_system_value)
1916 return;
1917
1918 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1919 this->result.swizzle = swizzle_for_size(type->vector_elements);
1920 }
1921
1922
1923 int
1924 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1925 {
1926 /* Under normal circumstances array elements are stored consecutively, so
1927 * the stride is equal to the size of the array element.
1928 */
1929 return type_size(ir->type);
1930 }
1931
1932
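/**
 * Handle array dereferences. A constant index simply advances reg_offset by
 * index * array_stride; a variable index is accumulated into src.reladdr so
 * the access can be resolved with relative addressing (or moved to scratch or
 * pull constants) later on.
 */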
1933 void
1934 vec4_visitor::visit(ir_dereference_array *ir)
1935 {
1936 ir_constant *constant_index;
1937 src_reg src;
1938 int array_stride = compute_array_stride(ir);
1939
1940 constant_index = ir->array_index->constant_expression_value();
1941
1942 ir->array->accept(this);
1943 src = this->result;
1944
1945 if (constant_index) {
1946 src.reg_offset += constant_index->value.i[0] * array_stride;
1947 } else {
1948 /* Variable index array dereference. The access consumes the "vec4"
1949 * base of the array plus an index register that offsets the Mesa
1950 * register index.
1951 */
1952 ir->array_index->accept(this);
1953
1954 src_reg index_reg;
1955
1956 if (array_stride == 1) {
1957 index_reg = this->result;
1958 } else {
1959 index_reg = src_reg(this, glsl_type::int_type);
1960
1961 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1962 }
1963
1964 if (src.reladdr) {
1965 src_reg temp = src_reg(this, glsl_type::int_type);
1966
1967 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1968
1969 index_reg = temp;
1970 }
1971
1972 src.reladdr = ralloc(mem_ctx, src_reg);
1973 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1974 }
1975
1976 /* If the type is smaller than a vec4, replicate the last channel out. */
1977 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1978 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1979 else
1980 src.swizzle = BRW_SWIZZLE_NOOP;
1981 src.type = brw_type_for_base_type(ir->type);
1982
1983 this->result = src;
1984 }
1985
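/**
 * Handle record (struct) member dereferences: sum the sizes of the fields
 * preceding the named member and add that to reg_offset of the result.
 */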
1986 void
1987 vec4_visitor::visit(ir_dereference_record *ir)
1988 {
1989 unsigned int i;
1990 const glsl_type *struct_type = ir->record->type;
1991 int offset = 0;
1992
1993 ir->record->accept(this);
1994
1995 for (i = 0; i < struct_type->length; i++) {
1996 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1997 break;
1998 offset += type_size(struct_type->fields.structure[i].type);
1999 }
2000
2001 /* If the type is smaller than a vec4, replicate the last channel out. */
2002 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2003 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2004 else
2005 this->result.swizzle = BRW_SWIZZLE_NOOP;
2006 this->result.type = brw_type_for_base_type(ir->type);
2007
2008 this->result.reg_offset += offset;
2009 }
2010
2011 /**
2012 * We want to be careful in assignment setup to hit the actual storage
2013 * instead of potentially using a temporary like we might with the
2014 * ir_dereference handler.
2015 */
2016 static dst_reg
2017 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2018 {
2019 /* The LHS must be a dereference. If the LHS is a variable indexed array
2020 * access of a vector, it must be separated into a series of conditional moves
2021 * before reaching this point (see ir_vec_index_to_cond_assign).
2022 */
2023 assert(ir->as_dereference());
2024 ir_dereference_array *deref_array = ir->as_dereference_array();
2025 if (deref_array) {
2026 assert(!deref_array->array->type->is_vector());
2027 }
2028
2029 /* Use the rvalue deref handler for the most part. We'll ignore its
2030 * swizzles and express the write swizzle through the writemask, though.
2031 */
2032 ir->accept(v);
2033 return dst_reg(v->result);
2034 }
2035
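/**
 * Copy an aggregate value one vec4 (or smaller) at a time, recursing over
 * struct fields, array elements and matrix columns and advancing the dst/src
 * reg_offset as it goes. The predicate is applied to every MOV so conditional
 * assignments of aggregates work.
 */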
2036 void
2037 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2038 const struct glsl_type *type,
2039 enum brw_predicate predicate)
2040 {
2041 if (type->base_type == GLSL_TYPE_STRUCT) {
2042 for (unsigned int i = 0; i < type->length; i++) {
2043 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2044 }
2045 return;
2046 }
2047
2048 if (type->is_array()) {
2049 for (unsigned int i = 0; i < type->length; i++) {
2050 emit_block_move(dst, src, type->fields.array, predicate);
2051 }
2052 return;
2053 }
2054
2055 if (type->is_matrix()) {
2056 const struct glsl_type *vec_type;
2057
2058 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2059 type->vector_elements, 1);
2060
2061 for (int i = 0; i < type->matrix_columns; i++) {
2062 emit_block_move(dst, src, vec_type, predicate);
2063 }
2064 return;
2065 }
2066
2067 assert(type->is_scalar() || type->is_vector());
2068
2069 dst->type = brw_type_for_base_type(type);
2070 src->type = dst->type;
2071
2072 dst->writemask = (1 << type->vector_elements) - 1;
2073
2074 src->swizzle = swizzle_for_size(type->vector_elements);
2075
2076 vec4_instruction *inst = emit(MOV(*dst, *src));
2077 inst->predicate = predicate;
2078
2079 dst->reg_offset++;
2080 src->reg_offset++;
2081 }
2082
2083
2084 /* If the RHS processing resulted in an instruction generating a
2085 * temporary value, and it would be easy to rewrite the instruction to
2086 * generate its result right into the LHS instead, do so. This ends
2087 * up reliably removing instructions where it can be tricky to do so
2088 * later without real UD chain information.
2089 */
2090 bool
2091 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2092 dst_reg dst,
2093 src_reg src,
2094 vec4_instruction *pre_rhs_inst,
2095 vec4_instruction *last_rhs_inst)
2096 {
2097 /* This could be supported, but it would take more smarts. */
2098 if (ir->condition)
2099 return false;
2100
2101 if (pre_rhs_inst == last_rhs_inst)
2102 return false; /* No instructions generated to work with. */
2103
2104 /* Make sure the last instruction generated our source reg. */
2105 if (src.file != GRF ||
2106 src.file != last_rhs_inst->dst.file ||
2107 src.reg != last_rhs_inst->dst.reg ||
2108 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2109 src.reladdr ||
2110 src.abs ||
2111 src.negate ||
2112 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2113 return false;
2114
2115 /* Check that the last instruction fully initialized the channels
2116 * we want to use, in the order we want to use them. We could
2117 * potentially reswizzle the operands of many instructions so that
2118 * we could handle out of order channels, but don't yet.
2119 */
2120
2121 for (unsigned i = 0; i < 4; i++) {
2122 if (dst.writemask & (1 << i)) {
2123 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2124 return false;
2125
2126 if (BRW_GET_SWZ(src.swizzle, i) != i)
2127 return false;
2128 }
2129 }
2130
2131 /* Success! Rewrite the instruction. */
2132 last_rhs_inst->dst.file = dst.file;
2133 last_rhs_inst->dst.reg = dst.reg;
2134 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2135 last_rhs_inst->dst.reladdr = dst.reladdr;
2136 last_rhs_inst->dst.writemask &= dst.writemask;
2137
2138 return true;
2139 }
2140
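/**
 * Handle assignments. Aggregate LHS types go through emit_block_move; for
 * scalars and vectors we swizzle the RHS into the channels named by the write
 * mask, try to fold the result directly into the destination of the last RHS
 * instruction, and otherwise emit (possibly predicated) MOVs.
 */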
2141 void
2142 vec4_visitor::visit(ir_assignment *ir)
2143 {
2144 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2145 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2146
2147 if (!ir->lhs->type->is_scalar() &&
2148 !ir->lhs->type->is_vector()) {
2149 ir->rhs->accept(this);
2150 src_reg src = this->result;
2151
2152 if (ir->condition) {
2153 emit_bool_to_cond_code(ir->condition, &predicate);
2154 }
2155
2156 /* emit_block_move doesn't account for swizzles in the source register.
2157 * This should be ok, since the source register is a structure or an
2158 * array, and those can't be swizzled. But double-check to be sure.
2159 */
2160 assert(src.swizzle ==
2161 (ir->rhs->type->is_matrix()
2162 ? swizzle_for_size(ir->rhs->type->vector_elements)
2163 : BRW_SWIZZLE_NOOP));
2164
2165 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2166 return;
2167 }
2168
2169 /* Now we're down to just a scalar/vector with writemasks. */
2170 int i;
2171
2172 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2173 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2174
2175 ir->rhs->accept(this);
2176
2177 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2178
2179 src_reg src = this->result;
2180
2181 int swizzles[4];
2182 int first_enabled_chan = 0;
2183 int src_chan = 0;
2184
2185 assert(ir->lhs->type->is_vector() ||
2186 ir->lhs->type->is_scalar());
2187 dst.writemask = ir->write_mask;
2188
2189 for (int i = 0; i < 4; i++) {
2190 if (dst.writemask & (1 << i)) {
2191 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2192 break;
2193 }
2194 }
2195
2196 /* Swizzle a small RHS vector into the channels being written.
2197 *
2198 * GLSL IR treats write_mask as dictating how many channels are present
2199 * on the RHS, while our instructions need those channels to land in the
2200 * slots of the vec4 they're actually written to.
2201 */
2202 for (int i = 0; i < 4; i++) {
2203 if (dst.writemask & (1 << i))
2204 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2205 else
2206 swizzles[i] = first_enabled_chan;
2207 }
2208 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2209 swizzles[2], swizzles[3]);
2210
2211 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2212 return;
2213 }
2214
2215 if (ir->condition) {
2216 emit_bool_to_cond_code(ir->condition, &predicate);
2217 }
2218
2219 for (i = 0; i < type_size(ir->lhs->type); i++) {
2220 vec4_instruction *inst = emit(MOV(dst, src));
2221 inst->predicate = predicate;
2222
2223 dst.reg_offset++;
2224 src.reg_offset++;
2225 }
2226 }
2227
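/**
 * Emit immediate MOVs for an ir_constant, recursing over aggregate types.
 * For vectors, channels holding the same value are merged into a single
 * writemask, so vec4(0.5, 1.5, 1.5, 1.5) becomes roughly (illustrative
 * pseudo-assembly, not the exact generated code):
 *
 *    mov dst.x, 0.5F
 *    mov dst.yzw, 1.5F
 */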
2228 void
2229 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2230 {
2231 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2232 foreach_in_list(ir_constant, field_value, &ir->components) {
2233 emit_constant_values(dst, field_value);
2234 }
2235 return;
2236 }
2237
2238 if (ir->type->is_array()) {
2239 for (unsigned int i = 0; i < ir->type->length; i++) {
2240 emit_constant_values(dst, ir->array_elements[i]);
2241 }
2242 return;
2243 }
2244
2245 if (ir->type->is_matrix()) {
2246 for (int i = 0; i < ir->type->matrix_columns; i++) {
2247 float *vec = &ir->value.f[i * ir->type->vector_elements];
2248
2249 for (int j = 0; j < ir->type->vector_elements; j++) {
2250 dst->writemask = 1 << j;
2251 dst->type = BRW_REGISTER_TYPE_F;
2252
2253 emit(MOV(*dst, src_reg(vec[j])));
2254 }
2255 dst->reg_offset++;
2256 }
2257 return;
2258 }
2259
2260 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2261
2262 for (int i = 0; i < ir->type->vector_elements; i++) {
2263 if (!(remaining_writemask & (1 << i)))
2264 continue;
2265
2266 dst->writemask = 1 << i;
2267 dst->type = brw_type_for_base_type(ir->type);
2268
2269 /* Find other components that match the one we're about to
2270 * write. Emits fewer instructions for things like vec4(0.5,
2271 * 1.5, 1.5, 1.5).
2272 */
2273 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2274 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2275 if (ir->value.b[i] == ir->value.b[j])
2276 dst->writemask |= (1 << j);
2277 } else {
2278 /* u, i, and f storage all line up, so no need for a
2279 * switch case for comparing each type.
2280 */
2281 if (ir->value.u[i] == ir->value.u[j])
2282 dst->writemask |= (1 << j);
2283 }
2284 }
2285
2286 switch (ir->type->base_type) {
2287 case GLSL_TYPE_FLOAT:
2288 emit(MOV(*dst, src_reg(ir->value.f[i])));
2289 break;
2290 case GLSL_TYPE_INT:
2291 emit(MOV(*dst, src_reg(ir->value.i[i])));
2292 break;
2293 case GLSL_TYPE_UINT:
2294 emit(MOV(*dst, src_reg(ir->value.u[i])));
2295 break;
2296 case GLSL_TYPE_BOOL:
2297 emit(MOV(*dst,
2298 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2299 : 0u)));
2300 break;
2301 default:
2302 unreachable("Non-float/uint/int/bool constant");
2303 }
2304
2305 remaining_writemask &= ~dst->writemask;
2306 }
2307 dst->reg_offset++;
2308 }
2309
2310 void
2311 vec4_visitor::visit(ir_constant *ir)
2312 {
2313 dst_reg dst = dst_reg(this, ir->type);
2314 this->result = src_reg(dst);
2315
2316 emit_constant_values(&dst, ir);
2317 }
2318
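/**
 * Lower the atomic counter intrinsics (__intrinsic_atomic_read, _increment,
 * _predecrement) to untyped surface read / untyped atomic messages on the
 * counter's ABO binding table surface, computing the surface offset from the
 * declared counter offset plus any (possibly variable) array index.
 */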
2319 void
2320 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2321 {
2322 ir_dereference *deref = static_cast<ir_dereference *>(
2323 ir->actual_parameters.get_head());
2324 ir_variable *location = deref->variable_referenced();
2325 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2326 location->data.binding);
2327
2328 /* Calculate the surface offset */
2329 src_reg offset(this, glsl_type::uint_type);
2330 ir_dereference_array *deref_array = deref->as_dereference_array();
2331 if (deref_array) {
2332 deref_array->array_index->accept(this);
2333
2334 src_reg tmp(this, glsl_type::uint_type);
2335 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2336 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2337 } else {
2338 offset = location->data.atomic.offset;
2339 }
2340
2341 /* Emit the appropriate machine instruction */
2342 const char *callee = ir->callee->function_name();
2343 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2344
2345 if (!strcmp("__intrinsic_atomic_read", callee)) {
2346 emit_untyped_surface_read(surf_index, dst, offset);
2347
2348 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2349 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2350 src_reg(), src_reg());
2351
2352 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2353 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2354 src_reg(), src_reg());
2355 }
2356 }
2357
2358 void
2359 vec4_visitor::visit(ir_call *ir)
2360 {
2361 const char *callee = ir->callee->function_name();
2362
2363 if (!strcmp("__intrinsic_atomic_read", callee) ||
2364 !strcmp("__intrinsic_atomic_increment", callee) ||
2365 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2366 visit_atomic_counter_intrinsic(ir);
2367 } else {
2368 unreachable("Unsupported intrinsic.");
2369 }
2370 }
2371
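/**
 * Fetch the MCS (multisample control surface) value for a texelFetch from a
 * compressed multisample surface using the TXF_MCS message. The result is
 * later fed into the .y channel of the second parameter vec4 of the TXF_CMS
 * payload (see the ir_txf_ms handling below).
 */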
2372 src_reg
2373 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2374 {
2375 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2376 inst->base_mrf = 2;
2377 inst->mlen = 1;
2378 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2379 inst->dst.writemask = WRITEMASK_XYZW;
2380
2381 inst->src[1] = sampler;
2382
2383 /* Parameters are u, v, r, lod; lod will always be zero due to API restrictions. */
2384 int param_base = inst->base_mrf;
2385 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2386 int zero_mask = 0xf & ~coord_mask;
2387
2388 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2389 coordinate));
2390
2391 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2392 src_reg(0)));
2393
2394 emit(inst);
2395 return src_reg(inst->dst);
2396 }
2397
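/**
 * Sampler indices above 15 don't fit in the 4-bit sampler field of the
 * message descriptor, so they (and any non-immediate sampler index) need to
 * go through the message header. Only Haswell and Gen8+ support that here.
 */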
2398 static bool
2399 is_high_sampler(struct brw_context *brw, src_reg sampler)
2400 {
2401 if (brw->gen < 8 && !brw->is_haswell)
2402 return false;
2403
2404 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2405 }
2406
2407 void
2408 vec4_visitor::visit(ir_texture *ir)
2409 {
2410 uint32_t sampler =
2411 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2412
2413 ir_rvalue *nonconst_sampler_index =
2414 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2415
2416 /* Handle non-constant sampler array indexing */
2417 src_reg sampler_reg;
2418 if (nonconst_sampler_index) {
2419 /* The highest sampler which may be used by this operation is
2420 * the last element of the array. Mark it here, because the generator
2421 * doesn't have enough information to determine the bound.
2422 */
2423 uint32_t array_size = ir->sampler->as_dereference_array()
2424 ->array->type->array_size();
2425
2426 uint32_t max_used = sampler + array_size - 1;
2427 if (ir->op == ir_tg4 && brw->gen < 8) {
2428 max_used += prog_data->base.binding_table.gather_texture_start;
2429 } else {
2430 max_used += prog_data->base.binding_table.texture_start;
2431 }
2432
2433 brw_mark_surface_used(&prog_data->base, max_used);
2434
2435 /* Emit code to evaluate the actual indexing expression */
2436 nonconst_sampler_index->accept(this);
2437 dst_reg temp(this, glsl_type::uint_type);
2438 emit(ADD(temp, this->result, src_reg(sampler)))
2439 ->force_writemask_all = true;
2440 sampler_reg = src_reg(temp);
2441 } else {
2442 /* Single sampler, or constant array index; the indexing expression
2443 * is just an immediate.
2444 */
2445 sampler_reg = src_reg(sampler);
2446 }
2447
2448 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2449 * emitting anything other than setting up the constant result.
2450 */
2451 if (ir->op == ir_tg4) {
2452 ir_constant *chan = ir->lod_info.component->as_constant();
2453 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2454 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2455 dst_reg result(this, ir->type);
2456 this->result = src_reg(result);
2457 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2458 return;
2459 }
2460 }
2461
2462 /* Should be lowered by do_lower_texture_projection */
2463 assert(!ir->projector);
2464
2465 /* Should be lowered */
2466 assert(!ir->offset || !ir->offset->type->is_array());
2467
2468 /* Generate code to compute all the subexpression trees. This has to be
2469 * done before loading any values into MRFs for the sampler message since
2470 * generating these values may involve SEND messages that need the MRFs.
2471 */
2472 src_reg coordinate;
2473 if (ir->coordinate) {
2474 ir->coordinate->accept(this);
2475 coordinate = this->result;
2476 }
2477
2478 src_reg shadow_comparitor;
2479 if (ir->shadow_comparitor) {
2480 ir->shadow_comparitor->accept(this);
2481 shadow_comparitor = this->result;
2482 }
2483
2484 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2485 src_reg offset_value;
2486 if (has_nonconstant_offset) {
2487 ir->offset->accept(this);
2488 offset_value = src_reg(this->result);
2489 }
2490
2491 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2492 src_reg lod, dPdx, dPdy, sample_index, mcs;
2493 switch (ir->op) {
2494 case ir_tex:
2495 lod = src_reg(0.0f);
2496 lod_type = glsl_type::float_type;
2497 break;
2498 case ir_txf:
2499 case ir_txl:
2500 case ir_txs:
2501 ir->lod_info.lod->accept(this);
2502 lod = this->result;
2503 lod_type = ir->lod_info.lod->type;
2504 break;
2505 case ir_query_levels:
2506 lod = src_reg(0);
2507 lod_type = glsl_type::int_type;
2508 break;
2509 case ir_txf_ms:
2510 ir->lod_info.sample_index->accept(this);
2511 sample_index = this->result;
2512 sample_index_type = ir->lod_info.sample_index->type;
2513
2514 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2515 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2516 else
2517 mcs = src_reg(0u);
2518 break;
2519 case ir_txd:
2520 ir->lod_info.grad.dPdx->accept(this);
2521 dPdx = this->result;
2522
2523 ir->lod_info.grad.dPdy->accept(this);
2524 dPdy = this->result;
2525
2526 lod_type = ir->lod_info.grad.dPdx->type;
2527 break;
2528 case ir_txb:
2529 case ir_lod:
2530 case ir_tg4:
2531 break;
2532 }
2533
2534 enum opcode opcode;
2535 switch (ir->op) {
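   /* Plain texture() has no implicit derivatives in a vertex shader, so it is
    * implemented as TXL with the explicit LOD of 0.0 set up above.
    */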
2536 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2537 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2538 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2539 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2540 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2541 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2542 case ir_tg4: opcode = has_nonconstant_offset
2543 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2544 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2545 case ir_txb:
2546 unreachable("TXB is not valid for vertex shaders.");
2547 case ir_lod:
2548 unreachable("LOD is not valid for vertex shaders.");
2549 default:
2550 unreachable("Unrecognized tex op");
2551 }
2552
2553 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2554
2555 if (ir->offset != NULL && !has_nonconstant_offset) {
2556 inst->texture_offset =
2557 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2558 ir->offset->type->vector_elements);
2559 }
2560
2561 /* Stuff the channel select bits in the top of the texture offset */
2562 if (ir->op == ir_tg4)
2563 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2564
2565 /* The message header is necessary for:
2566 * - Gen4 (always)
2567 * - Texel offsets
2568 * - Gather channel selection
2569 * - Sampler indices too large to fit in a 4-bit value.
2570 */
2571 inst->header_present =
2572 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2573 is_high_sampler(brw, sampler_reg);
2574 inst->base_mrf = 2;
2575 inst->mlen = inst->header_present + 1; /* always at least one */
2576 inst->dst = dst_reg(this, ir->type);
2577 inst->dst.writemask = WRITEMASK_XYZW;
2578 inst->shadow_compare = ir->shadow_comparitor != NULL;
2579
2580 inst->src[1] = sampler_reg;
2581
2582 /* MRF for the first parameter */
2583 int param_base = inst->base_mrf + inst->header_present;
2584
2585 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2586 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2587 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2588 } else {
2589 /* Load the coordinate */
2590 /* FINISHME: gl_clamp_mask and saturate */
2591 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2592 int zero_mask = 0xf & ~coord_mask;
2593
2594 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2595 coordinate));
2596
2597 if (zero_mask != 0) {
2598 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2599 src_reg(0)));
2600 }
2601 /* Load the shadow comparitor */
2602 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2603 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2604 WRITEMASK_X),
2605 shadow_comparitor));
2606 inst->mlen++;
2607 }
2608
2609 /* Load the LOD info */
2610 if (ir->op == ir_tex || ir->op == ir_txl) {
2611 int mrf, writemask;
2612 if (brw->gen >= 5) {
2613 mrf = param_base + 1;
2614 if (ir->shadow_comparitor) {
2615 writemask = WRITEMASK_Y;
2616 /* mlen already incremented */
2617 } else {
2618 writemask = WRITEMASK_X;
2619 inst->mlen++;
2620 }
2621 } else /* brw->gen == 4 */ {
2622 mrf = param_base;
2623 writemask = WRITEMASK_W;
2624 }
2625 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2626 } else if (ir->op == ir_txf) {
2627 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2628 } else if (ir->op == ir_txf_ms) {
2629 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2630 sample_index));
2631 if (brw->gen >= 7) {
2632 /* MCS data is in the first channel of `mcs`, but we need to get it into
2633 * the .y channel of the second vec4 of params, so replicate .x across
2634 * the whole vec4 and then mask off everything except .y
2635 */
2636 mcs.swizzle = BRW_SWIZZLE_XXXX;
2637 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2638 mcs));
2639 }
2640 inst->mlen++;
2641 } else if (ir->op == ir_txd) {
2642 const glsl_type *type = lod_type;
2643
2644 if (brw->gen >= 5) {
2645 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2646 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2647 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2648 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2649 inst->mlen++;
2650
2651 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2652 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2653 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2654 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2655 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2656 inst->mlen++;
2657
2658 if (ir->shadow_comparitor) {
2659 emit(MOV(dst_reg(MRF, param_base + 2,
2660 ir->shadow_comparitor->type, WRITEMASK_Z),
2661 shadow_comparitor));
2662 }
2663 }
2664 } else /* brw->gen == 4 */ {
2665 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2666 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2667 inst->mlen += 2;
2668 }
2669 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2670 if (ir->shadow_comparitor) {
2671 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2672 shadow_comparitor));
2673 }
2674
2675 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2676 offset_value));
2677 inst->mlen++;
2678 }
2679 }
2680
2681 emit(inst);
2682
2683 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2684 * faces * layers, but the spec requires just layers.
2685 */
2686 if (ir->op == ir_txs) {
2687 glsl_type const *type = ir->sampler->type;
2688 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2689 type->sampler_array) {
2690 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2691 writemask(inst->dst, WRITEMASK_Z),
2692 src_reg(inst->dst), src_reg(6));
2693 }
2694 }
2695
2696 if (brw->gen == 6 && ir->op == ir_tg4) {
2697 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2698 }
2699
2700 swizzle_result(ir, src_reg(inst->dst), sampler);
2701 }
2702
2703 /**
2704 * Apply workarounds for Gen6 gather with UINT/SINT
2705 */
2706 void
2707 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2708 {
2709 if (!wa)
2710 return;
2711
2712 int width = (wa & WA_8BIT) ? 8 : 16;
2713 dst_reg dst_f = dst;
2714 dst_f.type = BRW_REGISTER_TYPE_F;
2715
2716 /* Convert from UNORM to UINT */
2717 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2718 emit(MOV(dst, src_reg(dst_f)));
2719
2720 if (wa & WA_SIGN) {
2721 /* Reinterpret the UINT value as a signed INT value by
2722 * shifting the sign bit into place, then shifting back
2723 * preserving sign.
2724 */
2725 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2726 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2727 }
2728 }
2729
2730 /**
2731 * Set up the gather channel based on the swizzle, for gather4.
2732 */
2733 uint32_t
2734 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2735 {
2736 ir_constant *chan = ir->lod_info.component->as_constant();
2737 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2738 switch (swiz) {
2739 case SWIZZLE_X: return 0;
2740 case SWIZZLE_Y:
2741 /* gather4 sampler is broken for green channel on RG32F --
2742 * we must ask for blue instead.
2743 */
2744 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2745 return 2;
2746 return 1;
2747 case SWIZZLE_Z: return 2;
2748 case SWIZZLE_W: return 3;
2749 default:
2750 unreachable("Not reached"); /* zero, one swizzles handled already */
2751 }
2752 }
2753
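/**
 * Apply the GL_TEXTURE_SWIZZLE_* state from the program key to the result of
 * a texture operation. Channels swizzled to ZERO or ONE are written as
 * immediates; the remaining channels are copied with a matching source
 * swizzle. For example, a swizzle of (R, R, R, ONE) would end up as roughly
 * (illustrative only):
 *
 *    mov result.xyz, orig.xxx
 *    mov result.w, 1.0F
 */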
2754 void
2755 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2756 {
2757 int s = key->tex.swizzles[sampler];
2758
2759 this->result = src_reg(this, ir->type);
2760 dst_reg swizzled_result(this->result);
2761
2762 if (ir->op == ir_query_levels) {
2763 /* # levels is in .w */
2764 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2765 emit(MOV(swizzled_result, orig_val));
2766 return;
2767 }
2768
2769 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2770 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2771 emit(MOV(swizzled_result, orig_val));
2772 return;
2773 }
2774
2775
2776 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2777 int swizzle[4] = {0};
2778
2779 for (int i = 0; i < 4; i++) {
2780 switch (GET_SWZ(s, i)) {
2781 case SWIZZLE_ZERO:
2782 zero_mask |= (1 << i);
2783 break;
2784 case SWIZZLE_ONE:
2785 one_mask |= (1 << i);
2786 break;
2787 default:
2788 copy_mask |= (1 << i);
2789 swizzle[i] = GET_SWZ(s, i);
2790 break;
2791 }
2792 }
2793
2794 if (copy_mask) {
2795 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2796 swizzled_result.writemask = copy_mask;
2797 emit(MOV(swizzled_result, orig_val));
2798 }
2799
2800 if (zero_mask) {
2801 swizzled_result.writemask = zero_mask;
2802 emit(MOV(swizzled_result, src_reg(0.0f)));
2803 }
2804
2805 if (one_mask) {
2806 swizzled_result.writemask = one_mask;
2807 emit(MOV(swizzled_result, src_reg(1.0f)));
2808 }
2809 }
2810
2811 void
2812 vec4_visitor::visit(ir_return *)
2813 {
2814 unreachable("not reached");
2815 }
2816
2817 void
2818 vec4_visitor::visit(ir_discard *)
2819 {
2820 unreachable("not reached");
2821 }
2822
2823 void
2824 vec4_visitor::visit(ir_if *ir)
2825 {
2826 /* Don't point the annotation at the if statement, because then it plus
2827 * the then and else blocks get printed.
2828 */
2829 this->base_ir = ir->condition;
2830
2831 if (brw->gen == 6) {
2832 emit_if_gen6(ir);
2833 } else {
2834 enum brw_predicate predicate;
2835 emit_bool_to_cond_code(ir->condition, &predicate);
2836 emit(IF(predicate));
2837 }
2838
2839 visit_instructions(&ir->then_instructions);
2840
2841 if (!ir->else_instructions.is_empty()) {
2842 this->base_ir = ir->condition;
2843 emit(BRW_OPCODE_ELSE);
2844
2845 visit_instructions(&ir->else_instructions);
2846 }
2847
2848 this->base_ir = ir->condition;
2849 emit(BRW_OPCODE_ENDIF);
2850 }
2851
2852 void
2853 vec4_visitor::visit(ir_emit_vertex *)
2854 {
2855 unreachable("not reached");
2856 }
2857
2858 void
2859 vec4_visitor::visit(ir_end_primitive *)
2860 {
2861 unreachable("not reached");
2862 }
2863
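/**
 * Build the MRF payload for an untyped atomic: the offset goes in m0.x,
 * followed by up to two operands, then emit SHADER_OPCODE_UNTYPED_ATOMIC with
 * the atomic opcode and surface index as sources.
 */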
2864 void
2865 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2866 dst_reg dst, src_reg offset,
2867 src_reg src0, src_reg src1)
2868 {
2869 unsigned mlen = 0;
2870
2871 /* Set the atomic operation offset. */
2872 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2873 mlen++;
2874
2875 /* Set the atomic operation arguments. */
2876 if (src0.file != BAD_FILE) {
2877 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2878 mlen++;
2879 }
2880
2881 if (src1.file != BAD_FILE) {
2882 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2883 mlen++;
2884 }
2885
2886 /* Emit the instruction. Note that this maps to the normal SIMD8
2887 * untyped atomic message on Ivy Bridge, but that's OK because
2888 * unused channels will be masked out.
2889 */
2890 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2891 src_reg(atomic_op), src_reg(surf_index));
2892 inst->base_mrf = 0;
2893 inst->mlen = mlen;
2894 }
2895
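/**
 * Emit an untyped surface read: the offset goes in m0.x and
 * SHADER_OPCODE_UNTYPED_SURFACE_READ carries the surface index as a source.
 */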
2896 void
2897 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2898 src_reg offset)
2899 {
2900 /* Set the surface read offset. */
2901 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2902
2903 /* Emit the instruction. Note that this maps to the normal SIMD8
2904 * untyped surface read message, but that's OK because unused
2905 * channels will be masked out.
2906 */
2907 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2908 dst, src_reg(surf_index));
2909 inst->base_mrf = 0;
2910 inst->mlen = 1;
2911 }
2912
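/**
 * Compute the normalized device coordinates (x/w, y/w, z/w, 1/w) that the
 * pre-Gen6 fixed-function pipeline expects: RCP of pos.w into ndc.w, then
 * multiply pos.xyz by that reciprocal.
 */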
2913 void
2914 vec4_visitor::emit_ndc_computation()
2915 {
2916 /* Get the position */
2917 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2918
2919 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2920 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2921 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2922
2923 current_annotation = "NDC";
2924 dst_reg ndc_w = ndc;
2925 ndc_w.writemask = WRITEMASK_W;
2926 src_reg pos_w = pos;
2927 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2928 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2929
2930 dst_reg ndc_xyz = ndc;
2931 ndc_xyz.writemask = WRITEMASK_XYZ;
2932
2933 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2934 }
2935
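/**
 * Fill the PSIZ VUE slot. On pre-Gen6 this packs point size, user clip flags
 * and the negative-rhw workaround bit into the header dword; on Gen6+ it
 * writes point size, layer and viewport index into the .w, .y and .z channels
 * when the corresponding varyings are valid.
 */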
2936 void
2937 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2938 {
2939 if (brw->gen < 6 &&
2940 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2941 key->userclip_active || brw->has_negative_rhw_bug)) {
2942 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2943 dst_reg header1_w = header1;
2944 header1_w.writemask = WRITEMASK_W;
2945
2946 emit(MOV(header1, 0u));
2947
2948 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2949 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2950
2951 current_annotation = "Point size";
2952 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2953 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2954 }
2955
2956 if (key->userclip_active) {
2957 current_annotation = "Clipping flags";
2958 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2959 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2960
2961 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2962 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2963 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2964
2965 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2966 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2967 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2968 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2969 }
2970
2971 /* i965 clipping workaround:
2972 * 1) Test for -ve rhw
2973 * 2) If set,
2974 * set ndc = (0,0,0,0)
2975 * set ucp[6] = 1
2976 *
2977 * Later, clipping will detect ucp[6] and ensure the primitive is
2978 * clipped against all fixed planes.
2979 */
2980 if (brw->has_negative_rhw_bug) {
2981 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2982 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2983 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2984 vec4_instruction *inst;
2985 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2986 inst->predicate = BRW_PREDICATE_NORMAL;
2987 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2988 inst->predicate = BRW_PREDICATE_NORMAL;
2989 }
2990
2991 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2992 } else if (brw->gen < 6) {
2993 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2994 } else {
2995 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2996 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2997 dst_reg reg_w = reg;
2998 reg_w.writemask = WRITEMASK_W;
2999 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3000 }
3001 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3002 dst_reg reg_y = reg;
3003 reg_y.writemask = WRITEMASK_Y;
3004 reg_y.type = BRW_REGISTER_TYPE_D;
3005 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3006 }
3007 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3008 dst_reg reg_z = reg;
3009 reg_z.writemask = WRITEMASK_Z;
3010 reg_z.type = BRW_REGISTER_TYPE_D;
3011 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3012 }
3013 }
3014 }
3015
3016 void
3017 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3018 {
3019 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3020 *
3021 * "If a linked set of shaders forming the vertex stage contains no
3022 * static write to gl_ClipVertex or gl_ClipDistance, but the
3023 * application has requested clipping against user clip planes through
3024 * the API, then the coordinate written to gl_Position is used for
3025 * comparison against the user clip planes."
3026 *
3027 * This function is only called if the shader didn't write to
3028 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3029 * if the user wrote to it; otherwise we use gl_Position.
3030 */
3031 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3032 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3033 clip_vertex = VARYING_SLOT_POS;
3034 }
3035
3036 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3037 ++i) {
3038 reg.writemask = 1 << i;
3039 emit(DP4(reg,
3040 src_reg(output_reg[clip_vertex]),
3041 src_reg(this->userplane[i + offset])));
3042 }
3043 }
3044
3045 void
3046 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3047 {
3048 assert (varying < VARYING_SLOT_MAX);
3049 reg.type = output_reg[varying].type;
3050 current_annotation = output_reg_annotation[varying];
3051 /* Copy the register, saturating if necessary */
3052 vec4_instruction *inst = emit(MOV(reg,
3053 src_reg(output_reg[varying])));
3054 if ((varying == VARYING_SLOT_COL0 ||
3055 varying == VARYING_SLOT_COL1 ||
3056 varying == VARYING_SLOT_BFC0 ||
3057 varying == VARYING_SLOT_BFC1) &&
3058 key->clamp_vertex_color) {
3059 inst->saturate = true;
3060 }
3061 }
3062
3063 void
3064 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3065 {
3066 reg.type = BRW_REGISTER_TYPE_F;
3067
3068 switch (varying) {
3069 case VARYING_SLOT_PSIZ:
3070 {
3071 /* PSIZ is always in slot 0, and is coupled with other flags. */
3072 current_annotation = "indices, point width, clip flags";
3073 emit_psiz_and_flags(reg);
3074 break;
3075 }
3076 case BRW_VARYING_SLOT_NDC:
3077 current_annotation = "NDC";
3078 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3079 break;
3080 case VARYING_SLOT_POS:
3081 current_annotation = "gl_Position";
3082 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3083 break;
3084 case VARYING_SLOT_EDGE:
3085 /* This is present when doing unfilled polygons. We're supposed to copy
3086 * the edge flag from the user-provided vertex array
3087 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3088 * of that attribute (starts as 1.0f). This is then used in clipping to
3089 * determine which edges should be drawn as wireframe.
3090 */
3091 current_annotation = "edge flag";
3092 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3093 glsl_type::float_type, WRITEMASK_XYZW))));
3094 break;
3095 case BRW_VARYING_SLOT_PAD:
3096 /* No need to write to this slot */
3097 break;
3098 default:
3099 emit_generic_urb_slot(reg, varying);
3100 break;
3101 }
3102 }
3103
3104 static int
3105 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3106 {
3107 if (brw->gen >= 6) {
3108 /* URB data written (does not include the message header reg) must
3109 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3110 * section 5.4.3.2.2: URB_INTERLEAVED.
3111 *
3112 * URB entries are allocated on a multiple of 1024 bits, so an
3113 * extra 128 bits written here to make the end align to 256 is
3114 * no problem.
3115 */
3116 if ((mlen % 2) != 1)
3117 mlen++;
3118 }
3119
3120 return mlen;
3121 }
3122
3123
3124 /**
3125 * Generates the VUE payload plus the necessary URB write instructions to
3126 * output it.
3127 *
3128 * The VUE layout is documented in Volume 2a.
3129 */
3130 void
3131 vec4_visitor::emit_vertex()
3132 {
3133 /* MRF 0 is reserved for the debugger, so start with message header
3134 * in MRF 1.
3135 */
3136 int base_mrf = 1;
3137 int mrf = base_mrf;
3138 /* In the process of generating our URB write message contents, we
3139 * may need to unspill a register or load from an array. Those
3140 * reads would use MRFs 14-15.
3141 */
3142 int max_usable_mrf = 13;
3143
3144 /* The following assertion verifies that max_usable_mrf causes an
3145 * even-numbered amount of URB write data, which will meet gen6's
3146 * requirements for length alignment.
3147 */
3148 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3149
3150 /* First mrf is the g0-based message header containing URB handles and
3151 * such.
3152 */
3153 emit_urb_write_header(mrf++);
3154
3155 if (brw->gen < 6) {
3156 emit_ndc_computation();
3157 }
3158
3159 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3160 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3161 current_annotation = "user clip distances";
3162
3163 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3164 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3165
3166 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3167 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3168 }
3169
3170 /* We may need to split this up into several URB writes, so do them in a
3171 * loop.
3172 */
3173 int slot = 0;
3174 bool complete = false;
3175 do {
3176 /* URB offset is in URB row increments, and each of our MRFs is half of
3177 * one of those, since we're doing interleaved writes.
3178 */
3179 int offset = slot / 2;
3180
3181 mrf = base_mrf + 1;
3182 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3183 emit_urb_slot(dst_reg(MRF, mrf++),
3184 prog_data->vue_map.slot_to_varying[slot]);
3185
3186 /* If this was max_usable_mrf, we can't fit anything more into this
3187 * URB WRITE.
3188 */
3189 if (mrf > max_usable_mrf) {
3190 slot++;
3191 break;
3192 }
3193 }
3194
3195 complete = slot >= prog_data->vue_map.num_slots;
3196 current_annotation = "URB write";
3197 vec4_instruction *inst = emit_urb_write_opcode(complete);
3198 inst->base_mrf = base_mrf;
3199 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3200 inst->offset += offset;
3201 } while(!complete);
3202 }
3203
3204
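/**
 * Compute the message header offset for a scratch read/write. Scratch data is
 * stored interleaved like vertex data, so vec4 offsets are scaled by 2, and
 * pre-Gen6 message headers want byte offsets (hence the extra factor of 16).
 * A reladdr source turns the constant into an ADD/MUL sequence emitted before
 * the instruction.
 */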
3205 src_reg
3206 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3207 src_reg *reladdr, int reg_offset)
3208 {
3209 /* Because we store the values to scratch interleaved like our
3210 * vertex data, we need to scale the vec4 index by 2.
3211 */
3212 int message_header_scale = 2;
3213
3214 /* Pre-gen6, the message header uses byte offsets instead of vec4
3215 * (16-byte) offset units.
3216 */
3217 if (brw->gen < 6)
3218 message_header_scale *= 16;
3219
3220 if (reladdr) {
3221 src_reg index = src_reg(this, glsl_type::int_type);
3222
3223 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3224 src_reg(reg_offset)));
3225 emit_before(block, inst, MUL(dst_reg(index), index,
3226 src_reg(message_header_scale)));
3227
3228 return index;
3229 } else {
3230 return src_reg(reg_offset * message_header_scale);
3231 }
3232 }
3233
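/**
 * Compute the offset source for a pull constant load. Relative addressing
 * needs an ADD before the instruction (and a *16 on pre-Gen6, where the
 * header takes byte offsets); Gen8+ moves even a constant offset into a GRF
 * so the message can be sent from GRF; otherwise an immediate suffices.
 */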
3234 src_reg
3235 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3236 src_reg *reladdr, int reg_offset)
3237 {
3238 if (reladdr) {
3239 src_reg index = src_reg(this, glsl_type::int_type);
3240
3241 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3242 src_reg(reg_offset)));
3243
3244 /* Pre-gen6, the message header uses byte offsets instead of vec4
3245 * (16-byte) offset units.
3246 */
3247 if (brw->gen < 6) {
3248 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3249 }
3250
3251 return index;
3252 } else if (brw->gen >= 8) {
3253 /* Store the offset in a GRF so we can send-from-GRF. */
3254 src_reg offset = src_reg(this, glsl_type::int_type);
3255 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3256 return offset;
3257 } else {
3258 int message_header_scale = brw->gen < 6 ? 16 : 1;
3259 return src_reg(reg_offset * message_header_scale);
3260 }
3261 }
3262
3263 /**
3264 * Emits an instruction before @inst to load the value named by @orig_src
3265 * from scratch space at @base_offset to @temp.
3266 *
3267 * @base_offset is measured in 32-byte units (the size of a register).
3268 */
3269 void
3270 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3271 dst_reg temp, src_reg orig_src,
3272 int base_offset)
3273 {
3274 int reg_offset = base_offset + orig_src.reg_offset;
3275 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3276 reg_offset);
3277
3278 emit_before(block, inst, SCRATCH_READ(temp, index));
3279 }
3280
3281 /**
3282 * Emits an instruction after @inst to store the value to be written
3283 * to @orig_dst to scratch space at @base_offset, from @temp.
3284 *
3285 * @base_offset is measured in 32-byte units (the size of a register).
3286 */
3287 void
3288 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3289 int base_offset)
3290 {
3291 int reg_offset = base_offset + inst->dst.reg_offset;
3292 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3293 reg_offset);
3294
3295 /* Create a temporary register to store *inst's result in.
3296 *
3297 * We have to be careful in MOVing from our temporary result register in
3298 * the scratch write. If we swizzle from channels of the temporary that
3299 * weren't initialized, it will confuse live interval analysis, which will
3300 * make spilling fail to make progress.
3301 */
3302 src_reg temp = src_reg(this, glsl_type::vec4_type);
3303 temp.type = inst->dst.type;
3304 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3305 int swizzles[4];
3306 for (int i = 0; i < 4; i++)
3307 if (inst->dst.writemask & (1 << i))
3308 swizzles[i] = i;
3309 else
3310 swizzles[i] = first_writemask_chan;
3311 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3312 swizzles[2], swizzles[3]);
3313
3314 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3315 inst->dst.writemask));
3316 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3317 write->predicate = inst->predicate;
3318 write->ir = inst->ir;
3319 write->annotation = inst->annotation;
3320 inst->insert_after(block, write);
3321
3322 inst->dst.file = temp.file;
3323 inst->dst.reg = temp.reg;
3324 inst->dst.reg_offset = temp.reg_offset;
3325 inst->dst.reladdr = NULL;
3326 }
3327
3328 /**
3329 * We can't generally support array access in GRF space, because a
3330 * single instruction's destination can only span 2 contiguous
3331 * registers. So, we send all GRF arrays that get variable index
3332 * access to scratch space.
3333 */
3334 void
3335 vec4_visitor::move_grf_array_access_to_scratch()
3336 {
3337 int scratch_loc[this->virtual_grf_count];
3338 memset(scratch_loc, -1, sizeof(scratch_loc));
3339
3340 /* First, calculate the set of virtual GRFs that need to be punted
3341 * to scratch due to having any array access on them, and where in
3342 * scratch.
3343 */
3344 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3345 if (inst->dst.file == GRF && inst->dst.reladdr &&
3346 scratch_loc[inst->dst.reg] == -1) {
3347 scratch_loc[inst->dst.reg] = c->last_scratch;
3348 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3349 }
3350
3351 for (int i = 0 ; i < 3; i++) {
3352 src_reg *src = &inst->src[i];
3353
3354 if (src->file == GRF && src->reladdr &&
3355 scratch_loc[src->reg] == -1) {
3356 scratch_loc[src->reg] = c->last_scratch;
3357 c->last_scratch += this->virtual_grf_sizes[src->reg];
3358 }
3359 }
3360 }
3361
3362 /* Now, for anything that will be accessed through scratch, rewrite
3363 * it to load/store. Note that this is a _safe list walk, because
3364 * we may generate a new scratch_write instruction after the one
3365 * we're processing.
3366 */
3367 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3368 /* Set up the annotation tracking for new generated instructions. */
3369 base_ir = inst->ir;
3370 current_annotation = inst->annotation;
3371
3372 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3373 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3374 }
3375
3376 for (int i = 0 ; i < 3; i++) {
3377 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3378 continue;
3379
3380 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3381
3382 emit_scratch_read(block, inst, temp, inst->src[i],
3383 scratch_loc[inst->src[i].reg]);
3384
3385 inst->src[i].file = temp.file;
3386 inst->src[i].reg = temp.reg;
3387 inst->src[i].reg_offset = temp.reg_offset;
3388 inst->src[i].reladdr = NULL;
3389 }
3390 }
3391 }
3392
3393 /**
3394 * Emits an instruction before @inst to load the value named by @orig_src
3395 * from the pull constant buffer (surface) at @base_offset to @temp.
3396 */
3397 void
3398 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3399 dst_reg temp, src_reg orig_src,
3400 int base_offset)
3401 {
3402 int reg_offset = base_offset + orig_src.reg_offset;
3403 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3404 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3405 reg_offset);
3406 vec4_instruction *load;
3407
3408 if (brw->gen >= 7) {
3409 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3410 grf_offset.type = offset.type;
3411 emit_before(block, inst, MOV(grf_offset, offset));
3412
3413 load = new(mem_ctx) vec4_instruction(this,
3414 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3415 temp, index, src_reg(grf_offset));
3416 } else {
3417 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3418 temp, index, offset);
3419 load->base_mrf = 14;
3420 load->mlen = 1;
3421 }
3422 emit_before(block, inst, load);
3423 }
3424
3425 /**
3426 * Implements array access of uniforms by inserting a
3427 * PULL_CONSTANT_LOAD instruction.
3428 *
3429 * Unlike temporary GRF array access (where we don't support it due to
3430 * the difficulty of doing relative addressing on instruction
3431 * destinations), we could potentially do array access of uniforms
3432 * that were loaded in GRF space as push constants. In real-world
3433 * usage we've seen, though, the arrays being used are always larger
3434 * than we could load as push constants, so just always move all
3435 * uniform array access out to a pull constant buffer.
3436 */
3437 void
3438 vec4_visitor::move_uniform_array_access_to_pull_constants()
3439 {
3440 int pull_constant_loc[this->uniforms];
3441 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3442
3443 /* Walk through and find array access of uniforms. Put a copy of that
3444 * uniform in the pull constant buffer.
3445 *
3446 * Note that we don't move constant-indexed accesses to arrays. No
3447 * testing has been done of the performance impact of this choice.
3448 */
3449 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3450 for (int i = 0 ; i < 3; i++) {
3451 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3452 continue;
3453
3454 int uniform = inst->src[i].reg;
3455
3456 /* If this array isn't already present in the pull constant buffer,
3457 * add it.
3458 */
3459 if (pull_constant_loc[uniform] == -1) {
3460 const gl_constant_value **values =
3461 &stage_prog_data->param[uniform * 4];
3462
3463 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3464
3465 assert(uniform < uniform_array_size);
3466 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3467 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3468 = values[j];
3469 }
3470 }
3471
3472 /* Set up the annotation tracking for new generated instructions. */
3473 base_ir = inst->ir;
3474 current_annotation = inst->annotation;
3475
3476 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3477
3478 emit_pull_constant_load(block, inst, temp, inst->src[i],
3479 pull_constant_loc[uniform]);
3480
3481 inst->src[i].file = temp.file;
3482 inst->src[i].reg = temp.reg;
3483 inst->src[i].reg_offset = temp.reg_offset;
3484 inst->src[i].reladdr = NULL;
3485 }
3486 }
3487
3488 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3489 * no need to track them as larger-than-vec4 objects. This will be
3490 * relied on in cutting out unused uniform vectors from push
3491 * constants.
3492 */
3493 split_uniform_registers();
3494 }
3495
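/**
 * The negate modifier can't be used directly on an unsigned (UD) source where
 * these values end up being consumed, so apply it with a separate MOV into a
 * uvec4 temporary and substitute the temporary.
 */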
3496 void
3497 vec4_visitor::resolve_ud_negate(src_reg *reg)
3498 {
3499 if (reg->type != BRW_REGISTER_TYPE_UD ||
3500 !reg->negate)
3501 return;
3502
3503 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3504 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3505 *reg = temp;
3506 }
3507
3508 vec4_visitor::vec4_visitor(struct brw_context *brw,
3509 struct brw_vec4_compile *c,
3510 struct gl_program *prog,
3511 const struct brw_vec4_prog_key *key,
3512 struct brw_vec4_prog_data *prog_data,
3513 struct gl_shader_program *shader_prog,
3514 gl_shader_stage stage,
3515 void *mem_ctx,
3516 bool debug_flag,
3517 bool no_spills,
3518 shader_time_shader_type st_base,
3519 shader_time_shader_type st_written,
3520 shader_time_shader_type st_reset)
3521 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3522 c(c),
3523 key(key),
3524 prog_data(prog_data),
3525 sanity_param_count(0),
3526 fail_msg(NULL),
3527 first_non_payload_grf(0),
3528 need_all_constants_in_pull_buffer(false),
3529 debug_flag(debug_flag),
3530 no_spills(no_spills),
3531 st_base(st_base),
3532 st_written(st_written),
3533 st_reset(st_reset)
3534 {
3535 this->mem_ctx = mem_ctx;
3536 this->failed = false;
3537
3538 this->base_ir = NULL;
3539 this->current_annotation = NULL;
3540 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3541
3542 this->variable_ht = hash_table_ctor(0,
3543 hash_table_pointer_hash,
3544 hash_table_pointer_compare);
3545
3546 this->virtual_grf_start = NULL;
3547 this->virtual_grf_end = NULL;
3548 this->virtual_grf_sizes = NULL;
3549 this->virtual_grf_count = 0;
3550 this->virtual_grf_reg_map = NULL;
3551 this->virtual_grf_reg_count = 0;
3552 this->virtual_grf_array_size = 0;
3553 this->live_intervals_valid = false;
3554
3555 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3556
3557 this->uniforms = 0;
3558
3559 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3560 * at least one. See setup_uniforms() in brw_vec4.cpp.
3561 */
3562 this->uniform_array_size = 1;
3563 if (prog_data) {
3564 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3565 }
3566
3567 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3568 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3569 }
3570
3571 vec4_visitor::~vec4_visitor()
3572 {
3573 hash_table_dtor(this->variable_ht);
3574 }
3575
3576
3577 void
3578 vec4_visitor::fail(const char *format, ...)
3579 {
3580 va_list va;
3581 char *msg;
3582
3583 if (failed)
3584 return;
3585
3586 failed = true;
3587
3588 va_start(va, format);
3589 msg = ralloc_vasprintf(mem_ctx, format, va);
3590 va_end(va);
3591 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3592
3593 this->fail_msg = msg;
3594
3595 if (debug_flag) {
3596 fprintf(stderr, "%s", msg);
3597 }
3598 }
3599
3600 } /* namespace brw */