src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 extern "C" {
  32
  33 #include <sys/types.h>
  34
  35 #include "main/hash_table.h"
  36 #include "main/macros.h"
  37 #include "main/shaderobj.h"
  38 #include "main/fbobject.h"
  39 #include "program/prog_parameter.h"
  40 #include "program/prog_print.h"
  41 #include "program/register_allocate.h"
  42 #include "program/sampler.h"
  43 #include "program/hash_table.h"
  44 #include "brw_context.h"
  45 #include "brw_eu.h"
  46 #include "brw_wm.h"
  47 }
  48 #include "brw_fs.h"
  49 #include "brw_dead_control_flow.h"
  50 #include "main/uniforms.h"
  51 #include "brw_fs_live_variables.h"
  52 #include "glsl/glsl_types.h"
  53
  54 void
  55 fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources)
  56 {
  57    memset(this, 0, sizeof(*this));
  58
  59    this->opcode = opcode;
  60    this->dst = dst;
  61    this->src = src;
  62    this->sources = sources;
  63
  64    this->conditional_mod = BRW_CONDITIONAL_NONE;
  65
  66    /* This will be the case for almost all instructions. */
  67    this->regs_written = 1;
  68
  69    this->writes_accumulator = false;
  70 }
  71
  72 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
  73 {
  74    fs_reg *src = ralloc_array(this, fs_reg, 3);
  75    init(opcode, dst, src, 0);
  76 }
  77
  78 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
  79 {
  80    fs_reg *src = ralloc_array(this, fs_reg, 3);
  81    src[0] = src0;
  82    init(opcode, dst, src, 1);
  83 }
  84
  85 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
  86                  const fs_reg &src1)
  87 {
  88    fs_reg *src = ralloc_array(this, fs_reg, 3);
  89    src[0] = src0;
  90    src[1] = src1;
  91    init(opcode, dst, src, 2);
  92 }
  93
  94 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
  95                  const fs_reg &src1, const fs_reg &src2)
  96 {
  97    fs_reg *src = ralloc_array(this, fs_reg, 3);
  98    src[0] = src0;
  99    src[1] = src1;
 100    src[2] = src2;
 101    init(opcode, dst, src, 3);
 102 }
 103
 104 fs_inst::fs_inst(const fs_inst &that)
 105 {
 106    memcpy(this, &that, sizeof(that));
 107
 108    this->src = ralloc_array(this, fs_reg, that.sources);
 109
 110    for (int i = 0; i < that.sources; i++)
 111       this->src[i] = that.src[i];
 112 }
 113
 114 void
 115 fs_inst::resize_sources(uint8_t num_sources)
 116 {
 117    if (this->sources != num_sources) {
 118       this->src = reralloc(this, this->src, fs_reg, num_sources);
 119       this->sources = num_sources;
 120    }
 121 }
 122
 123 #define ALU1(op)                                                        \
 124    fs_inst *                                                            \
 125    fs_visitor::op(fs_reg dst, fs_reg src0)                              \
 126    {                                                                    \
 127       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 128    }
 129
 130 #define ALU2(op)                                                        \
 131    fs_inst *                                                            \
 132    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
 133    {                                                                    \
 134       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 135    }
 136
 137 #define ALU2_ACC(op)                                                    \
 138    fs_inst *                                                            \
 139    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
 140    {                                                                    \
 141       fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
 142       inst->writes_accumulator = true;                                  \
 143       return inst;                                                      \
 144    }
 145
 146 #define ALU3(op)                                                        \
 147    fs_inst *                                                            \
 148    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
 149    {                                                                    \
 150       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
 151    }
 152
 153 ALU1(NOT)
 154 ALU1(MOV)
 155 ALU1(FRC)
 156 ALU1(RNDD)
 157 ALU1(RNDE)
 158 ALU1(RNDZ)
 159 ALU2(ADD)
 160 ALU2(MUL)
 161 ALU2_ACC(MACH)
 162 ALU2(AND)
 163 ALU2(OR)
 164 ALU2(XOR)
 165 ALU2(SHL)
 166 ALU2(SHR)
 167 ALU2(ASR)
 168 ALU3(LRP)
 169 ALU1(BFREV)
 170 ALU3(BFE)
 171 ALU2(BFI1)
 172 ALU3(BFI2)
 173 ALU1(FBH)
 174 ALU1(FBL)
 175 ALU1(CBIT)
 176 ALU3(MAD)
 177 ALU2_ACC(ADDC)
 178 ALU2_ACC(SUBB)
 179 ALU2(SEL)
 180 ALU2(MAC)
 181
 182 /** Gen4 predicated IF. */
 183 fs_inst *
 184 fs_visitor::IF(uint32_t predicate)
 185 {
 186    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
 187    inst->predicate = predicate;
 188    return inst;
 189 }
 190
 191 /** Gen6 IF with embedded comparison. */
 192 fs_inst *
 193 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
 194 {
 195    assert(brw->gen == 6);
 196    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
 197                                         reg_null_d, src0, src1);
 198    inst->conditional_mod = condition;
 199    return inst;
 200 }
 201
 202 /**
 203  * CMP: Sets the low bit of the destination channels with the result
 204  * of the comparison, while the upper bits are undefined, and updates
 205  * the flag register with the packed 16 bits of the result.
 206  */
 207 fs_inst *
 208 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
 209 {
 210    fs_inst *inst;
 211
 212    /* Take the instruction:
 213     *
 214     * CMP null<d> src0<f> src1<f>
 215     *
 216     * Original gen4 does type conversion to the destination type before
 217     * comparison, producing garbage results for floating point comparisons.
 218     * gen5 does the comparison on the execution type (resolved source types),
 219     * so dst type doesn't matter.  gen6 does comparison and then uses the
 220     * result as if it was the dst type with no conversion, which happens to
 221     * mostly work out for float-interpreted-as-int since our comparisons are
 222     * for >0, =0, <0.
 223     */
 224    if (brw->gen == 4) {
 225       dst.type = src0.type;
 226       if (dst.file == HW_REG)
 227          dst.fixed_hw_reg.type = dst.type;
 228    }
 229
 230    resolve_ud_negate(&src0);
 231    resolve_ud_negate(&src1);
 232
 233    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 234    inst->conditional_mod = condition;
 235
 236    return inst;
 237 }
 238
 239 exec_list
 240 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
 241                                        const fs_reg &surf_index,
 242                                        const fs_reg &varying_offset,
 243                                        uint32_t const_offset)
 244 {
 245    exec_list instructions;
 246    fs_inst *inst;
 247
 248    /* We have our constant surface use a pitch of 4 bytes, so our index can
 249     * be any component of a vector, and then we load 4 contiguous
 250     * components starting from that.
 251     *
 252     * We break down the const_offset to a portion added to the variable
 253     * offset and a portion done using reg_offset, which means that if you
 254     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 255     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 256     * CSE can later notice that those loads are all the same and eliminate
 257     * the redundant ones.
 258     */
 259    fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
 260    instructions.push_tail(ADD(vec4_offset,
 261                               varying_offset, const_offset & ~3));
 262
 263    int scale = 1;
 264    if (brw->gen == 4 && dispatch_width == 8) {
 265       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 266        * u, v, r) as parameters, or we can just use the SIMD16 message
 267        * consisting of (header, u).  We choose the second, at the cost of a
 268        * longer return length.
 269        */
 270       scale = 2;
 271    }
 272
 273    enum opcode op;
 274    if (brw->gen >= 7)
 275       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 276    else
 277       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 278    fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
 279    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
 280    inst->regs_written = 4 * scale;
 281    instructions.push_tail(inst);
 282
 283    if (brw->gen < 7) {
 284       inst->base_mrf = 13;
 285       inst->header_present = true;
 286       if (brw->gen == 4)
 287          inst->mlen = 3;
 288       else
 289          inst->mlen = 1 + dispatch_width / 8;
 290    }
 291
 292    vec4_result.reg_offset += (const_offset & 3) * scale;
 293    instructions.push_tail(MOV(dst, vec4_result));
 294
 295    return instructions;
 296 }
 297
 298 /**
 299  * A helper for MOV generation for fixing up broken hardware SEND dependency
 300  * handling.
 301  */
 302 fs_inst *
 303 fs_visitor::DEP_RESOLVE_MOV(int grf)
 304 {
 305    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 306
 307    inst->ir = NULL;
 308    inst->annotation = "send dependency resolve";
 309
 310    /* The caller always wants uncompressed to emit the minimal extra
 311     * dependencies, and to avoid having to deal with aligning its regs to 2.
 312     */
 313    inst->force_uncompressed = true;
 314
 315    return inst;
 316 }
 317
 318 bool
 319 fs_inst::equals(fs_inst *inst) const
 320 {
 321    return (opcode == inst->opcode &&
 322            dst.equals(inst->dst) &&
 323            src[0].equals(inst->src[0]) &&
 324            src[1].equals(inst->src[1]) &&
 325            src[2].equals(inst->src[2]) &&
 326            saturate == inst->saturate &&
 327            predicate == inst->predicate &&
 328            conditional_mod == inst->conditional_mod &&
 329            mlen == inst->mlen &&
 330            base_mrf == inst->base_mrf &&
 331            sampler == inst->sampler &&
 332            target == inst->target &&
 333            eot == inst->eot &&
 334            header_present == inst->header_present &&
 335            shadow_compare == inst->shadow_compare &&
 336            offset == inst->offset);
 337 }
 338
 339 bool
 340 fs_inst::overwrites_reg(const fs_reg &reg) const
 341 {
 342    return (reg.file == dst.file &&
 343            reg.reg == dst.reg &&
 344            reg.reg_offset >= dst.reg_offset  &&
 345            reg.reg_offset < dst.reg_offset + regs_written);
 346 }
 347
 348 bool
 349 fs_inst::is_send_from_grf() const
 350 {
 351    return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
 352            opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
 353            (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
 354             src[1].file == GRF) ||
 355            (is_tex() && src[0].file == GRF));
 356 }
 357
 358 bool
 359 fs_visitor::can_do_source_mods(fs_inst *inst)
 360 {
 361    if (brw->gen == 6 && inst->is_math())
 362       return false;
 363
 364    if (inst->is_send_from_grf())
 365       return false;
 366
 367    if (!inst->can_do_source_mods())
 368       return false;
 369
 370    return true;
 371 }
 372
 373 void
 374 fs_reg::init()
 375 {
 376    memset(this, 0, sizeof(*this));
 377    stride = 1;
 378 }
 379
 380 /** Generic unset register constructor. */
 381 fs_reg::fs_reg()
 382 {
 383    init();
 384    this->file = BAD_FILE;
 385 }
 386
 387 /** Immediate value constructor. */
 388 fs_reg::fs_reg(float f)
 389 {
 390    init();
 391    this->file = IMM;
 392    this->type = BRW_REGISTER_TYPE_F;
 393    this->imm.f = f;
 394 }
 395
 396 /** Immediate value constructor. */
 397 fs_reg::fs_reg(int32_t i)
 398 {
 399    init();
 400    this->file = IMM;
 401    this->type = BRW_REGISTER_TYPE_D;
 402    this->imm.i = i;
 403 }
 404
 405 /** Immediate value constructor. */
 406 fs_reg::fs_reg(uint32_t u)
 407 {
 408    init();
 409    this->file = IMM;
 410    this->type = BRW_REGISTER_TYPE_UD;
 411    this->imm.u = u;
 412 }
 413
 414 /** Fixed brw_reg. */
 415 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 416 {
 417    init();
 418    this->file = HW_REG;
 419    this->fixed_hw_reg = fixed_hw_reg;
 420    this->type = fixed_hw_reg.type;
 421 }
 422
 423 bool
 424 fs_reg::equals(const fs_reg &r) const
 425 {
 426    return (file == r.file &&
 427            reg == r.reg &&
 428            reg_offset == r.reg_offset &&
 429            subreg_offset == r.subreg_offset &&
 430            type == r.type &&
 431            negate == r.negate &&
 432            abs == r.abs &&
 433            !reladdr && !r.reladdr &&
 434            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
 435                   sizeof(fixed_hw_reg)) == 0 &&
 436            stride == r.stride &&
 437            imm.u == r.imm.u);
 438 }
 439
 440 fs_reg &
 441 fs_reg::apply_stride(unsigned stride)
 442 {
 443    assert((this->stride * stride) <= 4 &&
 444           (is_power_of_two(stride) || stride == 0) &&
 445           file != HW_REG && file != IMM);
 446    this->stride *= stride;
 447    return *this;
 448 }
 449
 450 fs_reg &
 451 fs_reg::set_smear(unsigned subreg)
 452 {
 453    assert(file != HW_REG && file != IMM);
 454    subreg_offset = subreg * type_sz(type);
 455    stride = 0;
 456    return *this;
 457 }
 458
 459 bool
 460 fs_reg::is_contiguous() const
 461 {
 462    return stride == 1;
 463 }
 464
 465 bool
 466 fs_reg::is_zero() const
 467 {
 468    if (file != IMM)
 469       return false;
 470
 471    return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
 472 }
 473
 474 bool
 475 fs_reg::is_one() const
 476 {
 477    if (file != IMM)
 478       return false;
 479
 480    return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
 481 }
 482
 483 bool
 484 fs_reg::is_null() const
 485 {
 486    return file == HW_REG &&
 487           fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
 488           fixed_hw_reg.nr == BRW_ARF_NULL;
 489 }
 490
 491 bool
 492 fs_reg::is_valid_3src() const
 493 {
 494    return file == GRF || file == UNIFORM;
 495 }
 496
 497 bool
 498 fs_reg::is_accumulator() const
 499 {
 500    return file == HW_REG &&
 501           fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
 502           fixed_hw_reg.nr == BRW_ARF_ACCUMULATOR;
 503 }
 504
 505 int
 506 fs_visitor::type_size(const struct glsl_type *type)
 507 {
 508    unsigned int size, i;
 509
 510    switch (type->base_type) {
 511    case GLSL_TYPE_UINT:
 512    case GLSL_TYPE_INT:
 513    case GLSL_TYPE_FLOAT:
 514    case GLSL_TYPE_BOOL:
 515       return type->components();
 516    case GLSL_TYPE_ARRAY:
 517       return type_size(type->fields.array) * type->length;
 518    case GLSL_TYPE_STRUCT:
 519       size = 0;
 520       for (i = 0; i < type->length; i++) {
 521          size += type_size(type->fields.structure[i].type);
 522       }
 523       return size;
 524    case GLSL_TYPE_SAMPLER:
 525       /* Samplers take up no register space, since they're baked in at
 526        * link time.
 527        */
 528       return 0;
 529    case GLSL_TYPE_ATOMIC_UINT:
 530       return 0;
 531    case GLSL_TYPE_IMAGE:
 532    case GLSL_TYPE_VOID:
 533    case GLSL_TYPE_ERROR:
 534    case GLSL_TYPE_INTERFACE:
 535       assert(!"not reached");
 536       break;
 537    }
 538
 539    return 0;
 540 }
 541
 542 fs_reg
 543 fs_visitor::get_timestamp()
 544 {
 545    assert(brw->gen >= 7);
 546
 547    fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 548                                           BRW_ARF_TIMESTAMP,
 549                                           0),
 550                              BRW_REGISTER_TYPE_UD));
 551
 552    fs_reg dst = fs_reg(this, glsl_type::uint_type);
 553
 554    fs_inst *mov = emit(MOV(dst, ts));
 555    /* We want to read the 3 fields we care about (mostly field 0, but also 2)
 556     * even if it's not enabled in the dispatch.
 557     */
 558    mov->force_writemask_all = true;
 559    mov->force_uncompressed = true;
 560
 561    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 562     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 563     * which is plenty of time for our purposes.  It is identical across the
 564     * EUs, but since it's tracking GPU core speed it will increment at a
 565     * varying rate as render P-states change.
 566     *
 567     * The caller could also check if render P-states have changed (or anything
 568     * else that might disrupt timing) by setting smear to 2 and checking if
 569     * that field is != 0.
 570     */
 571    dst.set_smear(0);
 572
 573    return dst;
 574 }
 575
 576 void
 577 fs_visitor::emit_shader_time_begin()
 578 {
 579    current_annotation = "shader time start";
 580    shader_start_time = get_timestamp();
 581 }
 582
 583 void
 584 fs_visitor::emit_shader_time_end()
 585 {
 586    current_annotation = "shader time end";
 587
 588    enum shader_time_shader_type type, written_type, reset_type;
 589    if (dispatch_width == 8) {
 590       type = ST_FS8;
 591       written_type = ST_FS8_WRITTEN;
 592       reset_type = ST_FS8_RESET;
 593    } else {
 594       assert(dispatch_width == 16);
 595       type = ST_FS16;
 596       written_type = ST_FS16_WRITTEN;
 597       reset_type = ST_FS16_RESET;
 598    }
 599
 600    fs_reg shader_end_time = get_timestamp();
 601
 602    /* Check that there weren't any timestamp reset events (assuming these
 603     * were the only two timestamp reads that happened).
 604     */
 605    fs_reg reset = shader_end_time;
 606    reset.set_smear(2);
 607    fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
 608    test->conditional_mod = BRW_CONDITIONAL_Z;
 609    emit(IF(BRW_PREDICATE_NORMAL));
 610
 611    push_force_uncompressed();
 612    fs_reg start = shader_start_time;
 613    start.negate = true;
 614    fs_reg diff = fs_reg(this, glsl_type::uint_type);
 615    emit(ADD(diff, start, shader_end_time));
 616
 617    /* If there were no instructions between the two timestamp gets, the diff
 618     * is 2 cycles.  Remove that overhead, so I can forget about that when
 619     * trying to determine the time taken for single instructions.
 620     */
 621    emit(ADD(diff, diff, fs_reg(-2u)));
 622
 623    emit_shader_time_write(type, diff);
 624    emit_shader_time_write(written_type, fs_reg(1u));
 625    emit(BRW_OPCODE_ELSE);
 626    emit_shader_time_write(reset_type, fs_reg(1u));
 627    emit(BRW_OPCODE_ENDIF);
 628
 629    pop_force_uncompressed();
 630 }
 631
 632 void
 633 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 634                                    fs_reg value)
 635 {
 636    int shader_time_index =
 637       brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
 638    fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
 639
 640    fs_reg payload;
 641    if (dispatch_width == 8)
 642       payload = fs_reg(this, glsl_type::uvec2_type);
 643    else
 644       payload = fs_reg(this, glsl_type::uint_type);
 645
 646    emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
 647                              fs_reg(), payload, offset, value));
 648 }
 649
 650 void
 651 fs_visitor::vfail(const char *format, va_list va)
 652 {
 653    char *msg;
 654
 655    if (failed)
 656       return;
 657
 658    failed = true;
 659
 660    msg = ralloc_vasprintf(mem_ctx, format, va);
 661    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
 662
 663    this->fail_msg = msg;
 664
 665    if (INTEL_DEBUG & DEBUG_WM) {
 666       fprintf(stderr, "%s",  msg);
 667    }
 668 }
 669
 670 void
 671 fs_visitor::fail(const char *format, ...)
 672 {
 673    va_list va;
 674
 675    va_start(va, format);
 676    vfail(format, va);
 677    va_end(va);
 678 }
 679
 680 /**
 681  * Mark this program as impossible to compile in SIMD16 mode.
 682  *
 683  * During the SIMD8 compile (which happens first), we can detect and flag
 684  * things that are unsupported in SIMD16 mode, so the compiler can skip
 685  * the SIMD16 compile altogether.
 686  *
 687  * During a SIMD16 compile (if one happens anyway), this just calls fail().
 688  */
 689 void
 690 fs_visitor::no16(const char *format, ...)
 691 {
 692    va_list va;
 693
 694    va_start(va, format);
 695
 696    if (dispatch_width == 16) {
 697       vfail(format, va);
 698    } else {
 699       simd16_unsupported = true;
 700
 701       if (brw->perf_debug) {
 702          if (no16_msg)
 703             ralloc_vasprintf_append(&no16_msg, format, va);
 704          else
 705             no16_msg = ralloc_vasprintf(mem_ctx, format, va);
 706       }
 707    }
 708
 709    va_end(va);
 710 }
 711
 712 fs_inst *
 713 fs_visitor::emit(enum opcode opcode)
 714 {
 715    return emit(new(mem_ctx) fs_inst(opcode));
 716 }
 717
 718 fs_inst *
 719 fs_visitor::emit(enum opcode opcode, fs_reg dst)
 720 {
 721    return emit(new(mem_ctx) fs_inst(opcode, dst));
 722 }
 723
 724 fs_inst *
 725 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
 726 {
 727    return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
 728 }
 729
 730 fs_inst *
 731 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 732 {
 733    return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
 734 }
 735
 736 fs_inst *
 737 fs_visitor::emit(enum opcode opcode, fs_reg dst,
 738                  fs_reg src0, fs_reg src1, fs_reg src2)
 739 {
 740    return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
 741 }
 742
 743 void
 744 fs_visitor::push_force_uncompressed()
 745 {
 746    force_uncompressed_stack++;
 747 }
 748
 749 void
 750 fs_visitor::pop_force_uncompressed()
 751 {
 752    force_uncompressed_stack--;
 753    assert(force_uncompressed_stack >= 0);
 754 }
 755
 756 /**
 757  * Returns true if the instruction has a flag that means it won't
 758  * update an entire destination register.
 759  *
 760  * For example, dead code elimination and live variable analysis want to know
 761  * when a write to a variable screens off any preceding values that were in
 762  * it.
 763  */
 764 bool
 765 fs_inst::is_partial_write() const
 766 {
 767    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 768            this->force_uncompressed ||
 769            this->force_sechalf || !this->dst.is_contiguous());
 770 }
 771
 772 int
 773 fs_inst::regs_read(fs_visitor *v, int arg) const
 774 {
 775    if (is_tex() && arg == 0 && src[0].file == GRF) {
 776       if (v->dispatch_width == 16)
 777          return (mlen + 1) / 2;
 778       else
 779          return mlen;
 780    }
 781    return 1;
 782 }
 783
 784 bool
 785 fs_inst::reads_flag() const
 786 {
 787    return predicate;
 788 }
 789
 790 bool
 791 fs_inst::writes_flag() const
 792 {
 793    return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
 794           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 795 }
 796
 797 /**
 798  * Returns how many MRFs an FS opcode will write over.
 799  *
 800  * Note that this is not the 0 or 1 implied writes in an actual gen
 801  * instruction -- the FS opcodes often generate MOVs in addition.
 802  */
 803 int
 804 fs_visitor::implied_mrf_writes(fs_inst *inst)
 805 {
 806    if (inst->mlen == 0)
 807       return 0;
 808
 809    if (inst->base_mrf == -1)
 810       return 0;
 811
 812    switch (inst->opcode) {
 813    case SHADER_OPCODE_RCP:
 814    case SHADER_OPCODE_RSQ:
 815    case SHADER_OPCODE_SQRT:
 816    case SHADER_OPCODE_EXP2:
 817    case SHADER_OPCODE_LOG2:
 818    case SHADER_OPCODE_SIN:
 819    case SHADER_OPCODE_COS:
 820       return 1 * dispatch_width / 8;
 821    case SHADER_OPCODE_POW:
 822    case SHADER_OPCODE_INT_QUOTIENT:
 823    case SHADER_OPCODE_INT_REMAINDER:
 824       return 2 * dispatch_width / 8;
 825    case SHADER_OPCODE_TEX:
 826    case FS_OPCODE_TXB:
 827    case SHADER_OPCODE_TXD:
 828    case SHADER_OPCODE_TXF:
 829    case SHADER_OPCODE_TXF_CMS:
 830    case SHADER_OPCODE_TXF_MCS:
 831    case SHADER_OPCODE_TG4:
 832    case SHADER_OPCODE_TG4_OFFSET:
 833    case SHADER_OPCODE_TXL:
 834    case SHADER_OPCODE_TXS:
 835    case SHADER_OPCODE_LOD:
 836       return 1;
 837    case FS_OPCODE_FB_WRITE:
 838       return 2;
 839    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 840    case SHADER_OPCODE_GEN4_SCRATCH_READ:
 841       return 1;
 842    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 843       return inst->mlen;
 844    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
 845       return 2;
 846    case SHADER_OPCODE_UNTYPED_ATOMIC:
 847    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 848       return 0;
 849    default:
 850       assert(!"not reached");
 851       return inst->mlen;
 852    }
 853 }
 854
 855 int
 856 fs_visitor::virtual_grf_alloc(int size)
 857 {
 858    if (virtual_grf_array_size <= virtual_grf_count) {
 859       if (virtual_grf_array_size == 0)
 860          virtual_grf_array_size = 16;
 861       else
 862          virtual_grf_array_size *= 2;
 863       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 864                                    virtual_grf_array_size);
 865    }
 866    virtual_grf_sizes[virtual_grf_count] = size;
 867    return virtual_grf_count++;
 868 }
 869
 870 /** Fixed HW reg constructor. */
 871 fs_reg::fs_reg(enum register_file file, int reg)
 872 {
 873    init();
 874    this->file = file;
 875    this->reg = reg;
 876    this->type = BRW_REGISTER_TYPE_F;
 877 }
 878
 879 /** Fixed HW reg constructor. */
 880 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
 881 {
 882    init();
 883    this->file = file;
 884    this->reg = reg;
 885    this->type = type;
 886 }
 887
 888 /** Automatic reg constructor. */
 889 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
 890 {
 891    init();
 892
 893    this->file = GRF;
 894    this->reg = v->virtual_grf_alloc(v->type_size(type));
 895    this->reg_offset = 0;
 896    this->type = brw_type_for_base_type(type);
 897 }
 898
 899 fs_reg *
 900 fs_visitor::variable_storage(ir_variable *var)
 901 {
 902    return (fs_reg *)hash_table_find(this->variable_ht, var);
 903 }
 904
 905 void
 906 import_uniforms_callback(const void *key,
 907                          void *data,
 908                          void *closure)
 909 {
 910    struct hash_table *dst_ht = (struct hash_table *)closure;
 911    const fs_reg *reg = (const fs_reg *)data;
 912
 913    if (reg->file != UNIFORM)
 914       return;
 915
 916    hash_table_insert(dst_ht, data, key);
 917 }
 918
 919 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 920  * This brings in those uniform definitions
 921  */
 922 void
 923 fs_visitor::import_uniforms(fs_visitor *v)
 924 {
 925    hash_table_call_foreach(v->variable_ht,
 926                            import_uniforms_callback,
 927                            variable_ht);
 928    this->push_constant_loc = v->push_constant_loc;
 929    this->pull_constant_loc = v->pull_constant_loc;
 930    this->uniforms = v->uniforms;
 931    this->param_size = v->param_size;
 932 }
 933
 934 /* Our support for uniforms is piggy-backed on the struct
 935  * gl_fragment_program, because that's where the values actually
 936  * get stored, rather than in some global gl_shader_program uniform
 937  * store.
 938  */
 939 void
 940 fs_visitor::setup_uniform_values(ir_variable *ir)
 941 {
 942    int namelen = strlen(ir->name);
 943
 944    /* The data for our (non-builtin) uniforms is stored in a series of
 945     * gl_uniform_driver_storage structs for each subcomponent that
 946     * glGetUniformLocation() could name.  We know it's been set up in the same
 947     * order we'd walk the type, so walk the list of storage and find anything
 948     * with our name, or the prefix of a component that starts with our name.
 949     */
 950    unsigned params_before = uniforms;
 951    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
 952       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 953
 954       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 955           (storage->name[namelen] != 0 &&
 956            storage->name[namelen] != '.' &&
 957            storage->name[namelen] != '[')) {
 958          continue;
 959       }
 960
 961       unsigned slots = storage->type->component_slots();
 962       if (storage->array_elements)
 963          slots *= storage->array_elements;
 964
 965       for (unsigned i = 0; i < slots; i++) {
 966          stage_prog_data->param[uniforms++] = &storage->storage[i].f;
 967       }
 968    }
 969
 970    /* Make sure we actually initialized the right amount of stuff here. */
 971    assert(params_before + ir->type->component_slots() == uniforms);
 972    (void)params_before;
 973 }
 974
 975
 976 /* Our support for builtin uniforms is even scarier than non-builtin.
 977  * It sits on top of the PROG_STATE_VAR parameters that are
 978  * automatically updated from GL context state.
 979  */
 980 void
 981 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
 982 {
 983    const ir_state_slot *const slots = ir->state_slots;
 984    assert(ir->state_slots != NULL);
 985
 986    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 987       /* This state reference has already been setup by ir_to_mesa, but we'll
 988        * get the same index back here.
 989        */
 990       int index = _mesa_add_state_reference(this->fp->Base.Parameters,
 991                                             (gl_state_index *)slots[i].tokens);
 992
 993       /* Add each of the unique swizzles of the element as a parameter.
 994        * This'll end up matching the expected layout of the
 995        * array/matrix/structure we're trying to fill in.
 996        */
 997       int last_swiz = -1;
 998       for (unsigned int j = 0; j < 4; j++) {
 999          int swiz = GET_SWZ(slots[i].swizzle, j);
1000          if (swiz == last_swiz)
1001             break;
1002          last_swiz = swiz;
1003
1004          stage_prog_data->param[uniforms++] =
1005             &fp->Base.Parameters->ParameterValues[index][swiz].f;
1006       }
1007    }
1008 }
1009
1010 fs_reg *
1011 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1012 {
1013    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1014    fs_reg wpos = *reg;
1015    bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1016
1017    /* gl_FragCoord.x */
1018    if (ir->data.pixel_center_integer) {
1019       emit(MOV(wpos, this->pixel_x));
1020    } else {
1021       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1022    }
1023    wpos.reg_offset++;
1024
1025    /* gl_FragCoord.y */
1026    if (!flip && ir->data.pixel_center_integer) {
1027       emit(MOV(wpos, this->pixel_y));
1028    } else {
1029       fs_reg pixel_y = this->pixel_y;
1030       float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1031
1032       if (flip) {
1033          pixel_y.negate = true;
1034          offset += key->drawable_height - 1.0;
1035       }
1036
1037       emit(ADD(wpos, pixel_y, fs_reg(offset)));
1038    }
1039    wpos.reg_offset++;
1040
1041    /* gl_FragCoord.z */
1042    if (brw->gen >= 6) {
1043       emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1044    } else {
1045       emit(FS_OPCODE_LINTERP, wpos,
1046            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1047            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1048            interp_reg(VARYING_SLOT_POS, 2));
1049    }
1050    wpos.reg_offset++;
1051
1052    /* gl_FragCoord.w: Already set up in emit_interpolation */
1053    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1054
1055    return reg;
1056 }
1057
1058 fs_inst *
1059 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1060                          glsl_interp_qualifier interpolation_mode,
1061                          bool is_centroid, bool is_sample)
1062 {
1063    brw_wm_barycentric_interp_mode barycoord_mode;
1064    if (brw->gen >= 6) {
1065       if (is_centroid) {
1066          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1067             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1068          else
1069             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1070       } else if (is_sample) {
1071           if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1072             barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1073          else
1074             barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1075       } else {
1076          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1077             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1078          else
1079             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1080       }
1081    } else {
1082       /* On Ironlake and below, there is only one interpolation mode.
1083        * Centroid interpolation doesn't mean anything on this hardware --
1084        * there is no multisampling.
1085        */
1086       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1087    }
1088    return emit(FS_OPCODE_LINTERP, attr,
1089                this->delta_x[barycoord_mode],
1090                this->delta_y[barycoord_mode], interp);
1091 }
1092
1093 fs_reg *
1094 fs_visitor::emit_general_interpolation(ir_variable *ir)
1095 {
1096    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1097    reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1098    fs_reg attr = *reg;
1099
1100    unsigned int array_elements;
1101    const glsl_type *type;
1102
1103    if (ir->type->is_array()) {
1104       array_elements = ir->type->length;
1105       if (array_elements == 0) {
1106          fail("dereferenced array '%s' has length 0\n", ir->name);
1107       }
1108       type = ir->type->fields.array;
1109    } else {
1110       array_elements = 1;
1111       type = ir->type;
1112    }
1113
1114    glsl_interp_qualifier interpolation_mode =
1115       ir->determine_interpolation_mode(key->flat_shade);
1116
1117    int location = ir->data.location;
1118    for (unsigned int i = 0; i < array_elements; i++) {
1119       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1120          if (prog_data->urb_setup[location] == -1) {
1121             /* If there's no incoming setup data for this slot, don't
1122              * emit interpolation for it.
1123              */
1124             attr.reg_offset += type->vector_elements;
1125             location++;
1126             continue;
1127          }
1128
1129          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1130             /* Constant interpolation (flat shading) case. The SF has
1131              * handed us defined values in only the constant offset
1132              * field of the setup reg.
1133              */
1134             for (unsigned int k = 0; k < type->vector_elements; k++) {
1135                struct brw_reg interp = interp_reg(location, k);
1136                interp = suboffset(interp, 3);
1137                interp.type = reg->type;
1138                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1139                attr.reg_offset++;
1140             }
1141          } else {
1142             /* Smooth/noperspective interpolation case. */
1143             for (unsigned int k = 0; k < type->vector_elements; k++) {
1144                struct brw_reg interp = interp_reg(location, k);
1145                emit_linterp(attr, fs_reg(interp), interpolation_mode,
1146                             ir->data.centroid && !key->persample_shading,
1147                             ir->data.sample || key->persample_shading);
1148                if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1149                   /* Get the pixel/sample mask into f0 so that we know
1150                    * which pixels are lit.  Then, for each channel that is
1151                    * unlit, replace the centroid data with non-centroid
1152                    * data.
1153                    */
1154                   emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1155                   fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1156                                                interpolation_mode,
1157                                                false, false);
1158                   inst->predicate = BRW_PREDICATE_NORMAL;
1159                   inst->predicate_inverse = true;
1160                }
1161                if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1162                   emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1163                }
1164                attr.reg_offset++;
1165             }
1166
1167          }
1168          location++;
1169       }
1170    }
1171
1172    return reg;
1173 }
1174
1175 fs_reg *
1176 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1177 {
1178    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1179
1180    /* The frontfacing comes in as a bit in the thread payload. */
1181    if (brw->gen >= 6) {
1182       emit(BRW_OPCODE_ASR, *reg,
1183            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1184            fs_reg(15));
1185       emit(BRW_OPCODE_NOT, *reg, *reg);
1186       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1187    } else {
1188       struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1189       /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1190        * us front face
1191        */
1192       emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1193       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1194    }
1195
1196    return reg;
1197 }
1198
1199 void
1200 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1201 {
1202    assert(dst.type == BRW_REGISTER_TYPE_F);
1203
1204    if (key->compute_pos_offset) {
1205       /* Convert int_sample_pos to floating point */
1206       emit(MOV(dst, int_sample_pos));
1207       /* Scale to the range [0, 1] */
1208       emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1209    }
1210    else {
1211       /* From ARB_sample_shading specification:
1212        * "When rendering to a non-multisample buffer, or if multisample
1213        *  rasterization is disabled, gl_SamplePosition will always be
1214        *  (0.5, 0.5).
1215        */
1216       emit(MOV(dst, fs_reg(0.5f)));
1217    }
1218 }
1219
1220 fs_reg *
1221 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1222 {
1223    assert(brw->gen >= 6);
1224    assert(ir->type == glsl_type::vec2_type);
1225
1226    this->current_annotation = "compute sample position";
1227    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1228    fs_reg pos = *reg;
1229    fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1230    fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1231
1232    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1233     * mode will be enabled.
1234     *
1235     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1236     * R31.1:0         Position Offset X/Y for Slot[3:0]
1237     * R31.3:2         Position Offset X/Y for Slot[7:4]
1238     * .....
1239     *
1240     * The X, Y sample positions come in as bytes in  thread payload. So, read
1241     * the positions using vstride=16, width=8, hstride=2.
1242     */
1243    struct brw_reg sample_pos_reg =
1244       stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1245                     BRW_REGISTER_TYPE_B), 16, 8, 2);
1246
1247    emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1248    if (dispatch_width == 16) {
1249       fs_inst *inst = emit(MOV(half(int_sample_x, 1),
1250                                fs_reg(suboffset(sample_pos_reg, 16))));
1251       inst->force_sechalf = true;
1252    }
1253    /* Compute gl_SamplePosition.x */
1254    compute_sample_position(pos, int_sample_x);
1255    pos.reg_offset++;
1256    emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1257    if (dispatch_width == 16) {
1258       fs_inst *inst = emit(MOV(half(int_sample_y, 1),
1259                                fs_reg(suboffset(sample_pos_reg, 17))));
1260       inst->force_sechalf = true;
1261    }
1262    /* Compute gl_SamplePosition.y */
1263    compute_sample_position(pos, int_sample_y);
1264    return reg;
1265 }
1266
1267 fs_reg *
1268 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1269 {
1270    assert(brw->gen >= 6);
1271
1272    this->current_annotation = "compute sample id";
1273    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1274
1275    if (key->compute_sample_id) {
1276       fs_reg t1 = fs_reg(this, glsl_type::int_type);
1277       fs_reg t2 = fs_reg(this, glsl_type::int_type);
1278       t2.type = BRW_REGISTER_TYPE_UW;
1279
1280       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1281        * 8x multisampling, subspan 0 will represent sample N (where N
1282        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1283        * 7. We can find the value of N by looking at R0.0 bits 7:6
1284        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1285        * (since samples are always delivered in pairs). That is, we
1286        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1287        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1288        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1289        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1290        * populating a temporary variable with the sequence (0, 1, 2, 3),
1291        * and then reading from it using vstride=1, width=4, hstride=0.
1292        * These computations hold good for 4x multisampling as well.
1293        */
1294       emit(BRW_OPCODE_AND, t1,
1295            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1296            fs_reg(0xc0));
1297       emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1298       /* This works for both SIMD8 and SIMD16 */
1299       emit(MOV(t2, brw_imm_v(0x3210)));
1300       /* This special instruction takes care of setting vstride=1,
1301        * width=4, hstride=0 of t2 during an ADD instruction.
1302        */
1303       emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1304    } else {
1305       /* As per GL_ARB_sample_shading specification:
1306        * "When rendering to a non-multisample buffer, or if multisample
1307        *  rasterization is disabled, gl_SampleID will always be zero."
1308        */
1309       emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1310    }
1311
1312    return reg;
1313 }
1314
1315 fs_reg
1316 fs_visitor::fix_math_operand(fs_reg src)
1317 {
1318    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1319     * might be able to do better by doing execsize = 1 math and then
1320     * expanding that result out, but we would need to be careful with
1321     * masking.
1322     *
1323     * The hardware ignores source modifiers (negate and abs) on math
1324     * instructions, so we also move to a temp to set those up.
1325     */
1326    if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1327        !src.abs && !src.negate)
1328       return src;
1329
1330    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1331     * operands to math
1332     */
1333    if (brw->gen >= 7 && src.file != IMM)
1334       return src;
1335
1336    fs_reg expanded = fs_reg(this, glsl_type::float_type);
1337    expanded.type = src.type;
1338    emit(BRW_OPCODE_MOV, expanded, src);
1339    return expanded;
1340 }
1341
1342 fs_inst *
1343 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1344 {
1345    switch (opcode) {
1346    case SHADER_OPCODE_RCP:
1347    case SHADER_OPCODE_RSQ:
1348    case SHADER_OPCODE_SQRT:
1349    case SHADER_OPCODE_EXP2:
1350    case SHADER_OPCODE_LOG2:
1351    case SHADER_OPCODE_SIN:
1352    case SHADER_OPCODE_COS:
1353       break;
1354    default:
1355       assert(!"not reached: bad math opcode");
1356       return NULL;
1357    }
1358
1359    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
1360     * might be able to do better by doing execsize = 1 math and then
1361     * expanding that result out, but we would need to be careful with
1362     * masking.
1363     *
1364     * Gen 6 hardware ignores source modifiers (negate and abs) on math
1365     * instructions, so we also move to a temp to set those up.
1366     */
1367    if (brw->gen >= 6)
1368       src = fix_math_operand(src);
1369
1370    fs_inst *inst = emit(opcode, dst, src);
1371
1372    if (brw->gen < 6) {
1373       inst->base_mrf = 2;
1374       inst->mlen = dispatch_width / 8;
1375    }
1376
1377    return inst;
1378 }
1379
1380 fs_inst *
1381 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1382 {
1383    int base_mrf = 2;
1384    fs_inst *inst;
1385
1386    switch (opcode) {
1387    case SHADER_OPCODE_INT_QUOTIENT:
1388    case SHADER_OPCODE_INT_REMAINDER:
1389       if (brw->gen >= 7)
1390          no16("SIMD16 INTDIV unsupported\n");
1391       break;
1392    case SHADER_OPCODE_POW:
1393       break;
1394    default:
1395       assert(!"not reached: unsupported binary math opcode.");
1396       return NULL;
1397    }
1398
1399    if (brw->gen >= 6) {
1400       src0 = fix_math_operand(src0);
1401       src1 = fix_math_operand(src1);
1402
1403       inst = emit(opcode, dst, src0, src1);
1404    } else {
1405       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1406        * "Message Payload":
1407        *
1408        * "Operand0[7].  For the INT DIV functions, this operand is the
1409        *  denominator."
1410        *  ...
1411        * "Operand1[7].  For the INT DIV functions, this operand is the
1412        *  numerator."
1413        */
1414       bool is_int_div = opcode != SHADER_OPCODE_POW;
1415       fs_reg &op0 = is_int_div ? src1 : src0;
1416       fs_reg &op1 = is_int_div ? src0 : src1;
1417
1418       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1419       inst = emit(opcode, dst, op0, reg_null_f);
1420
1421       inst->base_mrf = base_mrf;
1422       inst->mlen = 2 * dispatch_width / 8;
1423    }
1424    return inst;
1425 }
1426
1427 void
1428 fs_visitor::assign_curb_setup()
1429 {
1430    if (dispatch_width == 8) {
1431       prog_data->first_curbe_grf = payload.num_regs;
1432    } else {
1433       prog_data->first_curbe_grf_16 = payload.num_regs;
1434    }
1435
1436    prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1437
1438    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1439    foreach_list(node, &this->instructions) {
1440       fs_inst *inst = (fs_inst *)node;
1441
1442       for (unsigned int i = 0; i < inst->sources; i++) {
1443          if (inst->src[i].file == UNIFORM) {
1444             int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1445             int constant_nr;
1446             if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1447                constant_nr = push_constant_loc[uniform_nr];
1448             } else {
1449                /* Section 5.11 of the OpenGL 4.1 spec says:
1450                 * "Out-of-bounds reads return undefined values, which include
1451                 *  values from other variables of the active program or zero."
1452                 * Just return the first push constant.
1453                 */
1454                constant_nr = 0;
1455             }
1456
1457             struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1458                                                   constant_nr / 8,
1459                                                   constant_nr % 8);
1460
1461             inst->src[i].file = HW_REG;
1462             inst->src[i].fixed_hw_reg = byte_offset(
1463                retype(brw_reg, inst->src[i].type),
1464                inst->src[i].subreg_offset);
1465          }
1466       }
1467    }
1468 }
1469
1470 void
1471 fs_visitor::calculate_urb_setup()
1472 {
1473    for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1474       prog_data->urb_setup[i] = -1;
1475    }
1476
1477    int urb_next = 0;
1478    /* Figure out where each of the incoming setup attributes lands. */
1479    if (brw->gen >= 6) {
1480       if (_mesa_bitcount_64(fp->Base.InputsRead &
1481                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1482          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1483           * first 16 varying inputs, so we can put them wherever we want.
1484           * Just put them in order.
1485           *
1486           * This is useful because it means that (a) inputs not used by the
1487           * fragment shader won't take up valuable register space, and (b) we
1488           * won't have to recompile the fragment shader if it gets paired with
1489           * a different vertex (or geometry) shader.
1490           */
1491          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1492             if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1493                 BITFIELD64_BIT(i)) {
1494                prog_data->urb_setup[i] = urb_next++;
1495             }
1496          }
1497       } else {
1498          /* We have enough input varyings that the SF/SBE pipeline stage can't
1499           * arbitrarily rearrange them to suit our whim; we have to put them
1500           * in an order that matches the output of the previous pipeline stage
1501           * (geometry or vertex shader).
1502           */
1503          struct brw_vue_map prev_stage_vue_map;
1504          brw_compute_vue_map(brw, &prev_stage_vue_map,
1505                              key->input_slots_valid);
1506          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1507          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1508          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1509               slot++) {
1510             int varying = prev_stage_vue_map.slot_to_varying[slot];
1511             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1512              * unused.
1513              */
1514             if (varying != BRW_VARYING_SLOT_COUNT &&
1515                 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1516                  BITFIELD64_BIT(varying))) {
1517                prog_data->urb_setup[varying] = slot - first_slot;
1518             }
1519          }
1520          urb_next = prev_stage_vue_map.num_slots - first_slot;
1521       }
1522    } else {
1523       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1524       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1525          /* Point size is packed into the header, not as a general attribute */
1526          if (i == VARYING_SLOT_PSIZ)
1527             continue;
1528
1529          if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1530             /* The back color slot is skipped when the front color is
1531              * also written to.  In addition, some slots can be
1532              * written in the vertex shader and not read in the
1533              * fragment shader.  So the register number must always be
1534              * incremented, mapped or not.
1535              */
1536             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1537                prog_data->urb_setup[i] = urb_next;
1538             urb_next++;
1539          }
1540       }
1541
1542       /*
1543        * It's a FS only attribute, and we did interpolation for this attribute
1544        * in SF thread. So, count it here, too.
1545        *
1546        * See compile_sf_prog() for more info.
1547        */
1548       if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1549          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1550    }
1551
1552    prog_data->num_varying_inputs = urb_next;
1553 }
1554
1555 void
1556 fs_visitor::assign_urb_setup()
1557 {
1558    int urb_start = payload.num_regs + prog_data->curb_read_length;
1559
1560    /* Offset all the urb_setup[] index by the actual position of the
1561     * setup regs, now that the location of the constants has been chosen.
1562     */
1563    foreach_list(node, &this->instructions) {
1564       fs_inst *inst = (fs_inst *)node;
1565
1566       if (inst->opcode == FS_OPCODE_LINTERP) {
1567          assert(inst->src[2].file == HW_REG);
1568          inst->src[2].fixed_hw_reg.nr += urb_start;
1569       }
1570
1571       if (inst->opcode == FS_OPCODE_CINTERP) {
1572          assert(inst->src[0].file == HW_REG);
1573          inst->src[0].fixed_hw_reg.nr += urb_start;
1574       }
1575    }
1576
1577    /* Each attribute is 4 setup channels, each of which is half a reg. */
1578    this->first_non_payload_grf =
1579       urb_start + prog_data->num_varying_inputs * 2;
1580 }
1581
1582 /**
1583  * Split large virtual GRFs into separate components if we can.
1584  *
1585  * This is mostly duplicated with what brw_fs_vector_splitting does,
1586  * but that's really conservative because it's afraid of doing
1587  * splitting that doesn't result in real progress after the rest of
1588  * the optimization phases, which would cause infinite looping in
1589  * optimization.  We can do it once here, safely.  This also has the
1590  * opportunity to split interpolated values, or maybe even uniforms,
1591  * which we don't have at the IR level.
1592  *
1593  * We want to split, because virtual GRFs are what we register
1594  * allocate and spill (due to contiguousness requirements for some
1595  * instructions), and they're what we naturally generate in the
1596  * codegen process, but most virtual GRFs don't actually need to be
1597  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1598  * live intervals and better dead code elimination and coalescing.
1599  */
1600 void
1601 fs_visitor::split_virtual_grfs()
1602 {
1603    int num_vars = this->virtual_grf_count;
1604    bool split_grf[num_vars];
1605    int new_virtual_grf[num_vars];
1606
1607    /* Try to split anything > 0 sized. */
1608    for (int i = 0; i < num_vars; i++) {
1609       if (this->virtual_grf_sizes[i] != 1)
1610          split_grf[i] = true;
1611       else
1612          split_grf[i] = false;
1613    }
1614
1615    if (brw->has_pln &&
1616        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1617       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1618        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1619        * Gen6, that was the only supported interpolation mode, and since Gen6,
1620        * delta_x and delta_y are in fixed hardware registers.
1621        */
1622       split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1623          false;
1624    }
1625
1626    foreach_list(node, &this->instructions) {
1627       fs_inst *inst = (fs_inst *)node;
1628
1629       /* If there's a SEND message that requires contiguous destination
1630        * registers, no splitting is allowed.
1631        */
1632       if (inst->regs_written > 1) {
1633          split_grf[inst->dst.reg] = false;
1634       }
1635
1636       /* If we're sending from a GRF, don't split it, on the assumption that
1637        * the send is reading the whole thing.
1638        */
1639       if (inst->is_send_from_grf()) {
1640          for (int i = 0; i < inst->sources; i++) {
1641             if (inst->src[i].file == GRF) {
1642                split_grf[inst->src[i].reg] = false;
1643             }
1644          }
1645       }
1646    }
1647
1648    /* Allocate new space for split regs.  Note that the virtual
1649     * numbers will be contiguous.
1650     */
1651    for (int i = 0; i < num_vars; i++) {
1652       if (split_grf[i]) {
1653          new_virtual_grf[i] = virtual_grf_alloc(1);
1654          for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1655             int reg = virtual_grf_alloc(1);
1656             assert(reg == new_virtual_grf[i] + j - 1);
1657             (void) reg;
1658          }
1659          this->virtual_grf_sizes[i] = 1;
1660       }
1661    }
1662
1663    foreach_list(node, &this->instructions) {
1664       fs_inst *inst = (fs_inst *)node;
1665
1666       if (inst->dst.file == GRF &&
1667           split_grf[inst->dst.reg] &&
1668           inst->dst.reg_offset != 0) {
1669          inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1670                           inst->dst.reg_offset - 1);
1671          inst->dst.reg_offset = 0;
1672       }
1673       for (int i = 0; i < inst->sources; i++) {
1674          if (inst->src[i].file == GRF &&
1675              split_grf[inst->src[i].reg] &&
1676              inst->src[i].reg_offset != 0) {
1677             inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1678                                 inst->src[i].reg_offset - 1);
1679             inst->src[i].reg_offset = 0;
1680          }
1681       }
1682    }
1683    invalidate_live_intervals();
1684 }
1685
1686 /**
1687  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1688  *
1689  * During code generation, we create tons of temporary variables, many of
1690  * which get immediately killed and are never used again.  Yet, in later
1691  * optimization and analysis passes, such as compute_live_intervals, we need
1692  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1693  * overhead.
1694  */
1695 void
1696 fs_visitor::compact_virtual_grfs()
1697 {
1698    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER))
1699       return;
1700
1701    /* Mark which virtual GRFs are used, and count how many. */
1702    int remap_table[this->virtual_grf_count];
1703    memset(remap_table, -1, sizeof(remap_table));
1704
1705    foreach_list(node, &this->instructions) {
1706       const fs_inst *inst = (const fs_inst *) node;
1707
1708       if (inst->dst.file == GRF)
1709          remap_table[inst->dst.reg] = 0;
1710
1711       for (int i = 0; i < inst->sources; i++) {
1712          if (inst->src[i].file == GRF)
1713             remap_table[inst->src[i].reg] = 0;
1714       }
1715    }
1716
1717    /* Compact the GRF arrays. */
1718    int new_index = 0;
1719    for (int i = 0; i < this->virtual_grf_count; i++) {
1720       if (remap_table[i] != -1) {
1721          remap_table[i] = new_index;
1722          virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1723          invalidate_live_intervals();
1724          ++new_index;
1725       }
1726    }
1727
1728    this->virtual_grf_count = new_index;
1729
1730    /* Patch all the instructions to use the newly renumbered registers */
1731    foreach_list(node, &this->instructions) {
1732       fs_inst *inst = (fs_inst *) node;
1733
1734       if (inst->dst.file == GRF)
1735          inst->dst.reg = remap_table[inst->dst.reg];
1736
1737       for (int i = 0; i < inst->sources; i++) {
1738          if (inst->src[i].file == GRF)
1739             inst->src[i].reg = remap_table[inst->src[i].reg];
1740       }
1741    }
1742 }
1743
1744 /*
1745  * Implements array access of uniforms by inserting a
1746  * PULL_CONSTANT_LOAD instruction.
1747  *
1748  * Unlike temporary GRF array access (where we don't support it due to
1749  * the difficulty of doing relative addressing on instruction
1750  * destinations), we could potentially do array access of uniforms
1751  * that were loaded in GRF space as push constants.  In real-world
1752  * usage we've seen, though, the arrays being used are always larger
1753  * than we could load as push constants, so just always move all
1754  * uniform array access out to a pull constant buffer.
1755  */
1756 void
1757 fs_visitor::move_uniform_array_access_to_pull_constants()
1758 {
1759    if (dispatch_width != 8)
1760       return;
1761
1762    pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1763
1764    for (unsigned int i = 0; i < uniforms; i++) {
1765       pull_constant_loc[i] = -1;
1766    }
1767
1768    /* Walk through and find array access of uniforms.  Put a copy of that
1769     * uniform in the pull constant buffer.
1770     *
1771     * Note that we don't move constant-indexed accesses to arrays.  No
1772     * testing has been done of the performance impact of this choice.
1773     */
1774    foreach_list_safe(node, &this->instructions) {
1775       fs_inst *inst = (fs_inst *)node;
1776
1777       for (int i = 0 ; i < inst->sources; i++) {
1778          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1779             continue;
1780
1781          int uniform = inst->src[i].reg;
1782
1783          /* If this array isn't already present in the pull constant buffer,
1784           * add it.
1785           */
1786          if (pull_constant_loc[uniform] == -1) {
1787             const float **values = &stage_prog_data->param[uniform];
1788
1789             assert(param_size[uniform]);
1790
1791             for (int j = 0; j < param_size[uniform]; j++) {
1792                pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1793
1794                stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1795                   values[j];
1796             }
1797          }
1798       }
1799    }
1800 }
1801
1802 /**
1803  * Assign UNIFORM file registers to either push constants or pull constants.
1804  *
1805  * We allow a fragment shader to have more than the specified minimum
1806  * maximum number of fragment shader uniform components (64).  If
1807  * there are too many of these, they'd fill up all of register space.
1808  * So, this will push some of them out to the pull constant buffer and
1809  * update the program to load them.
1810  */
1811 void
1812 fs_visitor::assign_constant_locations()
1813 {
1814    /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1815    if (dispatch_width != 8)
1816       return;
1817
1818    /* Find which UNIFORM registers are still in use. */
1819    bool is_live[uniforms];
1820    for (unsigned int i = 0; i < uniforms; i++) {
1821       is_live[i] = false;
1822    }
1823
1824    foreach_list(node, &this->instructions) {
1825       fs_inst *inst = (fs_inst *) node;
1826
1827       for (int i = 0; i < inst->sources; i++) {
1828          if (inst->src[i].file != UNIFORM)
1829             continue;
1830
1831          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1832          if (constant_nr >= 0 && constant_nr < (int) uniforms)
1833             is_live[constant_nr] = true;
1834       }
1835    }
1836
1837    /* Only allow 16 registers (128 uniform components) as push constants.
1838     *
1839     * Just demote the end of the list.  We could probably do better
1840     * here, demoting things that are rarely used in the program first.
1841     */
1842    unsigned int max_push_components = 16 * 8;
1843    unsigned int num_push_constants = 0;
1844
1845    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1846
1847    for (unsigned int i = 0; i < uniforms; i++) {
1848       if (!is_live[i] || pull_constant_loc[i] != -1) {
1849          /* This UNIFORM register is either dead, or has already been demoted
1850           * to a pull const.  Mark it as no longer living in the param[] array.
1851           */
1852          push_constant_loc[i] = -1;
1853          continue;
1854       }
1855
1856       if (num_push_constants < max_push_components) {
1857          /* Retain as a push constant.  Record the location in the params[]
1858           * array.
1859           */
1860          push_constant_loc[i] = num_push_constants++;
1861       } else {
1862          /* Demote to a pull constant. */
1863          push_constant_loc[i] = -1;
1864
1865          int pull_index = stage_prog_data->nr_pull_params++;
1866          stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1867          pull_constant_loc[i] = pull_index;
1868       }
1869    }
1870
1871    stage_prog_data->nr_params = num_push_constants;
1872
1873    /* Up until now, the param[] array has been indexed by reg + reg_offset
1874     * of UNIFORM registers.  Condense it to only contain the uniforms we
1875     * chose to upload as push constants.
1876     */
1877    for (unsigned int i = 0; i < uniforms; i++) {
1878       int remapped = push_constant_loc[i];
1879
1880       if (remapped == -1)
1881          continue;
1882
1883       assert(remapped <= (int)i);
1884       stage_prog_data->param[remapped] = stage_prog_data->param[i];
1885    }
1886 }
1887
1888 /**
1889  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1890  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1891  */
1892 void
1893 fs_visitor::demote_pull_constants()
1894 {
1895    foreach_list(node, &this->instructions) {
1896       fs_inst *inst = (fs_inst *)node;
1897
1898       for (int i = 0; i < inst->sources; i++) {
1899          if (inst->src[i].file != UNIFORM)
1900             continue;
1901
1902          int pull_index = pull_constant_loc[inst->src[i].reg +
1903                                             inst->src[i].reg_offset];
1904          if (pull_index == -1)
1905             continue;
1906
1907          /* Set up the annotation tracking for new generated instructions. */
1908          base_ir = inst->ir;
1909          current_annotation = inst->annotation;
1910
1911          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1912          fs_reg dst = fs_reg(this, glsl_type::float_type);
1913
1914          /* Generate a pull load into dst. */
1915          if (inst->src[i].reladdr) {
1916             exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1917                                                         surf_index,
1918                                                         *inst->src[i].reladdr,
1919                                                         pull_index);
1920             inst->insert_before(&list);
1921             inst->src[i].reladdr = NULL;
1922          } else {
1923             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1924             fs_inst *pull =
1925                new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1926                                     dst, surf_index, offset);
1927             inst->insert_before(pull);
1928             inst->src[i].set_smear(pull_index & 3);
1929          }
1930
1931          /* Rewrite the instruction to use the temporary VGRF. */
1932          inst->src[i].file = GRF;
1933          inst->src[i].reg = dst.reg;
1934          inst->src[i].reg_offset = 0;
1935       }
1936    }
1937    invalidate_live_intervals();
1938 }
1939
1940 bool
1941 fs_visitor::opt_algebraic()
1942 {
1943    bool progress = false;
1944
1945    foreach_list(node, &this->instructions) {
1946       fs_inst *inst = (fs_inst *)node;
1947
1948       switch (inst->opcode) {
1949       case BRW_OPCODE_MUL:
1950          if (inst->src[1].file != IMM)
1951             continue;
1952
1953          /* a * 1.0 = a */
1954          if (inst->src[1].is_one()) {
1955             inst->opcode = BRW_OPCODE_MOV;
1956             inst->src[1] = reg_undef;
1957             progress = true;
1958             break;
1959          }
1960
1961          /* a * 0.0 = 0.0 */
1962          if (inst->src[1].is_zero()) {
1963             inst->opcode = BRW_OPCODE_MOV;
1964             inst->src[0] = inst->src[1];
1965             inst->src[1] = reg_undef;
1966             progress = true;
1967             break;
1968          }
1969
1970          break;
1971       case BRW_OPCODE_ADD:
1972          if (inst->src[1].file != IMM)
1973             continue;
1974
1975          /* a + 0.0 = a */
1976          if (inst->src[1].is_zero()) {
1977             inst->opcode = BRW_OPCODE_MOV;
1978             inst->src[1] = reg_undef;
1979             progress = true;
1980             break;
1981          }
1982          break;
1983       case BRW_OPCODE_OR:
1984          if (inst->src[0].equals(inst->src[1])) {
1985             inst->opcode = BRW_OPCODE_MOV;
1986             inst->src[1] = reg_undef;
1987             progress = true;
1988             break;
1989          }
1990          break;
1991       case BRW_OPCODE_LRP:
1992          if (inst->src[1].equals(inst->src[2])) {
1993             inst->opcode = BRW_OPCODE_MOV;
1994             inst->src[0] = inst->src[1];
1995             inst->src[1] = reg_undef;
1996             inst->src[2] = reg_undef;
1997             progress = true;
1998             break;
1999          }
2000          break;
2001       case BRW_OPCODE_SEL:
2002          if (inst->saturate && inst->src[1].file == IMM) {
2003             switch (inst->conditional_mod) {
2004             case BRW_CONDITIONAL_LE:
2005             case BRW_CONDITIONAL_L:
2006                switch (inst->src[1].type) {
2007                case BRW_REGISTER_TYPE_F:
2008                   if (inst->src[1].imm.f >= 1.0f) {
2009                      inst->opcode = BRW_OPCODE_MOV;
2010                      inst->src[1] = reg_undef;
2011                      progress = true;
2012                   }
2013                   break;
2014                default:
2015                   break;
2016                }
2017                break;
2018             case BRW_CONDITIONAL_GE:
2019             case BRW_CONDITIONAL_G:
2020                switch (inst->src[1].type) {
2021                case BRW_REGISTER_TYPE_F:
2022                   if (inst->src[1].imm.f <= 0.0f) {
2023                      inst->opcode = BRW_OPCODE_MOV;
2024                      inst->src[1] = reg_undef;
2025                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2026                      progress = true;
2027                   }
2028                   break;
2029                default:
2030                   break;
2031                }
2032             default:
2033                break;
2034             }
2035          }
2036          break;
2037       default:
2038          break;
2039       }
2040    }
2041
2042    return progress;
2043 }
2044
2045 bool
2046 fs_visitor::compute_to_mrf()
2047 {
2048    bool progress = false;
2049    int next_ip = 0;
2050
2051    calculate_live_intervals();
2052
2053    foreach_list_safe(node, &this->instructions) {
2054       fs_inst *inst = (fs_inst *)node;
2055
2056       int ip = next_ip;
2057       next_ip++;
2058
2059       if (inst->opcode != BRW_OPCODE_MOV ||
2060           inst->is_partial_write() ||
2061           inst->dst.file != MRF || inst->src[0].file != GRF ||
2062           inst->dst.type != inst->src[0].type ||
2063           inst->src[0].abs || inst->src[0].negate ||
2064           !inst->src[0].is_contiguous() ||
2065           inst->src[0].subreg_offset)
2066          continue;
2067
2068       /* Work out which hardware MRF registers are written by this
2069        * instruction.
2070        */
2071       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2072       int mrf_high;
2073       if (inst->dst.reg & BRW_MRF_COMPR4) {
2074          mrf_high = mrf_low + 4;
2075       } else if (dispatch_width == 16 &&
2076                  (!inst->force_uncompressed && !inst->force_sechalf)) {
2077          mrf_high = mrf_low + 1;
2078       } else {
2079          mrf_high = mrf_low;
2080       }
2081
2082       /* Can't compute-to-MRF this GRF if someone else was going to
2083        * read it later.
2084        */
2085       if (this->virtual_grf_end[inst->src[0].reg] > ip)
2086          continue;
2087
2088       /* Found a move of a GRF to a MRF.  Let's see if we can go
2089        * rewrite the thing that made this GRF to write into the MRF.
2090        */
2091       fs_inst *scan_inst;
2092       for (scan_inst = (fs_inst *)inst->prev;
2093            scan_inst->prev != NULL;
2094            scan_inst = (fs_inst *)scan_inst->prev) {
2095          if (scan_inst->dst.file == GRF &&
2096              scan_inst->dst.reg == inst->src[0].reg) {
2097             /* Found the last thing to write our reg we want to turn
2098              * into a compute-to-MRF.
2099              */
2100
2101             /* If this one instruction didn't populate all the
2102              * channels, bail.  We might be able to rewrite everything
2103              * that writes that reg, but it would require smarter
2104              * tracking to delay the rewriting until complete success.
2105              */
2106             if (scan_inst->is_partial_write())
2107                break;
2108
2109             /* Things returning more than one register would need us to
2110              * understand coalescing out more than one MOV at a time.
2111              */
2112             if (scan_inst->regs_written > 1)
2113                break;
2114
2115             /* SEND instructions can't have MRF as a destination. */
2116             if (scan_inst->mlen)
2117                break;
2118
2119             if (brw->gen == 6) {
2120                /* gen6 math instructions must have the destination be
2121                 * GRF, so no compute-to-MRF for them.
2122                 */
2123                if (scan_inst->is_math()) {
2124                   break;
2125                }
2126             }
2127
2128             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2129                /* Found the creator of our MRF's source value. */
2130                scan_inst->dst.file = MRF;
2131                scan_inst->dst.reg = inst->dst.reg;
2132                scan_inst->saturate |= inst->saturate;
2133                inst->remove();
2134                progress = true;
2135             }
2136             break;
2137          }
2138
2139          /* We don't handle control flow here.  Most computation of
2140           * values that end up in MRFs are shortly before the MRF
2141           * write anyway.
2142           */
2143          if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2144             break;
2145
2146          /* You can't read from an MRF, so if someone else reads our
2147           * MRF's source GRF that we wanted to rewrite, that stops us.
2148           */
2149          bool interfered = false;
2150          for (int i = 0; i < scan_inst->sources; i++) {
2151             if (scan_inst->src[i].file == GRF &&
2152                 scan_inst->src[i].reg == inst->src[0].reg &&
2153                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2154                interfered = true;
2155             }
2156          }
2157          if (interfered)
2158             break;
2159
2160          if (scan_inst->dst.file == MRF) {
2161             /* If somebody else writes our MRF here, we can't
2162              * compute-to-MRF before that.
2163              */
2164             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2165             int scan_mrf_high;
2166
2167             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2168                scan_mrf_high = scan_mrf_low + 4;
2169             } else if (dispatch_width == 16 &&
2170                        (!scan_inst->force_uncompressed &&
2171                         !scan_inst->force_sechalf)) {
2172                scan_mrf_high = scan_mrf_low + 1;
2173             } else {
2174                scan_mrf_high = scan_mrf_low;
2175             }
2176
2177             if (mrf_low == scan_mrf_low ||
2178                 mrf_low == scan_mrf_high ||
2179                 mrf_high == scan_mrf_low ||
2180                 mrf_high == scan_mrf_high) {
2181                break;
2182             }
2183          }
2184
2185          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2186             /* Found a SEND instruction, which means that there are
2187              * live values in MRFs from base_mrf to base_mrf +
2188              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2189              * above it.
2190              */
2191             if (mrf_low >= scan_inst->base_mrf &&
2192                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2193                break;
2194             }
2195             if (mrf_high >= scan_inst->base_mrf &&
2196                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2197                break;
2198             }
2199          }
2200       }
2201    }
2202
2203    if (progress)
2204       invalidate_live_intervals();
2205
2206    return progress;
2207 }
2208
2209 /**
2210  * Walks through basic blocks, looking for repeated MRF writes and
2211  * removing the later ones.
2212  */
2213 bool
2214 fs_visitor::remove_duplicate_mrf_writes()
2215 {
2216    fs_inst *last_mrf_move[16];
2217    bool progress = false;
2218
2219    /* Need to update the MRF tracking for compressed instructions. */
2220    if (dispatch_width == 16)
2221       return false;
2222
2223    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2224
2225    foreach_list_safe(node, &this->instructions) {
2226       fs_inst *inst = (fs_inst *)node;
2227
2228       if (inst->is_control_flow()) {
2229          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2230       }
2231
2232       if (inst->opcode == BRW_OPCODE_MOV &&
2233           inst->dst.file == MRF) {
2234          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2235          if (prev_inst && inst->equals(prev_inst)) {
2236             inst->remove();
2237             progress = true;
2238             continue;
2239          }
2240       }
2241
2242       /* Clear out the last-write records for MRFs that were overwritten. */
2243       if (inst->dst.file == MRF) {
2244          last_mrf_move[inst->dst.reg] = NULL;
2245       }
2246
2247       if (inst->mlen > 0 && inst->base_mrf != -1) {
2248          /* Found a SEND instruction, which will include two or fewer
2249           * implied MRF writes.  We could do better here.
2250           */
2251          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2252             last_mrf_move[inst->base_mrf + i] = NULL;
2253          }
2254       }
2255
2256       /* Clear out any MRF move records whose sources got overwritten. */
2257       if (inst->dst.file == GRF) {
2258          for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2259             if (last_mrf_move[i] &&
2260                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2261                last_mrf_move[i] = NULL;
2262             }
2263          }
2264       }
2265
2266       if (inst->opcode == BRW_OPCODE_MOV &&
2267           inst->dst.file == MRF &&
2268           inst->src[0].file == GRF &&
2269           !inst->is_partial_write()) {
2270          last_mrf_move[inst->dst.reg] = inst;
2271       }
2272    }
2273
2274    if (progress)
2275       invalidate_live_intervals();
2276
2277    return progress;
2278 }
2279
2280 static void
2281 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2282                         int first_grf, int grf_len)
2283 {
2284    bool inst_simd16 = (dispatch_width > 8 &&
2285                        !inst->force_uncompressed &&
2286                        !inst->force_sechalf);
2287
2288    /* Clear the flag for registers that actually got read (as expected). */
2289    for (int i = 0; i < inst->sources; i++) {
2290       int grf;
2291       if (inst->src[i].file == GRF) {
2292          grf = inst->src[i].reg;
2293       } else if (inst->src[i].file == HW_REG &&
2294                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2295          grf = inst->src[i].fixed_hw_reg.nr;
2296       } else {
2297          continue;
2298       }
2299
2300       if (grf >= first_grf &&
2301           grf < first_grf + grf_len) {
2302          deps[grf - first_grf] = false;
2303          if (inst_simd16)
2304             deps[grf - first_grf + 1] = false;
2305       }
2306    }
2307 }
2308
2309 /**
2310  * Implements this workaround for the original 965:
2311  *
2312  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2313  *      check for post destination dependencies on this instruction, software
2314  *      must ensure that there is no destination hazard for the case of ‘write
2315  *      followed by a posted write’ shown in the following example.
2316  *
2317  *      1. mov r3 0
2318  *      2. send r3.xy <rest of send instruction>
2319  *      3. mov r2 r3
2320  *
2321  *      Due to no post-destination dependency check on the ‘send’, the above
2322  *      code sequence could have two instructions (1 and 2) in flight at the
2323  *      same time that both consider ‘r3’ as the target of their final writes.
2324  */
2325 void
2326 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2327 {
2328    int reg_size = dispatch_width / 8;
2329    int write_len = inst->regs_written * reg_size;
2330    int first_write_grf = inst->dst.reg;
2331    bool needs_dep[BRW_MAX_MRF];
2332    assert(write_len < (int)sizeof(needs_dep) - 1);
2333
2334    memset(needs_dep, false, sizeof(needs_dep));
2335    memset(needs_dep, true, write_len);
2336
2337    clear_deps_for_inst_src(inst, dispatch_width,
2338                            needs_dep, first_write_grf, write_len);
2339
2340    /* Walk backwards looking for writes to registers we're writing which
2341     * aren't read since being written.  If we hit the start of the program,
2342     * we assume that there are no outstanding dependencies on entry to the
2343     * program.
2344     */
2345    for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2346         scan_inst != NULL;
2347         scan_inst = (fs_inst *)scan_inst->prev) {
2348
2349       /* If we hit control flow, assume that there *are* outstanding
2350        * dependencies, and force their cleanup before our instruction.
2351        */
2352       if (scan_inst->is_control_flow()) {
2353          for (int i = 0; i < write_len; i++) {
2354             if (needs_dep[i]) {
2355                inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2356             }
2357          }
2358          return;
2359       }
2360
2361       bool scan_inst_simd16 = (dispatch_width > 8 &&
2362                                !scan_inst->force_uncompressed &&
2363                                !scan_inst->force_sechalf);
2364
2365       /* We insert our reads as late as possible on the assumption that any
2366        * instruction but a MOV that might have left us an outstanding
2367        * dependency has more latency than a MOV.
2368        */
2369       if (scan_inst->dst.file == GRF) {
2370          for (int i = 0; i < scan_inst->regs_written; i++) {
2371             int reg = scan_inst->dst.reg + i * reg_size;
2372
2373             if (reg >= first_write_grf &&
2374                 reg < first_write_grf + write_len &&
2375                 needs_dep[reg - first_write_grf]) {
2376                inst->insert_before(DEP_RESOLVE_MOV(reg));
2377                needs_dep[reg - first_write_grf] = false;
2378                if (scan_inst_simd16)
2379                   needs_dep[reg - first_write_grf + 1] = false;
2380             }
2381          }
2382       }
2383
2384       /* Clear the flag for registers that actually got read (as expected). */
2385       clear_deps_for_inst_src(scan_inst, dispatch_width,
2386                               needs_dep, first_write_grf, write_len);
2387
2388       /* Continue the loop only if we haven't resolved all the dependencies */
2389       int i;
2390       for (i = 0; i < write_len; i++) {
2391          if (needs_dep[i])
2392             break;
2393       }
2394       if (i == write_len)
2395          return;
2396    }
2397 }
2398
2399 /**
2400  * Implements this workaround for the original 965:
2401  *
2402  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2403  *      used as a destination register until after it has been sourced by an
2404  *      instruction with a different destination register.
2405  */
2406 void
2407 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2408 {
2409    int write_len = inst->regs_written * dispatch_width / 8;
2410    int first_write_grf = inst->dst.reg;
2411    bool needs_dep[BRW_MAX_MRF];
2412    assert(write_len < (int)sizeof(needs_dep) - 1);
2413
2414    memset(needs_dep, false, sizeof(needs_dep));
2415    memset(needs_dep, true, write_len);
2416    /* Walk forwards looking for writes to registers we're writing which aren't
2417     * read before being written.
2418     */
2419    for (fs_inst *scan_inst = (fs_inst *)inst->next;
2420         !scan_inst->is_tail_sentinel();
2421         scan_inst = (fs_inst *)scan_inst->next) {
2422       /* If we hit control flow, force resolve all remaining dependencies. */
2423       if (scan_inst->is_control_flow()) {
2424          for (int i = 0; i < write_len; i++) {
2425             if (needs_dep[i])
2426                scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2427          }
2428          return;
2429       }
2430
2431       /* Clear the flag for registers that actually got read (as expected). */
2432       clear_deps_for_inst_src(scan_inst, dispatch_width,
2433                               needs_dep, first_write_grf, write_len);
2434
2435       /* We insert our reads as late as possible since they're reading the
2436        * result of a SEND, which has massive latency.
2437        */
2438       if (scan_inst->dst.file == GRF &&
2439           scan_inst->dst.reg >= first_write_grf &&
2440           scan_inst->dst.reg < first_write_grf + write_len &&
2441           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2442          scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2443          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2444       }
2445
2446       /* Continue the loop only if we haven't resolved all the dependencies */
2447       int i;
2448       for (i = 0; i < write_len; i++) {
2449          if (needs_dep[i])
2450             break;
2451       }
2452       if (i == write_len)
2453          return;
2454    }
2455
2456    /* If we hit the end of the program, resolve all remaining dependencies out
2457     * of paranoia.
2458     */
2459    fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2460    assert(last_inst->eot);
2461    for (int i = 0; i < write_len; i++) {
2462       if (needs_dep[i])
2463          last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2464    }
2465 }
2466
2467 void
2468 fs_visitor::insert_gen4_send_dependency_workarounds()
2469 {
2470    if (brw->gen != 4 || brw->is_g4x)
2471       return;
2472
2473    /* Note that we're done with register allocation, so GRF fs_regs always
2474     * have a .reg_offset of 0.
2475     */
2476
2477    foreach_list_safe(node, &this->instructions) {
2478       fs_inst *inst = (fs_inst *)node;
2479
2480       if (inst->mlen != 0 && inst->dst.file == GRF) {
2481          insert_gen4_pre_send_dependency_workarounds(inst);
2482          insert_gen4_post_send_dependency_workarounds(inst);
2483       }
2484    }
2485 }
2486
2487 /**
2488  * Turns the generic expression-style uniform pull constant load instruction
2489  * into a hardware-specific series of instructions for loading a pull
2490  * constant.
2491  *
2492  * The expression style allows the CSE pass before this to optimize out
2493  * repeated loads from the same offset, and gives the pre-register-allocation
2494  * scheduling full flexibility, while the conversion to native instructions
2495  * allows the post-register-allocation scheduler the best information
2496  * possible.
2497  *
2498  * Note that execution masking for setting up pull constant loads is special:
2499  * the channels that need to be written are unrelated to the current execution
2500  * mask, since a later instruction will use one of the result channels as a
2501  * source operand for all 8 or 16 of its channels.
2502  */
2503 void
2504 fs_visitor::lower_uniform_pull_constant_loads()
2505 {
2506    foreach_list(node, &this->instructions) {
2507       fs_inst *inst = (fs_inst *)node;
2508
2509       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2510          continue;
2511
2512       if (brw->gen >= 7) {
2513          /* The offset arg before was a vec4-aligned byte offset.  We need to
2514           * turn it into a dword offset.
2515           */
2516          fs_reg const_offset_reg = inst->src[1];
2517          assert(const_offset_reg.file == IMM &&
2518                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2519          const_offset_reg.imm.u /= 4;
2520          fs_reg payload = fs_reg(this, glsl_type::uint_type);
2521
2522          /* This is actually going to be a MOV, but since only the first dword
2523           * is accessed, we have a special opcode to do just that one.  Note
2524           * that this needs to be an operation that will be considered a def
2525           * by live variable analysis, or register allocation will explode.
2526           */
2527          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2528                                                payload, const_offset_reg);
2529          setup->force_writemask_all = true;
2530
2531          setup->ir = inst->ir;
2532          setup->annotation = inst->annotation;
2533          inst->insert_before(setup);
2534
2535          /* Similarly, this will only populate the first 4 channels of the
2536           * result register (since we only use smear values from 0-3), but we
2537           * don't tell the optimizer.
2538           */
2539          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2540          inst->src[1] = payload;
2541
2542          invalidate_live_intervals();
2543       } else {
2544          /* Before register allocation, we didn't tell the scheduler about the
2545           * MRF we use.  We know it's safe to use this MRF because nothing
2546           * else does except for register spill/unspill, which generates and
2547           * uses its MRF within a single IR instruction.
2548           */
2549          inst->base_mrf = 14;
2550          inst->mlen = 1;
2551       }
2552    }
2553 }
2554
2555 void
2556 fs_visitor::dump_instructions()
2557 {
2558    dump_instructions(NULL);
2559 }
2560
2561 void
2562 fs_visitor::dump_instructions(const char *name)
2563 {
2564    calculate_register_pressure();
2565    FILE *file = stderr;
2566    if (name && geteuid() != 0) {
2567       file = fopen(name, "w");
2568       if (!file)
2569          file = stderr;
2570    }
2571
2572    int ip = 0, max_pressure = 0;
2573    foreach_list(node, &this->instructions) {
2574       backend_instruction *inst = (backend_instruction *)node;
2575       max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2576       fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2577       dump_instruction(inst, file);
2578       ++ip;
2579    }
2580    fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
2581
2582    if (file != stderr) {
2583       fclose(file);
2584    }
2585 }
2586
2587 void
2588 fs_visitor::dump_instruction(backend_instruction *be_inst)
2589 {
2590    dump_instruction(be_inst, stderr);
2591 }
2592
2593 void
2594 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
2595 {
2596    fs_inst *inst = (fs_inst *)be_inst;
2597
2598    if (inst->predicate) {
2599       fprintf(file, "(%cf0.%d) ",
2600              inst->predicate_inverse ? '-' : '+',
2601              inst->flag_subreg);
2602    }
2603
2604    fprintf(file, "%s", brw_instruction_name(inst->opcode));
2605    if (inst->saturate)
2606       fprintf(file, ".sat");
2607    if (inst->conditional_mod) {
2608       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
2609       if (!inst->predicate &&
2610           (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2611                               inst->opcode != BRW_OPCODE_IF &&
2612                               inst->opcode != BRW_OPCODE_WHILE))) {
2613          fprintf(file, ".f0.%d", inst->flag_subreg);
2614       }
2615    }
2616    fprintf(file, " ");
2617
2618
2619    switch (inst->dst.file) {
2620    case GRF:
2621       fprintf(file, "vgrf%d", inst->dst.reg);
2622       if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2623           inst->dst.subreg_offset)
2624          fprintf(file, "+%d.%d",
2625                  inst->dst.reg_offset, inst->dst.subreg_offset);
2626       break;
2627    case MRF:
2628       fprintf(file, "m%d", inst->dst.reg);
2629       break;
2630    case BAD_FILE:
2631       fprintf(file, "(null)");
2632       break;
2633    case UNIFORM:
2634       fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
2635       break;
2636    case HW_REG:
2637       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2638          switch (inst->dst.fixed_hw_reg.nr) {
2639          case BRW_ARF_NULL:
2640             fprintf(file, "null");
2641             break;
2642          case BRW_ARF_ADDRESS:
2643             fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
2644             break;
2645          case BRW_ARF_ACCUMULATOR:
2646             fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
2647             break;
2648          case BRW_ARF_FLAG:
2649             fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2650                              inst->dst.fixed_hw_reg.subnr);
2651             break;
2652          default:
2653             fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2654                                inst->dst.fixed_hw_reg.subnr);
2655             break;
2656          }
2657       } else {
2658          fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
2659       }
2660       if (inst->dst.fixed_hw_reg.subnr)
2661          fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
2662       break;
2663    default:
2664       fprintf(file, "???");
2665       break;
2666    }
2667    fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
2668
2669    for (int i = 0; i < inst->sources && inst->src[i].file != BAD_FILE; i++) {
2670       if (inst->src[i].negate)
2671          fprintf(file, "-");
2672       if (inst->src[i].abs)
2673          fprintf(file, "|");
2674       switch (inst->src[i].file) {
2675       case GRF:
2676          fprintf(file, "vgrf%d", inst->src[i].reg);
2677          if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2678              inst->src[i].subreg_offset)
2679             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2680                     inst->src[i].subreg_offset);
2681          break;
2682       case MRF:
2683          fprintf(file, "***m%d***", inst->src[i].reg);
2684          break;
2685       case UNIFORM:
2686          fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
2687          if (inst->src[i].reladdr) {
2688             fprintf(file, "+reladdr");
2689          } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2690              inst->src[i].subreg_offset) {
2691             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2692                     inst->src[i].subreg_offset);
2693          }
2694          break;
2695       case BAD_FILE:
2696          fprintf(file, "(null)");
2697          break;
2698       case IMM:
2699          switch (inst->src[i].type) {
2700          case BRW_REGISTER_TYPE_F:
2701             fprintf(file, "%ff", inst->src[i].imm.f);
2702             break;
2703          case BRW_REGISTER_TYPE_D:
2704             fprintf(file, "%dd", inst->src[i].imm.i);
2705             break;
2706          case BRW_REGISTER_TYPE_UD:
2707             fprintf(file, "%uu", inst->src[i].imm.u);
2708             break;
2709          default:
2710             fprintf(file, "???");
2711             break;
2712          }
2713          break;
2714       case HW_REG:
2715          if (inst->src[i].fixed_hw_reg.negate)
2716             fprintf(file, "-");
2717          if (inst->src[i].fixed_hw_reg.abs)
2718             fprintf(file, "|");
2719          if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2720             switch (inst->src[i].fixed_hw_reg.nr) {
2721             case BRW_ARF_NULL:
2722                fprintf(file, "null");
2723                break;
2724             case BRW_ARF_ADDRESS:
2725                fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
2726                break;
2727             case BRW_ARF_ACCUMULATOR:
2728                fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
2729                break;
2730             case BRW_ARF_FLAG:
2731                fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2732                                 inst->src[i].fixed_hw_reg.subnr);
2733                break;
2734             default:
2735                fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2736                                   inst->src[i].fixed_hw_reg.subnr);
2737                break;
2738             }
2739          } else {
2740             fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2741          }
2742          if (inst->src[i].fixed_hw_reg.subnr)
2743             fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
2744          if (inst->src[i].fixed_hw_reg.abs)
2745             fprintf(file, "|");
2746          break;
2747       default:
2748          fprintf(file, "???");
2749          break;
2750       }
2751       if (inst->src[i].abs)
2752          fprintf(file, "|");
2753
2754       if (inst->src[i].file != IMM) {
2755          fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
2756       }
2757
2758       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
2759          fprintf(file, ", ");
2760    }
2761
2762    fprintf(file, " ");
2763
2764    if (inst->force_uncompressed)
2765       fprintf(file, "1sthalf ");
2766
2767    if (inst->force_sechalf)
2768       fprintf(file, "2ndhalf ");
2769
2770    fprintf(file, "\n");
2771 }
2772
2773 /**
2774  * Possibly returns an instruction that set up @param reg.
2775  *
2776  * Sometimes we want to take the result of some expression/variable
2777  * dereference tree and rewrite the instruction generating the result
2778  * of the tree.  When processing the tree, we know that the
2779  * instructions generated are all writing temporaries that are dead
2780  * outside of this tree.  So, if we have some instructions that write
2781  * a temporary, we're free to point that temp write somewhere else.
2782  *
2783  * Note that this doesn't guarantee that the instruction generated
2784  * only reg -- it might be the size=4 destination of a texture instruction.
2785  */
2786 fs_inst *
2787 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2788                                            fs_inst *end,
2789                                            const fs_reg &reg)
2790 {
2791    if (end == start ||
2792        end->is_partial_write() ||
2793        reg.reladdr ||
2794        !reg.equals(end->dst)) {
2795       return NULL;
2796    } else {
2797       return end;
2798    }
2799 }
2800
2801 void
2802 fs_visitor::setup_payload_gen6()
2803 {
2804    bool uses_depth =
2805       (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2806    unsigned barycentric_interp_modes = prog_data->barycentric_interp_modes;
2807
2808    assert(brw->gen >= 6);
2809
2810    /* R0-1: masks, pixel X/Y coordinates. */
2811    payload.num_regs = 2;
2812    /* R2: only for 32-pixel dispatch.*/
2813
2814    /* R3-26: barycentric interpolation coordinates.  These appear in the
2815     * same order that they appear in the brw_wm_barycentric_interp_mode
2816     * enum.  Each set of coordinates occupies 2 registers if dispatch width
2817     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
2818     * appear if they were enabled using the "Barycentric Interpolation
2819     * Mode" bits in WM_STATE.
2820     */
2821    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2822       if (barycentric_interp_modes & (1 << i)) {
2823          payload.barycentric_coord_reg[i] = payload.num_regs;
2824          payload.num_regs += 2;
2825          if (dispatch_width == 16) {
2826             payload.num_regs += 2;
2827          }
2828       }
2829    }
2830
2831    /* R27: interpolated depth if uses source depth */
2832    if (uses_depth) {
2833       payload.source_depth_reg = payload.num_regs;
2834       payload.num_regs++;
2835       if (dispatch_width == 16) {
2836          /* R28: interpolated depth if not SIMD8. */
2837          payload.num_regs++;
2838       }
2839    }
2840    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2841    if (uses_depth) {
2842       payload.source_w_reg = payload.num_regs;
2843       payload.num_regs++;
2844       if (dispatch_width == 16) {
2845          /* R30: interpolated W if not SIMD8. */
2846          payload.num_regs++;
2847       }
2848    }
2849
2850    prog_data->uses_pos_offset = key->compute_pos_offset;
2851    /* R31: MSAA position offsets. */
2852    if (prog_data->uses_pos_offset) {
2853       payload.sample_pos_reg = payload.num_regs;
2854       payload.num_regs++;
2855    }
2856
2857    /* R32: MSAA input coverage mask */
2858    if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
2859       assert(brw->gen >= 7);
2860       payload.sample_mask_in_reg = payload.num_regs;
2861       payload.num_regs++;
2862       if (dispatch_width == 16) {
2863          /* R33: input coverage mask if not SIMD8. */
2864          payload.num_regs++;
2865       }
2866    }
2867
2868    /* R34-: bary for 32-pixel. */
2869    /* R58-59: interp W for 32-pixel. */
2870
2871    if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2872       source_depth_to_render_target = true;
2873    }
2874 }
2875
2876 void
2877 fs_visitor::assign_binding_table_offsets()
2878 {
2879    uint32_t next_binding_table_offset = 0;
2880
2881    /* If there are no color regions, we still perform an FB write to a null
2882     * renderbuffer, which we place at surface index 0.
2883     */
2884    prog_data->binding_table.render_target_start = next_binding_table_offset;
2885    next_binding_table_offset += MAX2(key->nr_color_regions, 1);
2886
2887    assign_common_binding_table_offsets(next_binding_table_offset);
2888 }
2889
2890 void
2891 fs_visitor::calculate_register_pressure()
2892 {
2893    invalidate_live_intervals();
2894    calculate_live_intervals();
2895
2896    int num_instructions = 0;
2897    foreach_list(node, &this->instructions) {
2898       ++num_instructions;
2899    }
2900
2901    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
2902
2903    for (int reg = 0; reg < virtual_grf_count; reg++) {
2904       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
2905          regs_live_at_ip[ip] += virtual_grf_sizes[reg];
2906    }
2907 }
2908
2909 /**
2910  * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
2911  *
2912  * The needs_unlit_centroid_workaround ends up producing one of these per
2913  * channel of centroid input, so it's good to clean them up.
2914  *
2915  * An assumption here is that nothing ever modifies the dispatched pixels
2916  * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
2917  * dictates that anyway.
2918  */
2919 void
2920 fs_visitor::opt_drop_redundant_mov_to_flags()
2921 {
2922    bool flag_mov_found[2] = {false};
2923
2924    foreach_list_safe(node, &this->instructions) {
2925       fs_inst *inst = (fs_inst *)node;
2926
2927       if (inst->is_control_flow()) {
2928          memset(flag_mov_found, 0, sizeof(flag_mov_found));
2929       } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
2930          if (!flag_mov_found[inst->flag_subreg])
2931             flag_mov_found[inst->flag_subreg] = true;
2932          else
2933             inst->remove();
2934       } else if (inst->writes_flag()) {
2935          flag_mov_found[inst->flag_subreg] = false;
2936       }
2937    }
2938 }
2939
2940 bool
2941 fs_visitor::run()
2942 {
2943    sanity_param_count = fp->Base.Parameters->NumParameters;
2944    bool allocated_without_spills;
2945
2946    assign_binding_table_offsets();
2947
2948    if (brw->gen >= 6)
2949       setup_payload_gen6();
2950    else
2951       setup_payload_gen4();
2952
2953    if (0) {
2954       emit_dummy_fs();
2955    } else {
2956       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2957          emit_shader_time_begin();
2958
2959       calculate_urb_setup();
2960       if (fp->Base.InputsRead > 0) {
2961          if (brw->gen < 6)
2962             emit_interpolation_setup_gen4();
2963          else
2964             emit_interpolation_setup_gen6();
2965       }
2966
2967       /* We handle discards by keeping track of the still-live pixels in f0.1.
2968        * Initialize it with the dispatched pixels.
2969        */
2970       if (fp->UsesKill || key->alpha_test_func) {
2971          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2972          discard_init->flag_subreg = 1;
2973       }
2974
2975       /* Generate FS IR for main().  (the visitor only descends into
2976        * functions called "main").
2977        */
2978       if (shader) {
2979          foreach_list(node, &*shader->base.ir) {
2980             ir_instruction *ir = (ir_instruction *)node;
2981             base_ir = ir;
2982             this->result = reg_undef;
2983             ir->accept(this);
2984          }
2985       } else {
2986          emit_fragment_program_code();
2987       }
2988       base_ir = NULL;
2989       if (failed)
2990          return false;
2991
2992       emit(FS_OPCODE_PLACEHOLDER_HALT);
2993
2994       if (key->alpha_test_func)
2995          emit_alpha_test();
2996
2997       emit_fb_writes();
2998
2999       split_virtual_grfs();
3000
3001       move_uniform_array_access_to_pull_constants();
3002       assign_constant_locations();
3003       demote_pull_constants();
3004
3005       opt_drop_redundant_mov_to_flags();
3006
3007 #define OPT(pass, args...) do {                                            \
3008       pass_num++;                                                          \
3009       bool this_progress = pass(args);                                     \
3010                                                                            \
3011       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {      \
3012          char filename[64];                                                \
3013          snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass,              \
3014                   dispatch_width, shader_prog->Name, iteration, pass_num); \
3015                                                                            \
3016          backend_visitor::dump_instructions(filename);                     \
3017       }                                                                    \
3018                                                                            \
3019       progress = progress || this_progress;                                \
3020    } while (false)
3021
3022       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3023          char filename[64];
3024          snprintf(filename, 64, "fs%d-%04d-00-start",
3025                   dispatch_width, shader_prog->Name);
3026
3027          backend_visitor::dump_instructions(filename);
3028       }
3029
3030       bool progress;
3031       int iteration = 0;
3032       do {
3033          progress = false;
3034          iteration++;
3035          int pass_num = 0;
3036
3037          compact_virtual_grfs();
3038
3039          OPT(remove_duplicate_mrf_writes);
3040
3041          OPT(opt_algebraic);
3042          OPT(opt_cse);
3043          OPT(opt_copy_propagate);
3044          OPT(opt_peephole_predicated_break);
3045          OPT(dead_code_eliminate);
3046          OPT(opt_peephole_sel);
3047          OPT(dead_control_flow_eliminate, this);
3048          OPT(opt_saturate_propagation);
3049          OPT(register_coalesce);
3050          OPT(compute_to_mrf);
3051       } while (progress);
3052
3053       lower_uniform_pull_constant_loads();
3054
3055       assign_curb_setup();
3056       assign_urb_setup();
3057
3058       static enum instruction_scheduler_mode pre_modes[] = {
3059          SCHEDULE_PRE,
3060          SCHEDULE_PRE_NON_LIFO,
3061          SCHEDULE_PRE_LIFO,
3062       };
3063
3064       /* Try each scheduling heuristic to see if it can successfully register
3065        * allocate without spilling.  They should be ordered by decreasing
3066        * performance but increasing likelihood of allocating.
3067        */
3068       for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3069          schedule_instructions(pre_modes[i]);
3070
3071          if (0) {
3072             assign_regs_trivial();
3073             allocated_without_spills = true;
3074          } else {
3075             allocated_without_spills = assign_regs(false);
3076          }
3077          if (allocated_without_spills)
3078             break;
3079       }
3080
3081       if (!allocated_without_spills) {
3082          /* We assume that any spilling is worse than just dropping back to
3083           * SIMD8.  There's probably actually some intermediate point where
3084           * SIMD16 with a couple of spills is still better.
3085           */
3086          if (dispatch_width == 16) {
3087             fail("Failure to register allocate.  Reduce number of "
3088                  "live scalar values to avoid this.");
3089          } else {
3090             perf_debug("Fragment shader triggered register spilling.  "
3091                        "Try reducing the number of live scalar values to "
3092                        "improve performance.\n");
3093          }
3094
3095          /* Since we're out of heuristics, just go spill registers until we
3096           * get an allocation.
3097           */
3098          while (!assign_regs(true)) {
3099             if (failed)
3100                break;
3101          }
3102       }
3103    }
3104    assert(force_uncompressed_stack == 0);
3105
3106    /* This must come after all optimization and register allocation, since
3107     * it inserts dead code that happens to have side effects, and it does
3108     * so based on the actual physical registers in use.
3109     */
3110    insert_gen4_send_dependency_workarounds();
3111
3112    if (failed)
3113       return false;
3114
3115    if (!allocated_without_spills)
3116       schedule_instructions(SCHEDULE_POST);
3117
3118    if (last_scratch > 0) {
3119       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3120    }
3121
3122    if (dispatch_width == 8)
3123       prog_data->reg_blocks = brw_register_blocks(grf_used);
3124    else
3125       prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3126
3127    /* If any state parameters were appended, then ParameterValues could have
3128     * been realloced, in which case the driver uniform storage set up by
3129     * _mesa_associate_uniform_storage() would point to freed memory.  Make
3130     * sure that didn't happen.
3131     */
3132    assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3133
3134    return !failed;
3135 }
3136
3137 const unsigned *
3138 brw_wm_fs_emit(struct brw_context *brw,
3139                void *mem_ctx,
3140                const struct brw_wm_prog_key *key,
3141                struct brw_wm_prog_data *prog_data,
3142                struct gl_fragment_program *fp,
3143                struct gl_shader_program *prog,
3144                unsigned *final_assembly_size)
3145 {
3146    bool start_busy = false;
3147    double start_time = 0;
3148
3149    if (unlikely(brw->perf_debug)) {
3150       start_busy = (brw->batch.last_bo &&
3151                     drm_intel_bo_busy(brw->batch.last_bo));
3152       start_time = get_time();
3153    }
3154
3155    struct brw_shader *shader = NULL;
3156    if (prog)
3157       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3158
3159    if (unlikely(INTEL_DEBUG & DEBUG_WM))
3160       brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3161
3162    /* Now the main event: Visit the shader IR and generate our FS IR for it.
3163     */
3164    fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3165    if (!v.run()) {
3166       if (prog) {
3167          prog->LinkStatus = false;
3168          ralloc_strcat(&prog->InfoLog, v.fail_msg);
3169       }
3170
3171       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3172                     v.fail_msg);
3173
3174       return NULL;
3175    }
3176
3177    exec_list *simd16_instructions = NULL;
3178    fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3179    if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3180       if (!v.simd16_unsupported) {
3181          /* Try a SIMD16 compile */
3182          v2.import_uniforms(&v);
3183          if (!v2.run()) {
3184             perf_debug("SIMD16 shader failed to compile, falling back to "
3185                        "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3186          } else {
3187             simd16_instructions = &v2.instructions;
3188          }
3189       } else {
3190          perf_debug("SIMD16 shader unsupported, falling back to "
3191                     "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3192       }
3193    }
3194
3195    const unsigned *assembly = NULL;
3196    if (brw->gen >= 8) {
3197       gen8_fs_generator g(brw, mem_ctx, key, prog_data, prog, fp, v.do_dual_src);
3198       assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3199                                      final_assembly_size);
3200    } else {
3201       fs_generator g(brw, mem_ctx, key, prog_data, prog, fp, v.do_dual_src,
3202                      INTEL_DEBUG & DEBUG_WM);
3203       assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3204                                      final_assembly_size);
3205    }
3206
3207    if (unlikely(brw->perf_debug) && shader) {
3208       if (shader->compiled_once)
3209          brw_wm_debug_recompile(brw, prog, key);
3210       shader->compiled_once = true;
3211
3212       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3213          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3214                     (get_time() - start_time) * 1000);
3215       }
3216    }
3217
3218    return assembly;
3219 }
3220
3221 bool
3222 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3223 {
3224    struct brw_context *brw = brw_context(ctx);
3225    struct brw_wm_prog_key key;
3226
3227    if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3228       return true;
3229
3230    struct gl_fragment_program *fp = (struct gl_fragment_program *)
3231       prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3232    struct brw_fragment_program *bfp = brw_fragment_program(fp);
3233    bool program_uses_dfdy = fp->UsesDFdy;
3234
3235    memset(&key, 0, sizeof(key));
3236
3237    if (brw->gen < 6) {
3238       if (fp->UsesKill)
3239          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3240
3241       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3242          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3243
3244       /* Just assume depth testing. */
3245       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3246       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3247    }
3248
3249    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3250                                          BRW_FS_VARYING_INPUT_MASK) > 16)
3251       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3252
3253    unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3254    for (unsigned i = 0; i < sampler_count; i++) {
3255       if (fp->Base.ShadowSamplers & (1 << i)) {
3256          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3257          key.tex.swizzles[i] =
3258             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3259       } else {
3260          /* Color sampler: assume no swizzling. */
3261          key.tex.swizzles[i] = SWIZZLE_XYZW;
3262       }
3263    }
3264
3265    if (fp->Base.InputsRead & VARYING_BIT_POS) {
3266       key.drawable_height = ctx->DrawBuffer->Height;
3267    }
3268
3269    key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3270          ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3271          BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3272
3273    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3274       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3275                           key.nr_color_regions > 1;
3276    }
3277
3278    /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE.  The
3279     * quality of the derivatives is likely to be determined by the driconf
3280     * option.
3281     */
3282    key.high_quality_derivatives = brw->disable_derivative_optimization;
3283
3284    key.program_string_id = bfp->id;
3285
3286    uint32_t old_prog_offset = brw->wm.base.prog_offset;
3287    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3288
3289    bool success = do_wm_prog(brw, prog, bfp, &key);
3290
3291    brw->wm.base.prog_offset = old_prog_offset;
3292    brw->wm.prog_data = old_prog_data;
3293
3294    return success;
3295 }