1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67
68 this->writes_accumulator = false;
69 }
70
71 fs_inst::fs_inst()
72 {
73 init();
74 this->opcode = BRW_OPCODE_NOP;
75 }
76
77 fs_inst::fs_inst(enum opcode opcode)
78 {
79 init();
80 this->opcode = opcode;
81 }
82
83 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
84 {
85 init();
86 this->opcode = opcode;
87 this->dst = dst;
88
89 if (dst.file == GRF)
90 assert(dst.reg_offset >= 0);
91 }
92
93 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
94 {
95 init();
96 this->opcode = opcode;
97 this->dst = dst;
98 this->src[0] = src0;
99
100 if (dst.file == GRF)
101 assert(dst.reg_offset >= 0);
102 if (src[0].file == GRF)
103 assert(src[0].reg_offset >= 0);
104 }
105
106 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
107 {
108 init();
109 this->opcode = opcode;
110 this->dst = dst;
111 this->src[0] = src0;
112 this->src[1] = src1;
113
114 if (dst.file == GRF)
115 assert(dst.reg_offset >= 0);
116 if (src[0].file == GRF)
117 assert(src[0].reg_offset >= 0);
118 if (src[1].file == GRF)
119 assert(src[1].reg_offset >= 0);
120 }
121
122 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
123 fs_reg src0, fs_reg src1, fs_reg src2)
124 {
125 init();
126 this->opcode = opcode;
127 this->dst = dst;
128 this->src[0] = src0;
129 this->src[1] = src1;
130 this->src[2] = src2;
131
132 if (dst.file == GRF)
133 assert(dst.reg_offset >= 0);
134 if (src[0].file == GRF)
135 assert(src[0].reg_offset >= 0);
136 if (src[1].file == GRF)
137 assert(src[1].reg_offset >= 0);
138 if (src[2].file == GRF)
139 assert(src[2].reg_offset >= 0);
140 }
141
142 #define ALU1(op) \
143 fs_inst * \
144 fs_visitor::op(fs_reg dst, fs_reg src0) \
145 { \
146 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
147 }
148
149 #define ALU2(op) \
150 fs_inst * \
151 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
152 { \
153 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
154 }
155
156 #define ALU2_ACC(op) \
157 fs_inst * \
158 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
159 { \
160 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
161 inst->writes_accumulator = true; \
162 return inst; \
163 }
164
165 #define ALU3(op) \
166 fs_inst * \
167 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
168 { \
169 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
170 }
171
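/* Each of the expansions below defines an fs_visitor builder helper named
 * after its opcode (fs_visitor::NOT(), fs_visitor::MOV(), ...).  The helper
 * only constructs an fs_inst out of mem_ctx; the caller still has to emit() it.
 */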
172 ALU1(NOT)
173 ALU1(MOV)
174 ALU1(FRC)
175 ALU1(RNDD)
176 ALU1(RNDE)
177 ALU1(RNDZ)
178 ALU2(ADD)
179 ALU2(MUL)
180 ALU2_ACC(MACH)
181 ALU2(AND)
182 ALU2(OR)
183 ALU2(XOR)
184 ALU2(SHL)
185 ALU2(SHR)
186 ALU2(ASR)
187 ALU3(LRP)
188 ALU1(BFREV)
189 ALU3(BFE)
190 ALU2(BFI1)
191 ALU3(BFI2)
192 ALU1(FBH)
193 ALU1(FBL)
194 ALU1(CBIT)
195 ALU3(MAD)
196 ALU2_ACC(ADDC)
197 ALU2_ACC(SUBB)
198 ALU2(SEL)
199 ALU2(MAC)
200
201 /** Gen4 predicated IF. */
202 fs_inst *
203 fs_visitor::IF(uint32_t predicate)
204 {
205 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
206 inst->predicate = predicate;
207 return inst;
208 }
209
210 /** Gen6 IF with embedded comparison. */
211 fs_inst *
212 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
213 {
214 assert(brw->gen == 6);
215 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
216 reg_null_d, src0, src1);
217 inst->conditional_mod = condition;
218 return inst;
219 }
220
221 /**
222 * CMP: Sets the low bit of the destination channels with the result
223 * of the comparison, while the upper bits are undefined, and updates
224 * the flag register with the packed 16 bits of the result.
225 */
226 fs_inst *
227 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
228 {
229 fs_inst *inst;
230
231 /* Take the instruction:
232 *
233 * CMP null<d> src0<f> src1<f>
234 *
235 * Original gen4 does type conversion to the destination type before
236 * comparison, producing garbage results for floating point comparisons.
237 * gen5 does the comparison on the execution type (resolved source types),
238 * so dst type doesn't matter. gen6 does comparison and then uses the
239 * result as if it was the dst type with no conversion, which happens to
240 * mostly work out for float-interpreted-as-int since our comparisons are
241 * for >0, =0, <0.
242 */
243 if (brw->gen == 4) {
244 dst.type = src0.type;
245 if (dst.file == HW_REG)
246 dst.fixed_hw_reg.type = dst.type;
247 }
248
249 resolve_ud_negate(&src0);
250 resolve_ud_negate(&src1);
251
252 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
253 inst->conditional_mod = condition;
254
255 return inst;
256 }
257
258 exec_list
259 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
260 const fs_reg &surf_index,
261 const fs_reg &varying_offset,
262 uint32_t const_offset)
263 {
264 exec_list instructions;
265 fs_inst *inst;
266
267 /* We have our constant surface use a pitch of 4 bytes, so our index can
268 * be any component of a vector, and then we load 4 contiguous
269 * components starting from that.
270 *
271 * We break down the const_offset to a portion added to the variable
272 * offset and a portion done using reg_offset, which means that if you
273 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
274 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
275 * CSE can later notice that those loads are all the same and eliminate
276 * the redundant ones.
277 */
278 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
279 instructions.push_tail(ADD(vec4_offset,
280 varying_offset, const_offset & ~3));
281
282 int scale = 1;
283 if (brw->gen == 4 && dispatch_width == 8) {
284 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
285 * u, v, r) as parameters, or we can just use the SIMD16 message
286 * consisting of (header, u). We choose the second, at the cost of a
287 * longer return length.
288 */
289 scale = 2;
290 }
291
292 enum opcode op;
293 if (brw->gen >= 7)
294 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
295 else
296 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
297 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
298 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
299 inst->regs_written = 4 * scale;
300 instructions.push_tail(inst);
301
302 if (brw->gen < 7) {
303 inst->base_mrf = 13;
304 inst->header_present = true;
305 if (brw->gen == 4)
306 inst->mlen = 3;
307 else
308 inst->mlen = 1 + dispatch_width / 8;
309 }
310
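/* Pick the requested component out of the vec4 we just loaded by bumping
 * reg_offset before the final MOV into dst.
 */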
311 vec4_result.reg_offset += (const_offset & 3) * scale;
312 instructions.push_tail(MOV(dst, vec4_result));
313
314 return instructions;
315 }
316
317 /**
318 * A helper for MOV generation for fixing up broken hardware SEND dependency
319 * handling.
320 */
321 fs_inst *
322 fs_visitor::DEP_RESOLVE_MOV(int grf)
323 {
324 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
325
326 inst->ir = NULL;
327 inst->annotation = "send dependency resolve";
328
329 /* The caller always wants this MOV uncompressed, to emit the minimal extra
330 * dependencies and to avoid having to align its registers to 2.
331 */
332 inst->force_uncompressed = true;
333
334 return inst;
335 }
336
337 bool
338 fs_inst::equals(fs_inst *inst) const
339 {
340 return (opcode == inst->opcode &&
341 dst.equals(inst->dst) &&
342 src[0].equals(inst->src[0]) &&
343 src[1].equals(inst->src[1]) &&
344 src[2].equals(inst->src[2]) &&
345 saturate == inst->saturate &&
346 predicate == inst->predicate &&
347 conditional_mod == inst->conditional_mod &&
348 mlen == inst->mlen &&
349 base_mrf == inst->base_mrf &&
350 sampler == inst->sampler &&
351 target == inst->target &&
352 eot == inst->eot &&
353 header_present == inst->header_present &&
354 shadow_compare == inst->shadow_compare &&
355 offset == inst->offset);
356 }
357
358 bool
359 fs_inst::overwrites_reg(const fs_reg &reg) const
360 {
361 return (reg.file == dst.file &&
362 reg.reg == dst.reg &&
363 reg.reg_offset >= dst.reg_offset &&
364 reg.reg_offset < dst.reg_offset + regs_written);
365 }
366
367 bool
368 fs_inst::is_send_from_grf() const
369 {
370 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
371 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
372 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
373 src[1].file == GRF) ||
374 (is_tex() && src[0].file == GRF));
375 }
376
377 bool
378 fs_visitor::can_do_source_mods(fs_inst *inst)
379 {
380 if (brw->gen == 6 && inst->is_math())
381 return false;
382
383 if (inst->is_send_from_grf())
384 return false;
385
386 if (!inst->can_do_source_mods())
387 return false;
388
389 return true;
390 }
391
392 void
393 fs_reg::init()
394 {
395 memset(this, 0, sizeof(*this));
396 stride = 1;
397 }
398
399 /** Generic unset register constructor. */
400 fs_reg::fs_reg()
401 {
402 init();
403 this->file = BAD_FILE;
404 }
405
406 /** Immediate value constructor. */
407 fs_reg::fs_reg(float f)
408 {
409 init();
410 this->file = IMM;
411 this->type = BRW_REGISTER_TYPE_F;
412 this->imm.f = f;
413 }
414
415 /** Immediate value constructor. */
416 fs_reg::fs_reg(int32_t i)
417 {
418 init();
419 this->file = IMM;
420 this->type = BRW_REGISTER_TYPE_D;
421 this->imm.i = i;
422 }
423
424 /** Immediate value constructor. */
425 fs_reg::fs_reg(uint32_t u)
426 {
427 init();
428 this->file = IMM;
429 this->type = BRW_REGISTER_TYPE_UD;
430 this->imm.u = u;
431 }
432
433 /** Fixed brw_reg. */
434 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
435 {
436 init();
437 this->file = HW_REG;
438 this->fixed_hw_reg = fixed_hw_reg;
439 this->type = fixed_hw_reg.type;
440 }
441
442 bool
443 fs_reg::equals(const fs_reg &r) const
444 {
445 return (file == r.file &&
446 reg == r.reg &&
447 reg_offset == r.reg_offset &&
448 subreg_offset == r.subreg_offset &&
449 type == r.type &&
450 negate == r.negate &&
451 abs == r.abs &&
452 !reladdr && !r.reladdr &&
453 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
454 sizeof(fixed_hw_reg)) == 0 &&
455 stride == r.stride &&
456 imm.u == r.imm.u);
457 }
458
459 fs_reg &
460 fs_reg::apply_stride(unsigned stride)
461 {
462 assert((this->stride * stride) <= 4 &&
463 (is_power_of_two(stride) || stride == 0) &&
464 file != HW_REG && file != IMM);
465 this->stride *= stride;
466 return *this;
467 }
468
469 fs_reg &
470 fs_reg::set_smear(unsigned subreg)
471 {
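/* A smeared register reads the same sub-register in every channel: point
 * subreg_offset at the requested component and use stride 0 so that value
 * is broadcast across the execution size.
 */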
472 assert(file != HW_REG && file != IMM);
473 subreg_offset = subreg * type_sz(type);
474 stride = 0;
475 return *this;
476 }
477
478 bool
479 fs_reg::is_contiguous() const
480 {
481 return stride == 1;
482 }
483
484 bool
485 fs_reg::is_zero() const
486 {
487 if (file != IMM)
488 return false;
489
490 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
491 }
492
493 bool
494 fs_reg::is_one() const
495 {
496 if (file != IMM)
497 return false;
498
499 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
500 }
501
502 bool
503 fs_reg::is_null() const
504 {
505 return file == HW_REG &&
506 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
507 fixed_hw_reg.nr == BRW_ARF_NULL;
508 }
509
510 bool
511 fs_reg::is_valid_3src() const
512 {
513 return file == GRF || file == UNIFORM;
514 }
515
516 bool
517 fs_reg::is_accumulator() const
518 {
519 return file == HW_REG &&
520 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
521 fixed_hw_reg.nr == BRW_ARF_ACCUMULATOR;
522 }
523
524 int
525 fs_visitor::type_size(const struct glsl_type *type)
526 {
527 unsigned int size, i;
528
529 switch (type->base_type) {
530 case GLSL_TYPE_UINT:
531 case GLSL_TYPE_INT:
532 case GLSL_TYPE_FLOAT:
533 case GLSL_TYPE_BOOL:
534 return type->components();
535 case GLSL_TYPE_ARRAY:
536 return type_size(type->fields.array) * type->length;
537 case GLSL_TYPE_STRUCT:
538 size = 0;
539 for (i = 0; i < type->length; i++) {
540 size += type_size(type->fields.structure[i].type);
541 }
542 return size;
543 case GLSL_TYPE_SAMPLER:
544 /* Samplers take up no register space, since they're baked in at
545 * link time.
546 */
547 return 0;
548 case GLSL_TYPE_ATOMIC_UINT:
549 return 0;
550 case GLSL_TYPE_IMAGE:
551 case GLSL_TYPE_VOID:
552 case GLSL_TYPE_ERROR:
553 case GLSL_TYPE_INTERFACE:
554 assert(!"not reached");
555 break;
556 }
557
558 return 0;
559 }
560
561 fs_reg
562 fs_visitor::get_timestamp()
563 {
564 assert(brw->gen >= 7);
565
566 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
567 BRW_ARF_TIMESTAMP,
568 0),
569 BRW_REGISTER_TYPE_UD));
570
571 fs_reg dst = fs_reg(this, glsl_type::uint_type);
572
573 fs_inst *mov = emit(MOV(dst, ts));
574 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
575 * even if it's not enabled in the dispatch.
576 */
577 mov->force_writemask_all = true;
578 mov->force_uncompressed = true;
579
580 /* The caller wants the low 32 bits of the timestamp. Since it's running
581 * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
582 * which is plenty of time for our purposes. It is identical across the
583 * EUs, but since it's tracking GPU core speed it will increment at a
584 * varying rate as render P-states change.
585 *
586 * The caller could also check if render P-states have changed (or anything
587 * else that might disrupt timing) by setting smear to 2 and checking if
588 * that field is != 0.
589 */
590 dst.set_smear(0);
591
592 return dst;
593 }
594
595 void
596 fs_visitor::emit_shader_time_begin()
597 {
598 current_annotation = "shader time start";
599 shader_start_time = get_timestamp();
600 }
601
602 void
603 fs_visitor::emit_shader_time_end()
604 {
605 current_annotation = "shader time end";
606
607 enum shader_time_shader_type type, written_type, reset_type;
608 if (dispatch_width == 8) {
609 type = ST_FS8;
610 written_type = ST_FS8_WRITTEN;
611 reset_type = ST_FS8_RESET;
612 } else {
613 assert(dispatch_width == 16);
614 type = ST_FS16;
615 written_type = ST_FS16_WRITTEN;
616 reset_type = ST_FS16_RESET;
617 }
618
619 fs_reg shader_end_time = get_timestamp();
620
621 /* Check that there weren't any timestamp reset events (assuming these
622 * were the only two timestamp reads that happened).
623 */
624 fs_reg reset = shader_end_time;
625 reset.set_smear(2);
626 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
627 test->conditional_mod = BRW_CONDITIONAL_Z;
628 emit(IF(BRW_PREDICATE_NORMAL));
629
630 push_force_uncompressed();
631 fs_reg start = shader_start_time;
632 start.negate = true;
633 fs_reg diff = fs_reg(this, glsl_type::uint_type);
634 emit(ADD(diff, start, shader_end_time));
635
636 /* If there were no instructions between the two timestamp gets, the diff
637 * is 2 cycles. Remove that overhead, so I can forget about that when
638 * trying to determine the time taken for single instructions.
639 */
640 emit(ADD(diff, diff, fs_reg(-2u)));
641
642 emit_shader_time_write(type, diff);
643 emit_shader_time_write(written_type, fs_reg(1u));
644 emit(BRW_OPCODE_ELSE);
645 emit_shader_time_write(reset_type, fs_reg(1u));
646 emit(BRW_OPCODE_ENDIF);
647
648 pop_force_uncompressed();
649 }
650
651 void
652 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
653 fs_reg value)
654 {
655 int shader_time_index =
656 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
657 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
658
659 fs_reg payload;
660 if (dispatch_width == 8)
661 payload = fs_reg(this, glsl_type::uvec2_type);
662 else
663 payload = fs_reg(this, glsl_type::uint_type);
664
665 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
666 fs_reg(), payload, offset, value));
667 }
668
669 void
670 fs_visitor::vfail(const char *format, va_list va)
671 {
672 char *msg;
673
674 if (failed)
675 return;
676
677 failed = true;
678
679 msg = ralloc_vasprintf(mem_ctx, format, va);
680 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
681
682 this->fail_msg = msg;
683
684 if (INTEL_DEBUG & DEBUG_WM) {
685 fprintf(stderr, "%s", msg);
686 }
687 }
688
689 void
690 fs_visitor::fail(const char *format, ...)
691 {
692 va_list va;
693
694 va_start(va, format);
695 vfail(format, va);
696 va_end(va);
697 }
698
699 /**
700 * Mark this program as impossible to compile in SIMD16 mode.
701 *
702 * During the SIMD8 compile (which happens first), we can detect and flag
703 * things that are unsupported in SIMD16 mode, so the compiler can skip
704 * the SIMD16 compile altogether.
705 *
706 * During a SIMD16 compile (if one happens anyway), this just calls fail().
707 */
708 void
709 fs_visitor::no16(const char *format, ...)
710 {
711 va_list va;
712
713 va_start(va, format);
714
715 if (dispatch_width == 16) {
716 vfail(format, va);
717 } else {
718 simd16_unsupported = true;
719
720 if (brw->perf_debug) {
721 if (no16_msg)
722 ralloc_vasprintf_append(&no16_msg, format, va);
723 else
724 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
725 }
726 }
727
728 va_end(va);
729 }
730
731 fs_inst *
732 fs_visitor::emit(enum opcode opcode)
733 {
734 return emit(new(mem_ctx) fs_inst(opcode));
735 }
736
737 fs_inst *
738 fs_visitor::emit(enum opcode opcode, fs_reg dst)
739 {
740 return emit(new(mem_ctx) fs_inst(opcode, dst));
741 }
742
743 fs_inst *
744 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
745 {
746 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
747 }
748
749 fs_inst *
750 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
751 {
752 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
753 }
754
755 fs_inst *
756 fs_visitor::emit(enum opcode opcode, fs_reg dst,
757 fs_reg src0, fs_reg src1, fs_reg src2)
758 {
759 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
760 }
761
762 void
763 fs_visitor::push_force_uncompressed()
764 {
765 force_uncompressed_stack++;
766 }
767
768 void
769 fs_visitor::pop_force_uncompressed()
770 {
771 force_uncompressed_stack--;
772 assert(force_uncompressed_stack >= 0);
773 }
774
775 /**
776 * Returns true if the instruction has a flag that means it won't
777 * update an entire destination register.
778 *
779 * For example, dead code elimination and live variable analysis want to know
780 * when a write to a variable screens off any preceding values that were in
781 * it.
782 */
783 bool
784 fs_inst::is_partial_write() const
785 {
786 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
787 this->force_uncompressed ||
788 this->force_sechalf || !this->dst.is_contiguous());
789 }
790
791 int
792 fs_inst::regs_read(fs_visitor *v, int arg) const
793 {
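/* Texture messages sent from a GRF read their whole payload.  mlen is in
 * physical registers; in SIMD16 each virtual GRF spans two of those, so
 * convert the count, rounding up.
 */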
794 if (is_tex() && arg == 0 && src[0].file == GRF) {
795 if (v->dispatch_width == 16)
796 return (mlen + 1) / 2;
797 else
798 return mlen;
799 }
800 return 1;
801 }
802
803 bool
804 fs_inst::reads_flag() const
805 {
806 return predicate;
807 }
808
809 bool
810 fs_inst::writes_flag() const
811 {
812 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
813 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
814 }
815
816 /**
817 * Returns how many MRFs an FS opcode will write over.
818 *
819 * Note that this is not the 0 or 1 implied writes in an actual gen
820 * instruction -- the FS opcodes often generate MOVs in addition.
821 */
822 int
823 fs_visitor::implied_mrf_writes(fs_inst *inst)
824 {
825 if (inst->mlen == 0)
826 return 0;
827
828 if (inst->base_mrf == -1)
829 return 0;
830
831 switch (inst->opcode) {
832 case SHADER_OPCODE_RCP:
833 case SHADER_OPCODE_RSQ:
834 case SHADER_OPCODE_SQRT:
835 case SHADER_OPCODE_EXP2:
836 case SHADER_OPCODE_LOG2:
837 case SHADER_OPCODE_SIN:
838 case SHADER_OPCODE_COS:
839 return 1 * dispatch_width / 8;
840 case SHADER_OPCODE_POW:
841 case SHADER_OPCODE_INT_QUOTIENT:
842 case SHADER_OPCODE_INT_REMAINDER:
843 return 2 * dispatch_width / 8;
844 case SHADER_OPCODE_TEX:
845 case FS_OPCODE_TXB:
846 case SHADER_OPCODE_TXD:
847 case SHADER_OPCODE_TXF:
848 case SHADER_OPCODE_TXF_CMS:
849 case SHADER_OPCODE_TXF_MCS:
850 case SHADER_OPCODE_TG4:
851 case SHADER_OPCODE_TG4_OFFSET:
852 case SHADER_OPCODE_TXL:
853 case SHADER_OPCODE_TXS:
854 case SHADER_OPCODE_LOD:
855 return 1;
856 case FS_OPCODE_FB_WRITE:
857 return 2;
858 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
859 case SHADER_OPCODE_GEN4_SCRATCH_READ:
860 return 1;
861 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
862 return inst->mlen;
863 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
864 return 2;
865 case SHADER_OPCODE_UNTYPED_ATOMIC:
866 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
867 return 0;
868 default:
869 assert(!"not reached");
870 return inst->mlen;
871 }
872 }
873
874 int
875 fs_visitor::virtual_grf_alloc(int size)
876 {
877 if (virtual_grf_array_size <= virtual_grf_count) {
878 if (virtual_grf_array_size == 0)
879 virtual_grf_array_size = 16;
880 else
881 virtual_grf_array_size *= 2;
882 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
883 virtual_grf_array_size);
884 }
885 virtual_grf_sizes[virtual_grf_count] = size;
886 return virtual_grf_count++;
887 }
888
889 /** Fixed HW reg constructor. */
890 fs_reg::fs_reg(enum register_file file, int reg)
891 {
892 init();
893 this->file = file;
894 this->reg = reg;
895 this->type = BRW_REGISTER_TYPE_F;
896 }
897
898 /** Fixed HW reg constructor. */
899 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
900 {
901 init();
902 this->file = file;
903 this->reg = reg;
904 this->type = type;
905 }
906
907 /** Automatic reg constructor. */
908 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
909 {
910 init();
911
912 this->file = GRF;
913 this->reg = v->virtual_grf_alloc(v->type_size(type));
914 this->reg_offset = 0;
915 this->type = brw_type_for_base_type(type);
916 }
917
918 fs_reg *
919 fs_visitor::variable_storage(ir_variable *var)
920 {
921 return (fs_reg *)hash_table_find(this->variable_ht, var);
922 }
923
924 void
925 import_uniforms_callback(const void *key,
926 void *data,
927 void *closure)
928 {
929 struct hash_table *dst_ht = (struct hash_table *)closure;
930 const fs_reg *reg = (const fs_reg *)data;
931
932 if (reg->file != UNIFORM)
933 return;
934
935 hash_table_insert(dst_ht, data, key);
936 }
937
938 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
939 * This brings in those uniform definitions.
940 */
941 void
942 fs_visitor::import_uniforms(fs_visitor *v)
943 {
944 hash_table_call_foreach(v->variable_ht,
945 import_uniforms_callback,
946 variable_ht);
947 this->push_constant_loc = v->push_constant_loc;
948 this->pull_constant_loc = v->pull_constant_loc;
949 this->uniforms = v->uniforms;
950 this->param_size = v->param_size;
951 }
952
953 /* Our support for uniforms is piggy-backed on the struct
954 * gl_fragment_program, because that's where the values actually
955 * get stored, rather than in some global gl_shader_program uniform
956 * store.
957 */
958 void
959 fs_visitor::setup_uniform_values(ir_variable *ir)
960 {
961 int namelen = strlen(ir->name);
962
963 /* The data for our (non-builtin) uniforms is stored in a series of
964 * gl_uniform_driver_storage structs for each subcomponent that
965 * glGetUniformLocation() could name. We know it's been set up in the same
966 * order we'd walk the type, so walk the list of storage and find anything
967 * with our name, or the prefix of a component that starts with our name.
968 */
969 unsigned params_before = uniforms;
970 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
971 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
972
973 if (strncmp(ir->name, storage->name, namelen) != 0 ||
974 (storage->name[namelen] != 0 &&
975 storage->name[namelen] != '.' &&
976 storage->name[namelen] != '[')) {
977 continue;
978 }
979
980 unsigned slots = storage->type->component_slots();
981 if (storage->array_elements)
982 slots *= storage->array_elements;
983
984 for (unsigned i = 0; i < slots; i++) {
985 stage_prog_data->param[uniforms++] = &storage->storage[i].f;
986 }
987 }
988
989 /* Make sure we actually initialized the right amount of stuff here. */
990 assert(params_before + ir->type->component_slots() == uniforms);
991 (void)params_before;
992 }
993
994
995 /* Our support for builtin uniforms is even scarier than non-builtin.
996 * It sits on top of the PROG_STATE_VAR parameters that are
997 * automatically updated from GL context state.
998 */
999 void
1000 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1001 {
1002 const ir_state_slot *const slots = ir->state_slots;
1003 assert(ir->state_slots != NULL);
1004
1005 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
1006 /* This state reference has already been setup by ir_to_mesa, but we'll
1007 * get the same index back here.
1008 */
1009 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
1010 (gl_state_index *)slots[i].tokens);
1011
1012 /* Add each of the unique swizzles of the element as a parameter.
1013 * This'll end up matching the expected layout of the
1014 * array/matrix/structure we're trying to fill in.
1015 */
1016 int last_swiz = -1;
1017 for (unsigned int j = 0; j < 4; j++) {
1018 int swiz = GET_SWZ(slots[i].swizzle, j);
1019 if (swiz == last_swiz)
1020 break;
1021 last_swiz = swiz;
1022
1023 stage_prog_data->param[uniforms++] =
1024 &fp->Base.Parameters->ParameterValues[index][swiz].f;
1025 }
1026 }
1027 }
1028
1029 fs_reg *
1030 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1031 {
1032 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1033 fs_reg wpos = *reg;
1034 bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
1035
1036 /* gl_FragCoord.x */
1037 if (ir->data.pixel_center_integer) {
1038 emit(MOV(wpos, this->pixel_x));
1039 } else {
1040 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1041 }
1042 wpos.reg_offset++;
1043
1044 /* gl_FragCoord.y */
1045 if (!flip && ir->data.pixel_center_integer) {
1046 emit(MOV(wpos, this->pixel_y));
1047 } else {
1048 fs_reg pixel_y = this->pixel_y;
1049 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1050
1051 if (flip) {
1052 pixel_y.negate = true;
1053 offset += c->key.drawable_height - 1.0;
1054 }
1055
1056 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1057 }
1058 wpos.reg_offset++;
1059
1060 /* gl_FragCoord.z */
1061 if (brw->gen >= 6) {
1062 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
1063 } else {
1064 emit(FS_OPCODE_LINTERP, wpos,
1065 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1066 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1067 interp_reg(VARYING_SLOT_POS, 2));
1068 }
1069 wpos.reg_offset++;
1070
1071 /* gl_FragCoord.w: Already set up in emit_interpolation */
1072 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1073
1074 return reg;
1075 }
1076
1077 fs_inst *
1078 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1079 glsl_interp_qualifier interpolation_mode,
1080 bool is_centroid, bool is_sample)
1081 {
1082 brw_wm_barycentric_interp_mode barycoord_mode;
1083 if (brw->gen >= 6) {
1084 if (is_centroid) {
1085 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1086 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1087 else
1088 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1089 } else if (is_sample) {
1090 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1091 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1092 else
1093 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1094 } else {
1095 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1096 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1097 else
1098 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1099 }
1100 } else {
1101 /* On Ironlake and below, there is only one interpolation mode.
1102 * Centroid interpolation doesn't mean anything on this hardware --
1103 * there is no multisampling.
1104 */
1105 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1106 }
1107 return emit(FS_OPCODE_LINTERP, attr,
1108 this->delta_x[barycoord_mode],
1109 this->delta_y[barycoord_mode], interp);
1110 }
1111
1112 fs_reg *
1113 fs_visitor::emit_general_interpolation(ir_variable *ir)
1114 {
1115 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1116 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1117 fs_reg attr = *reg;
1118
1119 unsigned int array_elements;
1120 const glsl_type *type;
1121
1122 if (ir->type->is_array()) {
1123 array_elements = ir->type->length;
1124 if (array_elements == 0) {
1125 fail("dereferenced array '%s' has length 0\n", ir->name);
1126 }
1127 type = ir->type->fields.array;
1128 } else {
1129 array_elements = 1;
1130 type = ir->type;
1131 }
1132
1133 glsl_interp_qualifier interpolation_mode =
1134 ir->determine_interpolation_mode(c->key.flat_shade);
1135
1136 int location = ir->data.location;
1137 for (unsigned int i = 0; i < array_elements; i++) {
1138 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1139 if (c->prog_data.urb_setup[location] == -1) {
1140 /* If there's no incoming setup data for this slot, don't
1141 * emit interpolation for it.
1142 */
1143 attr.reg_offset += type->vector_elements;
1144 location++;
1145 continue;
1146 }
1147
1148 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1149 /* Constant interpolation (flat shading) case. The SF has
1150 * handed us defined values in only the constant offset
1151 * field of the setup reg.
1152 */
1153 for (unsigned int k = 0; k < type->vector_elements; k++) {
1154 struct brw_reg interp = interp_reg(location, k);
1155 interp = suboffset(interp, 3);
1156 interp.type = reg->type;
1157 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1158 attr.reg_offset++;
1159 }
1160 } else {
1161 /* Smooth/noperspective interpolation case. */
1162 for (unsigned int k = 0; k < type->vector_elements; k++) {
1163 struct brw_reg interp = interp_reg(location, k);
1164 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1165 ir->data.centroid && !c->key.persample_shading,
1166 ir->data.sample || c->key.persample_shading);
1167 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1168 /* Get the pixel/sample mask into f0 so that we know
1169 * which pixels are lit. Then, for each channel that is
1170 * unlit, replace the centroid data with non-centroid
1171 * data.
1172 */
1173 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1174 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1175 interpolation_mode,
1176 false, false);
1177 inst->predicate = BRW_PREDICATE_NORMAL;
1178 inst->predicate_inverse = true;
1179 }
1180 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1181 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1182 }
1183 attr.reg_offset++;
1184 }
1185
1186 }
1187 location++;
1188 }
1189 }
1190
1191 return reg;
1192 }
1193
1194 fs_reg *
1195 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1196 {
1197 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1198
1199 /* The frontfacing comes in as a bit in the thread payload. */
1200 if (brw->gen >= 6) {
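/* On Gen6+, bit 15 of R0.0 is set for back-facing polygons.  Shift it
 * down to bit 0, invert it, and mask with 1 so *reg holds 1 for front
 * faces and 0 for back faces.
 */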
1201 emit(BRW_OPCODE_ASR, *reg,
1202 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1203 fs_reg(15));
1204 emit(BRW_OPCODE_NOT, *reg, *reg);
1205 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1206 } else {
1207 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1208 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1209 * us front face
1210 */
1211 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1212 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1213 }
1214
1215 return reg;
1216 }
1217
1218 void
1219 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1220 {
1221 assert(dst.type == BRW_REGISTER_TYPE_F);
1222
1223 if (c->key.compute_pos_offset) {
1224 /* Convert int_sample_pos to floating point */
1225 emit(MOV(dst, int_sample_pos));
1226 /* Scale to the range [0, 1] */
1227 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1228 }
1229 else {
1230 /* From ARB_sample_shading specification:
1231 * "When rendering to a non-multisample buffer, or if multisample
1232 * rasterization is disabled, gl_SamplePosition will always be
1233 * (0.5, 0.5)."
1234 */
1235 emit(MOV(dst, fs_reg(0.5f)));
1236 }
1237 }
1238
1239 fs_reg *
1240 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1241 {
1242 assert(brw->gen >= 6);
1243 assert(ir->type == glsl_type::vec2_type);
1244
1245 this->current_annotation = "compute sample position";
1246 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1247 fs_reg pos = *reg;
1248 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1249 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1250
1251 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1252 * mode will be enabled.
1253 *
1254 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1255 * R31.1:0 Position Offset X/Y for Slot[3:0]
1256 * R31.3:2 Position Offset X/Y for Slot[7:4]
1257 * .....
1258 *
1259 * The X, Y sample positions come in as bytes in thread payload. So, read
1260 * the positions using vstride=16, width=8, hstride=2.
1261 */
1262 struct brw_reg sample_pos_reg =
1263 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1264 BRW_REGISTER_TYPE_B), 16, 8, 2);
1265
1266 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
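/* In SIMD16 the MOV above only covers channels 0-7; emit a second-half MOV
 * (force_sechalf) that reads the X positions for channels 8-15.
 */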
1267 if (dispatch_width == 16) {
1268 fs_inst *inst = emit(MOV(half(int_sample_x, 1),
1269 fs_reg(suboffset(sample_pos_reg, 16))));
1270 inst->force_sechalf = true;
1271 }
1272 /* Compute gl_SamplePosition.x */
1273 compute_sample_position(pos, int_sample_x);
1274 pos.reg_offset++;
1275 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1276 if (dispatch_width == 16) {
1277 fs_inst *inst = emit(MOV(half(int_sample_y, 1),
1278 fs_reg(suboffset(sample_pos_reg, 17))));
1279 inst->force_sechalf = true;
1280 }
1281 /* Compute gl_SamplePosition.y */
1282 compute_sample_position(pos, int_sample_y);
1283 return reg;
1284 }
1285
1286 fs_reg *
1287 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1288 {
1289 assert(brw->gen >= 6);
1290
1291 this->current_annotation = "compute sample id";
1292 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1293
1294 if (c->key.compute_sample_id) {
1295 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1296 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1297 t2.type = BRW_REGISTER_TYPE_UW;
1298
1299 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1300 * 8x multisampling, subspan 0 will represent sample N (where N
1301 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1302 * 7. We can find the value of N by looking at R0.0 bits 7:6
1303 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1304 * (since samples are always delivered in pairs). That is, we
1305 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1306 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1307 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1308 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1309 * populating a temporary variable with the sequence (0, 1, 2, 3),
1310 * and then reading from it using vstride=1, width=4, hstride=0.
1311 * These computations hold good for 4x multisampling as well.
1312 */
1313 emit(BRW_OPCODE_AND, t1,
1314 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1315 fs_reg(brw_imm_d(0xc0)));
1316 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1317 /* This works for both SIMD8 and SIMD16 */
1318 emit(MOV(t2, brw_imm_v(0x3210)));
1319 /* This special instruction takes care of setting vstride=1,
1320 * width=4, hstride=0 of t2 during an ADD instruction.
1321 */
1322 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1323 } else {
1324 /* As per GL_ARB_sample_shading specification:
1325 * "When rendering to a non-multisample buffer, or if multisample
1326 * rasterization is disabled, gl_SampleID will always be zero."
1327 */
1328 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1329 }
1330
1331 return reg;
1332 }
1333
1334 fs_reg *
1335 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1336 {
1337 assert(brw->gen >= 7);
1338 this->current_annotation = "compute gl_SampleMaskIn";
1339 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1340 emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1341 return reg;
1342 }
1343
1344 fs_reg
1345 fs_visitor::fix_math_operand(fs_reg src)
1346 {
1347 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1348 * might be able to do better by doing execsize = 1 math and then
1349 * expanding that result out, but we would need to be careful with
1350 * masking.
1351 *
1352 * The hardware ignores source modifiers (negate and abs) on math
1353 * instructions, so we also move to a temp to set those up.
1354 */
1355 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1356 !src.abs && !src.negate)
1357 return src;
1358
1359 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1360 * operands with math instructions.
1361 */
1362 if (brw->gen >= 7 && src.file != IMM)
1363 return src;
1364
1365 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1366 expanded.type = src.type;
1367 emit(BRW_OPCODE_MOV, expanded, src);
1368 return expanded;
1369 }
1370
1371 fs_inst *
1372 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1373 {
1374 switch (opcode) {
1375 case SHADER_OPCODE_RCP:
1376 case SHADER_OPCODE_RSQ:
1377 case SHADER_OPCODE_SQRT:
1378 case SHADER_OPCODE_EXP2:
1379 case SHADER_OPCODE_LOG2:
1380 case SHADER_OPCODE_SIN:
1381 case SHADER_OPCODE_COS:
1382 break;
1383 default:
1384 assert(!"not reached: bad math opcode");
1385 return NULL;
1386 }
1387
1388 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1389 * might be able to do better by doing execsize = 1 math and then
1390 * expanding that result out, but we would need to be careful with
1391 * masking.
1392 *
1393 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1394 * instructions, so we also move to a temp to set those up.
1395 */
1396 if (brw->gen >= 6)
1397 src = fix_math_operand(src);
1398
1399 fs_inst *inst = emit(opcode, dst, src);
1400
1401 if (brw->gen < 6) {
1402 inst->base_mrf = 2;
1403 inst->mlen = dispatch_width / 8;
1404 }
1405
1406 return inst;
1407 }
1408
1409 fs_inst *
1410 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1411 {
1412 int base_mrf = 2;
1413 fs_inst *inst;
1414
1415 switch (opcode) {
1416 case SHADER_OPCODE_INT_QUOTIENT:
1417 case SHADER_OPCODE_INT_REMAINDER:
1418 if (brw->gen >= 7)
1419 no16("SIMD16 INTDIV unsupported\n");
1420 break;
1421 case SHADER_OPCODE_POW:
1422 break;
1423 default:
1424 assert(!"not reached: unsupported binary math opcode.");
1425 return NULL;
1426 }
1427
1428 if (brw->gen >= 6) {
1429 src0 = fix_math_operand(src0);
1430 src1 = fix_math_operand(src1);
1431
1432 inst = emit(opcode, dst, src0, src1);
1433 } else {
1434 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1435 * "Message Payload":
1436 *
1437 * "Operand0[7]. For the INT DIV functions, this operand is the
1438 * denominator."
1439 * ...
1440 * "Operand1[7]. For the INT DIV functions, this operand is the
1441 * numerator."
1442 */
1443 bool is_int_div = opcode != SHADER_OPCODE_POW;
1444 fs_reg &op0 = is_int_div ? src1 : src0;
1445 fs_reg &op1 = is_int_div ? src0 : src1;
1446
1447 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1448 inst = emit(opcode, dst, op0, reg_null_f);
1449
1450 inst->base_mrf = base_mrf;
1451 inst->mlen = 2 * dispatch_width / 8;
1452 }
1453 return inst;
1454 }
1455
1456 void
1457 fs_visitor::assign_curb_setup()
1458 {
1459 if (dispatch_width == 8) {
1460 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1461 } else {
1462 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1463 }
1464
1465 c->prog_data.curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1466
1467 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1468 foreach_list(node, &this->instructions) {
1469 fs_inst *inst = (fs_inst *)node;
1470
1471 for (unsigned int i = 0; i < 3; i++) {
1472 if (inst->src[i].file == UNIFORM) {
1473 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1474 int constant_nr;
1475 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1476 constant_nr = push_constant_loc[uniform_nr];
1477 } else {
1478 /* Section 5.11 of the OpenGL 4.1 spec says:
1479 * "Out-of-bounds reads return undefined values, which include
1480 * values from other variables of the active program or zero."
1481 * Just return the first push constant.
1482 */
1483 constant_nr = 0;
1484 }
1485
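/* Push constants are packed eight floats to a GRF, so constant_nr / 8
 * selects the register after the payload and constant_nr % 8 selects the
 * channel within it.
 */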
1486 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1487 constant_nr / 8,
1488 constant_nr % 8);
1489
1490 inst->src[i].file = HW_REG;
1491 inst->src[i].fixed_hw_reg = byte_offset(
1492 retype(brw_reg, inst->src[i].type),
1493 inst->src[i].subreg_offset);
1494 }
1495 }
1496 }
1497 }
1498
1499 void
1500 fs_visitor::calculate_urb_setup()
1501 {
1502 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1503 c->prog_data.urb_setup[i] = -1;
1504 }
1505
1506 int urb_next = 0;
1507 /* Figure out where each of the incoming setup attributes lands. */
1508 if (brw->gen >= 6) {
1509 if (_mesa_bitcount_64(fp->Base.InputsRead &
1510 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1511 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1512 * first 16 varying inputs, so we can put them wherever we want.
1513 * Just put them in order.
1514 *
1515 * This is useful because it means that (a) inputs not used by the
1516 * fragment shader won't take up valuable register space, and (b) we
1517 * won't have to recompile the fragment shader if it gets paired with
1518 * a different vertex (or geometry) shader.
1519 */
1520 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1521 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1522 BITFIELD64_BIT(i)) {
1523 c->prog_data.urb_setup[i] = urb_next++;
1524 }
1525 }
1526 } else {
1527 /* We have enough input varyings that the SF/SBE pipeline stage can't
1528 * arbitrarily rearrange them to suit our whim; we have to put them
1529 * in an order that matches the output of the previous pipeline stage
1530 * (geometry or vertex shader).
1531 */
1532 struct brw_vue_map prev_stage_vue_map;
1533 brw_compute_vue_map(brw, &prev_stage_vue_map,
1534 c->key.input_slots_valid);
1535 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1536 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1537 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1538 slot++) {
1539 int varying = prev_stage_vue_map.slot_to_varying[slot];
1540 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1541 * unused.
1542 */
1543 if (varying != BRW_VARYING_SLOT_COUNT &&
1544 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1545 BITFIELD64_BIT(varying))) {
1546 c->prog_data.urb_setup[varying] = slot - first_slot;
1547 }
1548 }
1549 urb_next = prev_stage_vue_map.num_slots - first_slot;
1550 }
1551 } else {
1552 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1553 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1554 /* Point size is packed into the header, not as a general attribute */
1555 if (i == VARYING_SLOT_PSIZ)
1556 continue;
1557
1558 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1559 /* The back color slot is skipped when the front color is
1560 * also written to. In addition, some slots can be
1561 * written in the vertex shader and not read in the
1562 * fragment shader. So the register number must always be
1563 * incremented, mapped or not.
1564 */
1565 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1566 c->prog_data.urb_setup[i] = urb_next;
1567 urb_next++;
1568 }
1569 }
1570
1571 /*
1572 * It's an FS-only attribute, and we did interpolation for this attribute
1573 * in the SF thread. So, count it here, too.
1574 *
1575 * See compile_sf_prog() for more info.
1576 */
1577 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1578 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1579 }
1580
1581 c->prog_data.num_varying_inputs = urb_next;
1582 }
1583
1584 void
1585 fs_visitor::assign_urb_setup()
1586 {
1587 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1588
1589 /* Offset all the urb_setup[] index by the actual position of the
1590 * setup regs, now that the location of the constants has been chosen.
1591 */
1592 foreach_list(node, &this->instructions) {
1593 fs_inst *inst = (fs_inst *)node;
1594
1595 if (inst->opcode == FS_OPCODE_LINTERP) {
1596 assert(inst->src[2].file == HW_REG);
1597 inst->src[2].fixed_hw_reg.nr += urb_start;
1598 }
1599
1600 if (inst->opcode == FS_OPCODE_CINTERP) {
1601 assert(inst->src[0].file == HW_REG);
1602 inst->src[0].fixed_hw_reg.nr += urb_start;
1603 }
1604 }
1605
1606 /* Each attribute is 4 setup channels, each of which is half a reg. */
1607 this->first_non_payload_grf =
1608 urb_start + c->prog_data.num_varying_inputs * 2;
1609 }
1610
1611 /**
1612 * Split large virtual GRFs into separate components if we can.
1613 *
1614 * This is mostly duplicated with what brw_fs_vector_splitting does,
1615 * but that's really conservative because it's afraid of doing
1616 * splitting that doesn't result in real progress after the rest of
1617 * the optimization phases, which would cause infinite looping in
1618 * optimization. We can do it once here, safely. This also has the
1619 * opportunity to split interpolated values, or maybe even uniforms,
1620 * which we don't have at the IR level.
1621 *
1622 * We want to split, because virtual GRFs are what we register
1623 * allocate and spill (due to contiguousness requirements for some
1624 * instructions), and they're what we naturally generate in the
1625 * codegen process, but most virtual GRFs don't actually need to be
1626 * contiguous sets of GRFs. If we split, we'll end up with reduced
1627 * live intervals and better dead code elimination and coalescing.
1628 */
1629 void
1630 fs_visitor::split_virtual_grfs()
1631 {
1632 int num_vars = this->virtual_grf_count;
1633 bool split_grf[num_vars];
1634 int new_virtual_grf[num_vars];
1635
1636 /* Try to split anything > 0 sized. */
1637 for (int i = 0; i < num_vars; i++) {
1638 if (this->virtual_grf_sizes[i] != 1)
1639 split_grf[i] = true;
1640 else
1641 split_grf[i] = false;
1642 }
1643
1644 if (brw->has_pln &&
1645 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1646 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1647 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1648 * Gen6, that was the only supported interpolation mode, and since Gen6,
1649 * delta_x and delta_y are in fixed hardware registers.
1650 */
1651 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1652 false;
1653 }
1654
1655 foreach_list(node, &this->instructions) {
1656 fs_inst *inst = (fs_inst *)node;
1657
1658 /* If there's a SEND message that requires contiguous destination
1659 * registers, no splitting is allowed.
1660 */
1661 if (inst->regs_written > 1) {
1662 split_grf[inst->dst.reg] = false;
1663 }
1664
1665 /* If we're sending from a GRF, don't split it, on the assumption that
1666 * the send is reading the whole thing.
1667 */
1668 if (inst->is_send_from_grf()) {
1669 for (int i = 0; i < 3; i++) {
1670 if (inst->src[i].file == GRF) {
1671 split_grf[inst->src[i].reg] = false;
1672 }
1673 }
1674 }
1675 }
1676
1677 /* Allocate new space for split regs. Note that the virtual
1678 * numbers will be contiguous.
1679 */
1680 for (int i = 0; i < num_vars; i++) {
1681 if (split_grf[i]) {
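/* Component 0 stays in register i (which is resized to 1 below); components
 * 1..size-1 get fresh, consecutively numbered registers starting at
 * new_virtual_grf[i].
 */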
1682 new_virtual_grf[i] = virtual_grf_alloc(1);
1683 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1684 int reg = virtual_grf_alloc(1);
1685 assert(reg == new_virtual_grf[i] + j - 1);
1686 (void) reg;
1687 }
1688 this->virtual_grf_sizes[i] = 1;
1689 }
1690 }
1691
1692 foreach_list(node, &this->instructions) {
1693 fs_inst *inst = (fs_inst *)node;
1694
1695 if (inst->dst.file == GRF &&
1696 split_grf[inst->dst.reg] &&
1697 inst->dst.reg_offset != 0) {
1698 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1699 inst->dst.reg_offset - 1);
1700 inst->dst.reg_offset = 0;
1701 }
1702 for (int i = 0; i < 3; i++) {
1703 if (inst->src[i].file == GRF &&
1704 split_grf[inst->src[i].reg] &&
1705 inst->src[i].reg_offset != 0) {
1706 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1707 inst->src[i].reg_offset - 1);
1708 inst->src[i].reg_offset = 0;
1709 }
1710 }
1711 }
1712 invalidate_live_intervals();
1713 }
1714
1715 /**
1716 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1717 *
1718 * During code generation, we create tons of temporary variables, many of
1719 * which get immediately killed and are never used again. Yet, in later
1720 * optimization and analysis passes, such as compute_live_intervals, we need
1721 * to loop over all the virtual GRFs. Compacting them can save a lot of
1722 * overhead.
1723 */
1724 void
1725 fs_visitor::compact_virtual_grfs()
1726 {
1727 /* Mark which virtual GRFs are used, and count how many. */
1728 int remap_table[this->virtual_grf_count];
1729 memset(remap_table, -1, sizeof(remap_table));
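/* remap_table[i] stays -1 for virtual GRFs that are never referenced; used
 * GRFs are first marked with 0 and then overwritten with their new index in
 * the compaction pass below.
 */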
1730
1731 foreach_list(node, &this->instructions) {
1732 const fs_inst *inst = (const fs_inst *) node;
1733
1734 if (inst->dst.file == GRF)
1735 remap_table[inst->dst.reg] = 0;
1736
1737 for (int i = 0; i < 3; i++) {
1738 if (inst->src[i].file == GRF)
1739 remap_table[inst->src[i].reg] = 0;
1740 }
1741 }
1742
1743 /* Compact the GRF arrays. */
1744 int new_index = 0;
1745 for (int i = 0; i < this->virtual_grf_count; i++) {
1746 if (remap_table[i] != -1) {
1747 remap_table[i] = new_index;
1748 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1749 invalidate_live_intervals();
1750 ++new_index;
1751 }
1752 }
1753
1754 this->virtual_grf_count = new_index;
1755
1756 /* Patch all the instructions to use the newly renumbered registers */
1757 foreach_list(node, &this->instructions) {
1758 fs_inst *inst = (fs_inst *) node;
1759
1760 if (inst->dst.file == GRF)
1761 inst->dst.reg = remap_table[inst->dst.reg];
1762
1763 for (int i = 0; i < 3; i++) {
1764 if (inst->src[i].file == GRF)
1765 inst->src[i].reg = remap_table[inst->src[i].reg];
1766 }
1767 }
1768 }
1769
1770 /*
1771 * Implements array access of uniforms by inserting a
1772 * PULL_CONSTANT_LOAD instruction.
1773 *
1774 * Unlike temporary GRF array access (where we don't support it due to
1775 * the difficulty of doing relative addressing on instruction
1776 * destinations), we could potentially do array access of uniforms
1777 * that were loaded in GRF space as push constants. In real-world
1778 * usage we've seen, though, the arrays being used are always larger
1779 * than we could load as push constants, so just always move all
1780 * uniform array access out to a pull constant buffer.
1781 */
1782 void
1783 fs_visitor::move_uniform_array_access_to_pull_constants()
1784 {
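/* Pull constant locations are decided during the SIMD8 compile and then
 * imported by the SIMD16 compile (see import_uniforms()), so only do this
 * work once.
 */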
1785 if (dispatch_width != 8)
1786 return;
1787
1788 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1789
1790 for (unsigned int i = 0; i < uniforms; i++) {
1791 pull_constant_loc[i] = -1;
1792 }
1793
1794 /* Walk through and find array access of uniforms. Put a copy of that
1795 * uniform in the pull constant buffer.
1796 *
1797 * Note that we don't move constant-indexed accesses to arrays. No
1798 * testing has been done of the performance impact of this choice.
1799 */
1800 foreach_list_safe(node, &this->instructions) {
1801 fs_inst *inst = (fs_inst *)node;
1802
1803 for (int i = 0 ; i < 3; i++) {
1804 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1805 continue;
1806
1807 int uniform = inst->src[i].reg;
1808
1809 /* If this array isn't already present in the pull constant buffer,
1810 * add it.
1811 */
1812 if (pull_constant_loc[uniform] == -1) {
1813 const float **values = &stage_prog_data->param[uniform];
1814
1815 assert(param_size[uniform]);
1816
1817 for (int j = 0; j < param_size[uniform]; j++) {
1818 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1819
1820 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1821 values[j];
1822 }
1823 }
1824 }
1825 }
1826 }
1827
1828 /**
1829 * Assign UNIFORM file registers to either push constants or pull constants.
1830 *
1831 * We allow a fragment shader to have more than the specified minimum
1832 * maximum number of fragment shader uniform components (64). If
1833 * there are too many of these, they'd fill up all of the register space.
1834 * So, this will push some of them out to the pull constant buffer and
1835 * update the program to load them.
1836 */
1837 void
1838 fs_visitor::assign_constant_locations()
1839 {
1840 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1841 if (dispatch_width != 8)
1842 return;
1843
1844 /* Find which UNIFORM registers are still in use. */
1845 bool is_live[uniforms];
1846 for (unsigned int i = 0; i < uniforms; i++) {
1847 is_live[i] = false;
1848 }
1849
1850 foreach_list(node, &this->instructions) {
1851 fs_inst *inst = (fs_inst *) node;
1852
1853 for (int i = 0; i < 3; i++) {
1854 if (inst->src[i].file != UNIFORM)
1855 continue;
1856
1857 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1858 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1859 is_live[constant_nr] = true;
1860 }
1861 }
1862
1863 /* Only allow 16 registers (128 uniform components) as push constants.
1864 *
1865 * Just demote the end of the list. We could probably do better
1866 * here, demoting things that are rarely used in the program first.
1867 */
1868 unsigned int max_push_components = 16 * 8;
1869 unsigned int num_push_constants = 0;
1870
1871 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1872
1873 for (unsigned int i = 0; i < uniforms; i++) {
1874 if (!is_live[i] || pull_constant_loc[i] != -1) {
1875 /* This UNIFORM register is either dead, or has already been demoted
1876 * to a pull const. Mark it as no longer living in the param[] array.
1877 */
1878 push_constant_loc[i] = -1;
1879 continue;
1880 }
1881
1882 if (num_push_constants < max_push_components) {
1883 /* Retain as a push constant. Record the location in the params[]
1884 * array.
1885 */
1886 push_constant_loc[i] = num_push_constants++;
1887 } else {
1888 /* Demote to a pull constant. */
1889 push_constant_loc[i] = -1;
1890
1891 int pull_index = stage_prog_data->nr_pull_params++;
1892 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1893 pull_constant_loc[i] = pull_index;
1894 }
1895 }
1896
1897 stage_prog_data->nr_params = num_push_constants;
1898
1899 /* Up until now, the param[] array has been indexed by reg + reg_offset
1900 * of UNIFORM registers. Condense it to only contain the uniforms we
1901 * chose to upload as push constants.
1902 */
1903 for (unsigned int i = 0; i < uniforms; i++) {
1904 int remapped = push_constant_loc[i];
1905
1906 if (remapped == -1)
1907 continue;
1908
1909 assert(remapped <= (int)i);
1910 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1911 }
1912 }
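/* A minimal standalone sketch of the push-vs-pull partitioning above: live
 * uniform slots receive push locations until the 128-component budget
 * (16 registers * 8 floats) is used up, and the rest are demoted to pull
 * constants.  The demo_* names are illustrative only, not driver API; the
 * real pass records its decisions in push_constant_loc[], pull_constant_loc[]
 * and stage_prog_data, and also skips slots already demoted by the reladdr
 * pass above.
 */
static void
demo_assign_constant_locations(const bool *is_live, unsigned uniforms,
                               int *push_loc, int *pull_loc)
{
   const unsigned max_push_components = 16 * 8;
   unsigned num_push = 0, num_pull = 0;

   for (unsigned i = 0; i < uniforms; i++) {
      push_loc[i] = -1;
      pull_loc[i] = -1;

      if (!is_live[i])
         continue;                     /* dead uniform: upload nothing */

      if (num_push < max_push_components)
         push_loc[i] = num_push++;     /* retain as a push constant */
      else
         pull_loc[i] = num_pull++;     /* demote to a pull constant */
   }
}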
1913
1914 /**
1915 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1916 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1917 */
1918 void
1919 fs_visitor::demote_pull_constants()
1920 {
1921 foreach_list(node, &this->instructions) {
1922 fs_inst *inst = (fs_inst *)node;
1923
1924 for (int i = 0; i < 3; i++) {
1925 if (inst->src[i].file != UNIFORM)
1926 continue;
1927
1928 int pull_index = pull_constant_loc[inst->src[i].reg +
1929 inst->src[i].reg_offset];
1930 if (pull_index == -1)
1931 continue;
1932
1933 /* Set up the annotation tracking for new generated instructions. */
1934 base_ir = inst->ir;
1935 current_annotation = inst->annotation;
1936
1937 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1938 fs_reg dst = fs_reg(this, glsl_type::float_type);
1939
1940 /* Generate a pull load into dst. */
1941 if (inst->src[i].reladdr) {
1942 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1943 surf_index,
1944 *inst->src[i].reladdr,
1945 pull_index);
1946 inst->insert_before(&list);
1947 inst->src[i].reladdr = NULL;
1948 } else {
1949 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1950 fs_inst *pull =
1951 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1952 dst, surf_index, offset);
1953 inst->insert_before(pull);
1954 inst->src[i].set_smear(pull_index & 3);
1955 }
1956
1957 /* Rewrite the instruction to use the temporary VGRF. */
1958 inst->src[i].file = GRF;
1959 inst->src[i].reg = dst.reg;
1960 inst->src[i].reg_offset = 0;
1961 }
1962 }
1963 invalidate_live_intervals();
1964 }
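/* A small standalone illustration of the offset arithmetic used above for
 * the non-indirect case: each FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD fetches a
 * vec4-aligned 16-byte block, so the byte offset is the pull index rounded
 * down to a multiple of four floats, and the wanted component is then picked
 * out with set_smear(pull_index & 3).  demo_pull_addr() is illustrative, not
 * driver API.
 */
struct demo_pull_address {
   unsigned byte_offset;   /* offset handed to the pull-constant load */
   unsigned channel;       /* component selected via set_smear() */
};

static struct demo_pull_address
demo_pull_addr(unsigned pull_index)
{
   struct demo_pull_address a;
   a.byte_offset = (pull_index * 4) & ~15u;   /* vec4-aligned block start */
   a.channel = pull_index & 3;                /* float within that vec4 */
   return a;
}

/* For example, pull_index 6 gives byte_offset 16 and channel 2, while
 * pull_index 3 gives byte_offset 0 and channel 3.
 */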
1965
1966 bool
1967 fs_visitor::opt_algebraic()
1968 {
1969 bool progress = false;
1970
1971 foreach_list(node, &this->instructions) {
1972 fs_inst *inst = (fs_inst *)node;
1973
1974 switch (inst->opcode) {
1975 case BRW_OPCODE_MUL:
1976 if (inst->src[1].file != IMM)
1977 continue;
1978
1979 /* a * 1.0 = a */
1980 if (inst->src[1].is_one()) {
1981 inst->opcode = BRW_OPCODE_MOV;
1982 inst->src[1] = reg_undef;
1983 progress = true;
1984 break;
1985 }
1986
1987 /* a * 0.0 = 0.0 */
1988 if (inst->src[1].is_zero()) {
1989 inst->opcode = BRW_OPCODE_MOV;
1990 inst->src[0] = inst->src[1];
1991 inst->src[1] = reg_undef;
1992 progress = true;
1993 break;
1994 }
1995
1996 break;
1997 case BRW_OPCODE_ADD:
1998 if (inst->src[1].file != IMM)
1999 continue;
2000
2001 /* a + 0.0 = a */
2002 if (inst->src[1].is_zero()) {
2003 inst->opcode = BRW_OPCODE_MOV;
2004 inst->src[1] = reg_undef;
2005 progress = true;
2006 break;
2007 }
2008 break;
2009 case BRW_OPCODE_OR:
2010 if (inst->src[0].equals(inst->src[1])) {
2011 inst->opcode = BRW_OPCODE_MOV;
2012 inst->src[1] = reg_undef;
2013 progress = true;
2014 break;
2015 }
2016 break;
2017 case BRW_OPCODE_LRP:
2018 if (inst->src[1].equals(inst->src[2])) {
2019 inst->opcode = BRW_OPCODE_MOV;
2020 inst->src[0] = inst->src[1];
2021 inst->src[1] = reg_undef;
2022 inst->src[2] = reg_undef;
2023 progress = true;
2024 break;
2025 }
2026 break;
2027 case BRW_OPCODE_SEL:
2028 if (inst->saturate && inst->src[1].file == IMM) {
2029 switch (inst->conditional_mod) {
2030 case BRW_CONDITIONAL_LE:
2031 case BRW_CONDITIONAL_L:
2032 switch (inst->src[1].type) {
2033 case BRW_REGISTER_TYPE_F:
2034 if (inst->src[1].imm.f >= 1.0f) {
2035 inst->opcode = BRW_OPCODE_MOV;
2036 inst->src[1] = reg_undef;
2037 progress = true;
2038 }
2039 break;
2040 default:
2041 break;
2042 }
2043 break;
2044 case BRW_CONDITIONAL_GE:
2045 case BRW_CONDITIONAL_G:
2046 switch (inst->src[1].type) {
2047 case BRW_REGISTER_TYPE_F:
2048 if (inst->src[1].imm.f <= 0.0f) {
2049 inst->opcode = BRW_OPCODE_MOV;
2050 inst->src[1] = reg_undef;
2051 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2052 progress = true;
2053 }
2054 break;
2055 default:
2056 break;
2057 }
2058 default:
2059 break;
2060 }
2061 }
2062 break;
2063 default:
2064 break;
2065 }
2066 }
2067
2068 return progress;
2069 }
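/* A toy standalone model of the rewrites above, operating on a simplified
 * instruction record rather than the driver's fs_inst: MUL by an immediate
 * 1.0 and ADD of an immediate 0.0 both collapse to a MOV of the other
 * operand, and MUL by an immediate 0.0 collapses to a MOV of 0.0.  All of
 * the demo_* names are illustrative only.
 */
enum demo_op { DEMO_MUL, DEMO_ADD, DEMO_MOV };

struct demo_inst {
   enum demo_op op;
   float src0;          /* stands in for a register operand */
   float src1_imm;      /* the immediate second source, when present */
   bool src1_is_imm;
};

static bool
demo_opt_algebraic(struct demo_inst *inst)
{
   if (!inst->src1_is_imm)
      return false;

   if (inst->op == DEMO_MUL && inst->src1_imm == 1.0f) {
      inst->op = DEMO_MOV;              /* a * 1.0 = a */
      return true;
   }
   if (inst->op == DEMO_MUL && inst->src1_imm == 0.0f) {
      inst->op = DEMO_MOV;              /* a * 0.0 = 0.0 */
      inst->src0 = 0.0f;
      return true;
   }
   if (inst->op == DEMO_ADD && inst->src1_imm == 0.0f) {
      inst->op = DEMO_MOV;              /* a + 0.0 = a */
      return true;
   }
   return false;
}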
2070
2071 bool
2072 fs_visitor::compute_to_mrf()
2073 {
2074 bool progress = false;
2075 int next_ip = 0;
2076
2077 calculate_live_intervals();
2078
2079 foreach_list_safe(node, &this->instructions) {
2080 fs_inst *inst = (fs_inst *)node;
2081
2082 int ip = next_ip;
2083 next_ip++;
2084
2085 if (inst->opcode != BRW_OPCODE_MOV ||
2086 inst->is_partial_write() ||
2087 inst->dst.file != MRF || inst->src[0].file != GRF ||
2088 inst->dst.type != inst->src[0].type ||
2089 inst->src[0].abs || inst->src[0].negate ||
2090 !inst->src[0].is_contiguous() ||
2091 inst->src[0].subreg_offset)
2092 continue;
2093
2094 /* Work out which hardware MRF registers are written by this
2095 * instruction.
2096 */
2097 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2098 int mrf_high;
2099 if (inst->dst.reg & BRW_MRF_COMPR4) {
2100 mrf_high = mrf_low + 4;
2101 } else if (dispatch_width == 16 &&
2102 (!inst->force_uncompressed && !inst->force_sechalf)) {
2103 mrf_high = mrf_low + 1;
2104 } else {
2105 mrf_high = mrf_low;
2106 }
2107
2108 /* Can't compute-to-MRF this GRF if someone else was going to
2109 * read it later.
2110 */
2111 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2112 continue;
2113
2114 /* Found a move of a GRF to a MRF. Let's see if we can go
2115 * rewrite the thing that made this GRF to write into the MRF.
2116 */
2117 fs_inst *scan_inst;
2118 for (scan_inst = (fs_inst *)inst->prev;
2119 scan_inst->prev != NULL;
2120 scan_inst = (fs_inst *)scan_inst->prev) {
2121 if (scan_inst->dst.file == GRF &&
2122 scan_inst->dst.reg == inst->src[0].reg) {
2123 /* Found the last instruction to write the register we want to
2124 * turn into a compute-to-MRF.
2125 */
2126
2127 /* If this one instruction didn't populate all the
2128 * channels, bail. We might be able to rewrite everything
2129 * that writes that reg, but it would require smarter
2130 * tracking to delay the rewriting until complete success.
2131 */
2132 if (scan_inst->is_partial_write())
2133 break;
2134
2135 /* Things returning more than one register would need us to
2136 * understand coalescing out more than one MOV at a time.
2137 */
2138 if (scan_inst->regs_written > 1)
2139 break;
2140
2141 /* SEND instructions can't have MRF as a destination. */
2142 if (scan_inst->mlen)
2143 break;
2144
2145 if (brw->gen == 6) {
2146 /* gen6 math instructions must have the destination be
2147 * GRF, so no compute-to-MRF for them.
2148 */
2149 if (scan_inst->is_math()) {
2150 break;
2151 }
2152 }
2153
2154 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2155 /* Found the creator of our MRF's source value. */
2156 scan_inst->dst.file = MRF;
2157 scan_inst->dst.reg = inst->dst.reg;
2158 scan_inst->saturate |= inst->saturate;
2159 inst->remove();
2160 progress = true;
2161 }
2162 break;
2163 }
2164
2165 /* We don't handle control flow here. Most computation of
2166 * values that end up in MRFs happens shortly before the MRF
2167 * write anyway.
2168 */
2169 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2170 break;
2171
2172 /* An MRF can't be read, so if anything else reads the source GRF
2173 * we wanted to rewrite into the MRF, that stops us.
2174 */
2175 bool interfered = false;
2176 for (int i = 0; i < 3; i++) {
2177 if (scan_inst->src[i].file == GRF &&
2178 scan_inst->src[i].reg == inst->src[0].reg &&
2179 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2180 interfered = true;
2181 }
2182 }
2183 if (interfered)
2184 break;
2185
2186 if (scan_inst->dst.file == MRF) {
2187 /* If somebody else writes our MRF here, we can't
2188 * compute-to-MRF before that.
2189 */
2190 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2191 int scan_mrf_high;
2192
2193 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2194 scan_mrf_high = scan_mrf_low + 4;
2195 } else if (dispatch_width == 16 &&
2196 (!scan_inst->force_uncompressed &&
2197 !scan_inst->force_sechalf)) {
2198 scan_mrf_high = scan_mrf_low + 1;
2199 } else {
2200 scan_mrf_high = scan_mrf_low;
2201 }
2202
2203 if (mrf_low == scan_mrf_low ||
2204 mrf_low == scan_mrf_high ||
2205 mrf_high == scan_mrf_low ||
2206 mrf_high == scan_mrf_high) {
2207 break;
2208 }
2209 }
2210
2211 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2212 /* Found a SEND instruction, which means that there are
2213 * live values in MRFs from base_mrf to base_mrf +
2214 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2215 * above it.
2216 */
2217 if (mrf_low >= scan_inst->base_mrf &&
2218 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2219 break;
2220 }
2221 if (mrf_high >= scan_inst->base_mrf &&
2222 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2223 break;
2224 }
2225 }
2226 }
2227 }
2228
2229 if (progress)
2230 invalidate_live_intervals();
2231
2232 return progress;
2233 }
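/* A standalone sketch of the MRF range computation used twice above.  A
 * candidate MOV and any intervening MRF write each cover a single register,
 * a SIMD16 pair, or a COMPR4 pair four registers apart, and the rewrite is
 * abandoned when the two ranges touch.  The flag bit value below stands in
 * for BRW_MRF_COMPR4 and is assumed for illustration; the demo_* names are
 * not driver API.
 */
#define DEMO_MRF_COMPR4 (1 << 7)   /* assumed stand-in for BRW_MRF_COMPR4 */

struct demo_mrf_range {
   int low, high;
};

static struct demo_mrf_range
demo_mrf_range_written(int dst_reg, int dispatch_width, bool both_halves)
{
   struct demo_mrf_range r;

   r.low = dst_reg & ~DEMO_MRF_COMPR4;
   if (dst_reg & DEMO_MRF_COMPR4)
      r.high = r.low + 4;                   /* COMPR4: m(n) and m(n+4) */
   else if (dispatch_width == 16 && both_halves)
      r.high = r.low + 1;                   /* SIMD16: m(n) and m(n+1) */
   else
      r.high = r.low;                       /* a single MRF register */
   return r;
}

static bool
demo_mrf_ranges_conflict(struct demo_mrf_range a, struct demo_mrf_range b)
{
   return a.low == b.low || a.low == b.high ||
          a.high == b.low || a.high == b.high;
}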
2234
2235 /**
2236 * Walks through basic blocks, looking for repeated MRF writes and
2237 * removing the later ones.
2238 */
2239 bool
2240 fs_visitor::remove_duplicate_mrf_writes()
2241 {
2242 fs_inst *last_mrf_move[16];
2243 bool progress = false;
2244
2245 /* We'd need to update the MRF tracking to handle compressed instructions. */
2246 if (dispatch_width == 16)
2247 return false;
2248
2249 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2250
2251 foreach_list_safe(node, &this->instructions) {
2252 fs_inst *inst = (fs_inst *)node;
2253
2254 if (inst->is_control_flow()) {
2255 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2256 }
2257
2258 if (inst->opcode == BRW_OPCODE_MOV &&
2259 inst->dst.file == MRF) {
2260 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2261 if (prev_inst && inst->equals(prev_inst)) {
2262 inst->remove();
2263 progress = true;
2264 continue;
2265 }
2266 }
2267
2268 /* Clear out the last-write records for MRFs that were overwritten. */
2269 if (inst->dst.file == MRF) {
2270 last_mrf_move[inst->dst.reg] = NULL;
2271 }
2272
2273 if (inst->mlen > 0 && inst->base_mrf != -1) {
2274 /* Found a SEND instruction, which will include two or fewer
2275 * implied MRF writes. We could do better here.
2276 */
2277 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2278 last_mrf_move[inst->base_mrf + i] = NULL;
2279 }
2280 }
2281
2282 /* Clear out any MRF move records whose sources got overwritten. */
2283 if (inst->dst.file == GRF) {
2284 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2285 if (last_mrf_move[i] &&
2286 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2287 last_mrf_move[i] = NULL;
2288 }
2289 }
2290 }
2291
2292 if (inst->opcode == BRW_OPCODE_MOV &&
2293 inst->dst.file == MRF &&
2294 inst->src[0].file == GRF &&
2295 !inst->is_partial_write()) {
2296 last_mrf_move[inst->dst.reg] = inst;
2297 }
2298 }
2299
2300 if (progress)
2301 invalidate_live_intervals();
2302
2303 return progress;
2304 }
2305
2306 static void
2307 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2308 int first_grf, int grf_len)
2309 {
2310 bool inst_simd16 = (dispatch_width > 8 &&
2311 !inst->force_uncompressed &&
2312 !inst->force_sechalf);
2313
2314 /* Clear the flag for registers that actually got read (as expected). */
2315 for (int i = 0; i < 3; i++) {
2316 int grf;
2317 if (inst->src[i].file == GRF) {
2318 grf = inst->src[i].reg;
2319 } else if (inst->src[i].file == HW_REG &&
2320 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2321 grf = inst->src[i].fixed_hw_reg.nr;
2322 } else {
2323 continue;
2324 }
2325
2326 if (grf >= first_grf &&
2327 grf < first_grf + grf_len) {
2328 deps[grf - first_grf] = false;
2329 if (inst_simd16)
2330 deps[grf - first_grf + 1] = false;
2331 }
2332 }
2333 }
2334
2335 /**
2336 * Implements this workaround for the original 965:
2337 *
2338 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2339 * check for post destination dependencies on this instruction, software
2340 * must ensure that there is no destination hazard for the case of ‘write
2341 * followed by a posted write’ shown in the following example.
2342 *
2343 * 1. mov r3 0
2344 * 2. send r3.xy <rest of send instruction>
2345 * 3. mov r2 r3
2346 *
2347 * Due to no post-destination dependency check on the ‘send’, the above
2348 * code sequence could have two instructions (1 and 2) in flight at the
2349 * same time that both consider ‘r3’ as the target of their final writes.
2350 */
2351 void
2352 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2353 {
2354 int reg_size = dispatch_width / 8;
2355 int write_len = inst->regs_written * reg_size;
2356 int first_write_grf = inst->dst.reg;
2357 bool needs_dep[BRW_MAX_MRF];
2358 assert(write_len < (int)sizeof(needs_dep) - 1);
2359
2360 memset(needs_dep, false, sizeof(needs_dep));
2361 memset(needs_dep, true, write_len);
2362
2363 clear_deps_for_inst_src(inst, dispatch_width,
2364 needs_dep, first_write_grf, write_len);
2365
2366 /* Walk backwards looking for writes to registers we're writing which
2367 * aren't read since being written. If we hit the start of the program,
2368 * we assume that there are no outstanding dependencies on entry to the
2369 * program.
2370 */
2371 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2372 scan_inst != NULL;
2373 scan_inst = (fs_inst *)scan_inst->prev) {
2374
2375 /* If we hit control flow, assume that there *are* outstanding
2376 * dependencies, and force their cleanup before our instruction.
2377 */
2378 if (scan_inst->is_control_flow()) {
2379 for (int i = 0; i < write_len; i++) {
2380 if (needs_dep[i]) {
2381 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2382 }
2383 }
2384 return;
2385 }
2386
2387 bool scan_inst_simd16 = (dispatch_width > 8 &&
2388 !scan_inst->force_uncompressed &&
2389 !scan_inst->force_sechalf);
2390
2391 /* We insert our reads as late as possible, on the assumption that any
2392 * non-MOV instruction that might have left us an outstanding
2393 * dependency has more latency than a MOV.
2394 */
2395 if (scan_inst->dst.file == GRF) {
2396 for (int i = 0; i < scan_inst->regs_written; i++) {
2397 int reg = scan_inst->dst.reg + i * reg_size;
2398
2399 if (reg >= first_write_grf &&
2400 reg < first_write_grf + write_len &&
2401 needs_dep[reg - first_write_grf]) {
2402 inst->insert_before(DEP_RESOLVE_MOV(reg));
2403 needs_dep[reg - first_write_grf] = false;
2404 if (scan_inst_simd16)
2405 needs_dep[reg - first_write_grf + 1] = false;
2406 }
2407 }
2408 }
2409
2410 /* Clear the flag for registers that actually got read (as expected). */
2411 clear_deps_for_inst_src(scan_inst, dispatch_width,
2412 needs_dep, first_write_grf, write_len);
2413
2414 /* Continue the loop only if we haven't resolved all the dependencies */
2415 int i;
2416 for (i = 0; i < write_len; i++) {
2417 if (needs_dep[i])
2418 break;
2419 }
2420 if (i == write_len)
2421 return;
2422 }
2423 }
2424
2425 /**
2426 * Implements this workaround for the original 965:
2427 *
2428 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2429 * used as a destination register until after it has been sourced by an
2430 * instruction with a different destination register.
2431 */
2432 void
2433 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2434 {
2435 int write_len = inst->regs_written * dispatch_width / 8;
2436 int first_write_grf = inst->dst.reg;
2437 bool needs_dep[BRW_MAX_MRF];
2438 assert(write_len < (int)sizeof(needs_dep) - 1);
2439
2440 memset(needs_dep, false, sizeof(needs_dep));
2441 memset(needs_dep, true, write_len);
2442 /* Walk forwards looking for writes to registers we're writing which aren't
2443 * read before being written.
2444 */
2445 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2446 !scan_inst->is_tail_sentinel();
2447 scan_inst = (fs_inst *)scan_inst->next) {
2448 /* If we hit control flow, force resolve all remaining dependencies. */
2449 if (scan_inst->is_control_flow()) {
2450 for (int i = 0; i < write_len; i++) {
2451 if (needs_dep[i])
2452 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2453 }
2454 return;
2455 }
2456
2457 /* Clear the flag for registers that actually got read (as expected). */
2458 clear_deps_for_inst_src(scan_inst, dispatch_width,
2459 needs_dep, first_write_grf, write_len);
2460
2461 /* We insert our reads as late as possible since they're reading the
2462 * result of a SEND, which has massive latency.
2463 */
2464 if (scan_inst->dst.file == GRF &&
2465 scan_inst->dst.reg >= first_write_grf &&
2466 scan_inst->dst.reg < first_write_grf + write_len &&
2467 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2468 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2469 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2470 }
2471
2472 /* Continue the loop only if we haven't resolved all the dependencies */
2473 int i;
2474 for (i = 0; i < write_len; i++) {
2475 if (needs_dep[i])
2476 break;
2477 }
2478 if (i == write_len)
2479 return;
2480 }
2481
2482 /* If we hit the end of the program, resolve all remaining dependencies out
2483 * of paranoia.
2484 */
2485 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2486 assert(last_inst->eot);
2487 for (int i = 0; i < write_len; i++) {
2488 if (needs_dep[i])
2489 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2490 }
2491 }
2492
2493 void
2494 fs_visitor::insert_gen4_send_dependency_workarounds()
2495 {
2496 if (brw->gen != 4 || brw->is_g4x)
2497 return;
2498
2499 /* Note that we're done with register allocation, so GRF fs_regs always
2500 * have a .reg_offset of 0.
2501 */
2502
2503 foreach_list_safe(node, &this->instructions) {
2504 fs_inst *inst = (fs_inst *)node;
2505
2506 if (inst->mlen != 0 && inst->dst.file == GRF) {
2507 insert_gen4_pre_send_dependency_workarounds(inst);
2508 insert_gen4_post_send_dependency_workarounds(inst);
2509 }
2510 }
2511 }
2512
2513 /**
2514 * Turns the generic expression-style uniform pull constant load instruction
2515 * into a hardware-specific series of instructions for loading a pull
2516 * constant.
2517 *
2518 * The expression style allows the CSE pass before this to optimize out
2519 * repeated loads from the same offset, and gives the pre-register-allocation
2520 * scheduling full flexibility, while the conversion to native instructions
2521 * allows the post-register-allocation scheduler the best information
2522 * possible.
2523 *
2524 * Note that execution masking for setting up pull constant loads is special:
2525 * the channels that need to be written are unrelated to the current execution
2526 * mask, since a later instruction will use one of the result channels as a
2527 * source operand for all 8 or 16 of its channels.
2528 */
2529 void
2530 fs_visitor::lower_uniform_pull_constant_loads()
2531 {
2532 foreach_list(node, &this->instructions) {
2533 fs_inst *inst = (fs_inst *)node;
2534
2535 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2536 continue;
2537
2538 if (brw->gen >= 7) {
2539 /* The offset arg before was a vec4-aligned byte offset. We need to
2540 * turn it into a dword offset.
2541 */
2542 fs_reg const_offset_reg = inst->src[1];
2543 assert(const_offset_reg.file == IMM &&
2544 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2545 const_offset_reg.imm.u /= 4;
2546 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2547
2548 /* This is actually going to be a MOV, but since only the first dword
2549 * is accessed, we have a special opcode to do just that one. Note
2550 * that this needs to be an operation that will be considered a def
2551 * by live variable analysis, or register allocation will explode.
2552 */
2553 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2554 payload, const_offset_reg);
2555 setup->force_writemask_all = true;
2556
2557 setup->ir = inst->ir;
2558 setup->annotation = inst->annotation;
2559 inst->insert_before(setup);
2560
2561 /* Similarly, this will only populate the first 4 channels of the
2562 * result register (since we only use smear values from 0-3), but we
2563 * don't tell the optimizer.
2564 */
2565 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2566 inst->src[1] = payload;
2567
2568 invalidate_live_intervals();
2569 } else {
2570 /* Before register allocation, we didn't tell the scheduler about the
2571 * MRF we use. We know it's safe to use this MRF because nothing
2572 * else does except for register spill/unspill, which generates and
2573 * uses its MRF within a single IR instruction.
2574 */
2575 inst->base_mrf = 14;
2576 inst->mlen = 1;
2577 }
2578 }
2579 }
2580
2581 void
2582 fs_visitor::dump_instructions()
2583 {
2584 calculate_register_pressure();
2585
2586 int ip = 0, max_pressure = 0;
2587 foreach_list(node, &this->instructions) {
2588 backend_instruction *inst = (backend_instruction *)node;
2589 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2590 fprintf(stderr, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2591 dump_instruction(inst);
2592 ++ip;
2593 }
2594 fprintf(stderr, "Maximum %3d registers live at once.\n", max_pressure);
2595 }
2596
2597 void
2598 fs_visitor::dump_instruction(backend_instruction *be_inst)
2599 {
2600 fs_inst *inst = (fs_inst *)be_inst;
2601
2602 if (inst->predicate) {
2603 fprintf(stderr, "(%cf0.%d) ",
2604 inst->predicate_inverse ? '-' : '+',
2605 inst->flag_subreg);
2606 }
2607
2608 fprintf(stderr, "%s", brw_instruction_name(inst->opcode));
2609 if (inst->saturate)
2610 fprintf(stderr, ".sat");
2611 if (inst->conditional_mod) {
2612 fprintf(stderr, "%s", conditional_modifier[inst->conditional_mod]);
2613 if (!inst->predicate &&
2614 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2615 inst->opcode != BRW_OPCODE_IF &&
2616 inst->opcode != BRW_OPCODE_WHILE))) {
2617 fprintf(stderr, ".f0.%d", inst->flag_subreg);
2618 }
2619 }
2620 fprintf(stderr, " ");
2621
2622
2623 switch (inst->dst.file) {
2624 case GRF:
2625 fprintf(stderr, "vgrf%d", inst->dst.reg);
2626 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2627 inst->dst.subreg_offset)
2628 fprintf(stderr, "+%d.%d",
2629 inst->dst.reg_offset, inst->dst.subreg_offset);
2630 break;
2631 case MRF:
2632 fprintf(stderr, "m%d", inst->dst.reg);
2633 break;
2634 case BAD_FILE:
2635 fprintf(stderr, "(null)");
2636 break;
2637 case UNIFORM:
2638 fprintf(stderr, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
2639 break;
2640 case HW_REG:
2641 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2642 switch (inst->dst.fixed_hw_reg.nr) {
2643 case BRW_ARF_NULL:
2644 fprintf(stderr, "null");
2645 break;
2646 case BRW_ARF_ADDRESS:
2647 fprintf(stderr, "a0.%d", inst->dst.fixed_hw_reg.subnr);
2648 break;
2649 case BRW_ARF_ACCUMULATOR:
2650 fprintf(stderr, "acc%d", inst->dst.fixed_hw_reg.subnr);
2651 break;
2652 case BRW_ARF_FLAG:
2653 fprintf(stderr, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2654 inst->dst.fixed_hw_reg.subnr);
2655 break;
2656 default:
2657 fprintf(stderr, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2658 inst->dst.fixed_hw_reg.subnr);
2659 break;
2660 }
2661 } else {
2662 fprintf(stderr, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
2663 }
2664 if (inst->dst.fixed_hw_reg.subnr)
2665 fprintf(stderr, "+%d", inst->dst.fixed_hw_reg.subnr);
2666 break;
2667 default:
2668 fprintf(stderr, "???");
2669 break;
2670 }
2671 fprintf(stderr, ":%s, ", brw_reg_type_letters(inst->dst.type));
2672
2673 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
2674 if (inst->src[i].negate)
2675 fprintf(stderr, "-");
2676 if (inst->src[i].abs)
2677 fprintf(stderr, "|");
2678 switch (inst->src[i].file) {
2679 case GRF:
2680 fprintf(stderr, "vgrf%d", inst->src[i].reg);
2681 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2682 inst->src[i].subreg_offset)
2683 fprintf(stderr, "+%d.%d", inst->src[i].reg_offset,
2684 inst->src[i].subreg_offset);
2685 break;
2686 case MRF:
2687 fprintf(stderr, "***m%d***", inst->src[i].reg);
2688 break;
2689 case UNIFORM:
2690 fprintf(stderr, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
2691 if (inst->src[i].reladdr) {
2692 fprintf(stderr, "+reladdr");
2693 } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2694 inst->src[i].subreg_offset) {
2695 fprintf(stderr, "+%d.%d", inst->src[i].reg_offset,
2696 inst->src[i].subreg_offset);
2697 }
2698 break;
2699 case BAD_FILE:
2700 fprintf(stderr, "(null)");
2701 break;
2702 case IMM:
2703 switch (inst->src[i].type) {
2704 case BRW_REGISTER_TYPE_F:
2705 fprintf(stderr, "%ff", inst->src[i].imm.f);
2706 break;
2707 case BRW_REGISTER_TYPE_D:
2708 fprintf(stderr, "%dd", inst->src[i].imm.i);
2709 break;
2710 case BRW_REGISTER_TYPE_UD:
2711 fprintf(stderr, "%uu", inst->src[i].imm.u);
2712 break;
2713 default:
2714 fprintf(stderr, "???");
2715 break;
2716 }
2717 break;
2718 case HW_REG:
2719 if (inst->src[i].fixed_hw_reg.negate)
2720 fprintf(stderr, "-");
2721 if (inst->src[i].fixed_hw_reg.abs)
2722 fprintf(stderr, "|");
2723 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2724 switch (inst->src[i].fixed_hw_reg.nr) {
2725 case BRW_ARF_NULL:
2726 fprintf(stderr, "null");
2727 break;
2728 case BRW_ARF_ADDRESS:
2729 fprintf(stderr, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
2730 break;
2731 case BRW_ARF_ACCUMULATOR:
2732 fprintf(stderr, "acc%d", inst->src[i].fixed_hw_reg.subnr);
2733 break;
2734 case BRW_ARF_FLAG:
2735 fprintf(stderr, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2736 inst->src[i].fixed_hw_reg.subnr);
2737 break;
2738 default:
2739 fprintf(stderr, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2740 inst->src[i].fixed_hw_reg.subnr);
2741 break;
2742 }
2743 } else {
2744 fprintf(stderr, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2745 }
2746 if (inst->src[i].fixed_hw_reg.subnr)
2747 fprintf(stderr, "+%d", inst->src[i].fixed_hw_reg.subnr);
2748 if (inst->src[i].fixed_hw_reg.abs)
2749 fprintf(stderr, "|");
2750 break;
2751 default:
2752 fprintf(stderr, "???");
2753 break;
2754 }
2755 if (inst->src[i].abs)
2756 fprintf(stderr, "|");
2757
2758 if (inst->src[i].file != IMM) {
2759 fprintf(stderr, ":%s", brw_reg_type_letters(inst->src[i].type));
2760 }
2761
2762 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
2763 fprintf(stderr, ", ");
2764 }
2765
2766 fprintf(stderr, " ");
2767
2768 if (inst->force_uncompressed)
2769 fprintf(stderr, "1sthalf ");
2770
2771 if (inst->force_sechalf)
2772 fprintf(stderr, "2ndhalf ");
2773
2774 fprintf(stderr, "\n");
2775 }
2776
2777 /**
2778 * Possibly returns an instruction that set up @param reg.
2779 *
2780 * Sometimes we want to take the result of some expression/variable
2781 * dereference tree and rewrite the instruction generating the result
2782 * of the tree. When processing the tree, we know that the
2783 * instructions generated are all writing temporaries that are dead
2784 * outside of this tree. So, if we have some instructions that write
2785 * a temporary, we're free to point that temp write somewhere else.
2786 *
2787 * Note that the returned instruction isn't guaranteed to write only
2788 * reg -- it might be the size=4 destination of a texture instruction.
2789 */
2790 fs_inst *
2791 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2792 fs_inst *end,
2793 const fs_reg &reg)
2794 {
2795 if (end == start ||
2796 end->is_partial_write() ||
2797 reg.reladdr ||
2798 !reg.equals(end->dst)) {
2799 return NULL;
2800 } else {
2801 return end;
2802 }
2803 }
2804
2805 void
2806 fs_visitor::setup_payload_gen6()
2807 {
2808 bool uses_depth =
2809 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2810 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2811
2812 assert(brw->gen >= 6);
2813
2814 /* R0-1: masks, pixel X/Y coordinates. */
2815 c->nr_payload_regs = 2;
2816 /* R2: only for 32-pixel dispatch. */
2817
2818 /* R3-26: barycentric interpolation coordinates. These appear in the
2819 * same order that they appear in the brw_wm_barycentric_interp_mode
2820 * enum. Each set of coordinates occupies 2 registers if dispatch width
2821 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2822 * appear if they were enabled using the "Barycentric Interpolation
2823 * Mode" bits in WM_STATE.
2824 */
2825 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2826 if (barycentric_interp_modes & (1 << i)) {
2827 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2828 c->nr_payload_regs += 2;
2829 if (dispatch_width == 16) {
2830 c->nr_payload_regs += 2;
2831 }
2832 }
2833 }
2834
2835 /* R27: interpolated depth if uses source depth */
2836 if (uses_depth) {
2837 c->source_depth_reg = c->nr_payload_regs;
2838 c->nr_payload_regs++;
2839 if (dispatch_width == 16) {
2840 /* R28: interpolated depth if not SIMD8. */
2841 c->nr_payload_regs++;
2842 }
2843 }
2844 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2845 if (uses_depth) {
2846 c->source_w_reg = c->nr_payload_regs;
2847 c->nr_payload_regs++;
2848 if (dispatch_width == 16) {
2849 /* R30: interpolated W if not SIMD8. */
2850 c->nr_payload_regs++;
2851 }
2852 }
2853
2854 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
2855 /* R31: MSAA position offsets. */
2856 if (c->prog_data.uses_pos_offset) {
2857 c->sample_pos_reg = c->nr_payload_regs;
2858 c->nr_payload_regs++;
2859 }
2860
2861 /* R32: MSAA input coverage mask */
2862 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
2863 assert(brw->gen >= 7);
2864 c->sample_mask_reg = c->nr_payload_regs;
2865 c->nr_payload_regs++;
2866 if (dispatch_width == 16) {
2867 /* R33: input coverage mask if not SIMD8. */
2868 c->nr_payload_regs++;
2869 }
2870 }
2871
2872 /* R34-: bary for 32-pixel. */
2873 /* R58-59: interp W for 32-pixel. */
2874
2875 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2876 c->source_depth_to_render_target = true;
2877 }
2878 }
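/* A standalone sketch of the gen6+ payload sizing above.  Given the enabled
 * barycentric interpolation modes and the depth/W/pos-offset/sample-mask
 * flags, it returns the number of payload registers, mirroring the
 * bookkeeping on c->nr_payload_regs.  The demo_* names are illustrative, and
 * the mode count of 6 is assumed to match
 * BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT.
 */
static int
demo_gen6_payload_regs(unsigned barycentric_interp_modes,
                       bool uses_depth, bool uses_pos_offset,
                       bool uses_sample_mask, int dispatch_width)
{
   const int mode_count = 6;     /* assumed BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT */
   int regs = 2;                 /* R0-1: masks, pixel X/Y coordinates */

   for (int i = 0; i < mode_count; i++) {
      if (barycentric_interp_modes & (1 << i))
         regs += (dispatch_width == 16) ? 4 : 2;   /* barycentric coords */
   }
   if (uses_depth)
      regs += (dispatch_width == 16) ? 2 : 1;      /* interpolated depth */
   if (uses_depth)
      regs += (dispatch_width == 16) ? 2 : 1;      /* interpolated W */
   if (uses_pos_offset)
      regs += 1;                                   /* MSAA position offsets */
   if (uses_sample_mask)
      regs += (dispatch_width == 16) ? 2 : 1;      /* input coverage mask */

   return regs;
}

/* For example, SIMD16 with one barycentric mode and source depth enabled
 * needs 2 + 4 + 2 + 2 = 10 payload registers.
 */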
2879
2880 void
2881 fs_visitor::assign_binding_table_offsets()
2882 {
2883 uint32_t next_binding_table_offset = 0;
2884
2885 /* If there are no color regions, we still perform an FB write to a null
2886 * renderbuffer, which we place at surface index 0.
2887 */
2888 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
2889 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
2890
2891 assign_common_binding_table_offsets(next_binding_table_offset);
2892 }
2893
2894 void
2895 fs_visitor::calculate_register_pressure()
2896 {
2897 invalidate_live_intervals();
2898 calculate_live_intervals();
2899
2900 int num_instructions = 0;
2901 foreach_list(node, &this->instructions) {
2902 ++num_instructions;
2903 }
2904
2905 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
2906
2907 for (int reg = 0; reg < virtual_grf_count; reg++) {
2908 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
2909 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
2910 }
2911 }
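/* A standalone sketch of the pressure computation above: each virtual GRF
 * adds its size to every instruction pointer inside its [start, end] live
 * range, and the peak of those per-ip sums is the "Maximum registers live
 * at once" figure printed by dump_instructions().  The demo_* names are
 * illustrative only.
 */
static int
demo_max_register_pressure(int num_instructions, int num_grfs,
                           const int *grf_start, const int *grf_end,
                           const int *grf_size, int *regs_live_at_ip)
{
   int max_pressure = 0;

   for (int ip = 0; ip < num_instructions; ip++)
      regs_live_at_ip[ip] = 0;

   for (int reg = 0; reg < num_grfs; reg++) {
      for (int ip = grf_start[reg]; ip <= grf_end[reg]; ip++)
         regs_live_at_ip[ip] += grf_size[reg];
   }

   for (int ip = 0; ip < num_instructions; ip++) {
      if (regs_live_at_ip[ip] > max_pressure)
         max_pressure = regs_live_at_ip[ip];
   }

   return max_pressure;
}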
2912
2913 /**
2914 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
2915 *
2916 * The needs_unlit_centroid_workaround ends up producing one of these per
2917 * channel of centroid input, so it's good to clean them up.
2918 *
2919 * An assumption here is that nothing ever modifies the dispatched-pixels
2920 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, which the
2921 * hardware guarantees anyway.
2922 */
2923 void
2924 fs_visitor::opt_drop_redundant_mov_to_flags()
2925 {
2926 bool flag_mov_found[2] = {false};
2927
2928 foreach_list_safe(node, &this->instructions) {
2929 fs_inst *inst = (fs_inst *)node;
2930
2931 if (inst->is_control_flow()) {
2932 memset(flag_mov_found, 0, sizeof(flag_mov_found));
2933 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
2934 if (!flag_mov_found[inst->flag_subreg])
2935 flag_mov_found[inst->flag_subreg] = true;
2936 else
2937 inst->remove();
2938 } else if (inst->writes_flag()) {
2939 flag_mov_found[inst->flag_subreg] = false;
2940 }
2941 }
2942 }
2943
2944 bool
2945 fs_visitor::run()
2946 {
2947 sanity_param_count = fp->Base.Parameters->NumParameters;
2948 bool allocated_without_spills;
2949
2950 assign_binding_table_offsets();
2951
2952 if (brw->gen >= 6)
2953 setup_payload_gen6();
2954 else
2955 setup_payload_gen4();
2956
2957 if (0) {
2958 emit_dummy_fs();
2959 } else {
2960 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2961 emit_shader_time_begin();
2962
2963 calculate_urb_setup();
2964 if (fp->Base.InputsRead > 0) {
2965 if (brw->gen < 6)
2966 emit_interpolation_setup_gen4();
2967 else
2968 emit_interpolation_setup_gen6();
2969 }
2970
2971 /* We handle discards by keeping track of the still-live pixels in f0.1.
2972 * Initialize it with the dispatched pixels.
2973 */
2974 if (fp->UsesKill || c->key.alpha_test_func) {
2975 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2976 discard_init->flag_subreg = 1;
2977 }
2978
2979 /* Generate FS IR for main(). (the visitor only descends into
2980 * functions called "main").
2981 */
2982 if (shader) {
2983 foreach_list(node, &*shader->base.ir) {
2984 ir_instruction *ir = (ir_instruction *)node;
2985 base_ir = ir;
2986 this->result = reg_undef;
2987 ir->accept(this);
2988 }
2989 } else {
2990 emit_fragment_program_code();
2991 }
2992 base_ir = NULL;
2993 if (failed)
2994 return false;
2995
2996 emit(FS_OPCODE_PLACEHOLDER_HALT);
2997
2998 if (c->key.alpha_test_func)
2999 emit_alpha_test();
3000
3001 emit_fb_writes();
3002
3003 split_virtual_grfs();
3004
3005 move_uniform_array_access_to_pull_constants();
3006 assign_constant_locations();
3007 demote_pull_constants();
3008
3009 opt_drop_redundant_mov_to_flags();
3010
3011 bool progress;
3012 do {
3013 progress = false;
3014
3015 compact_virtual_grfs();
3016
3017 progress = remove_duplicate_mrf_writes() || progress;
3018
3019 progress = opt_algebraic() || progress;
3020 progress = opt_cse() || progress;
3021 progress = opt_copy_propagate() || progress;
3022 progress = opt_peephole_predicated_break() || progress;
3023 progress = dead_code_eliminate() || progress;
3024 progress = opt_peephole_sel() || progress;
3025 progress = dead_control_flow_eliminate(this) || progress;
3026 progress = opt_saturate_propagation() || progress;
3027 progress = register_coalesce() || progress;
3028 progress = compute_to_mrf() || progress;
3029 } while (progress);
3030
3031 lower_uniform_pull_constant_loads();
3032
3033 assign_curb_setup();
3034 assign_urb_setup();
3035
3036 static enum instruction_scheduler_mode pre_modes[] = {
3037 SCHEDULE_PRE,
3038 SCHEDULE_PRE_NON_LIFO,
3039 SCHEDULE_PRE_LIFO,
3040 };
3041
3042 /* Try each scheduling heuristic to see if it can successfully register
3043 * allocate without spilling. They should be ordered by decreasing
3044 * performance but increasing likelihood of allocating.
3045 */
3046 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3047 schedule_instructions(pre_modes[i]);
3048
3049 if (0) {
3050 assign_regs_trivial();
3051 allocated_without_spills = true;
3052 } else {
3053 allocated_without_spills = assign_regs(false);
3054 }
3055 if (allocated_without_spills)
3056 break;
3057 }
3058
3059 if (!allocated_without_spills) {
3060 /* We assume that any spilling is worse than just dropping back to
3061 * SIMD8. There's probably actually some intermediate point where
3062 * SIMD16 with a couple of spills is still better.
3063 */
3064 if (dispatch_width == 16) {
3065 fail("Failure to register allocate. Reduce number of "
3066 "live scalar values to avoid this.");
3067 } else {
3068 perf_debug("Fragment shader triggered register spilling. "
3069 "Try reducing the number of live scalar values to "
3070 "improve performance.\n");
3071 }
3072
3073 /* Since we're out of heuristics, just go spill registers until we
3074 * get an allocation.
3075 */
3076 while (!assign_regs(true)) {
3077 if (failed)
3078 break;
3079 }
3080 }
3081 }
3082 assert(force_uncompressed_stack == 0);
3083
3084 /* This must come after all optimization and register allocation, since
3085 * it inserts dead code that happens to have side effects, and it does
3086 * so based on the actual physical registers in use.
3087 */
3088 insert_gen4_send_dependency_workarounds();
3089
3090 if (failed)
3091 return false;
3092
3093 if (!allocated_without_spills)
3094 schedule_instructions(SCHEDULE_POST);
3095
3096 if (dispatch_width == 8)
3097 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3098 else
3099 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3100
3101 /* If any state parameters were appended, then ParameterValues could have
3102 * been realloced, in which case the driver uniform storage set up by
3103 * _mesa_associate_uniform_storage() would point to freed memory. Make
3104 * sure that didn't happen.
3105 */
3106 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3107
3108 return !failed;
3109 }
3110
3111 const unsigned *
3112 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3113 struct gl_fragment_program *fp,
3114 struct gl_shader_program *prog,
3115 unsigned *final_assembly_size)
3116 {
3117 bool start_busy = false;
3118 double start_time = 0;
3119
3120 if (unlikely(brw->perf_debug)) {
3121 start_busy = (brw->batch.last_bo &&
3122 drm_intel_bo_busy(brw->batch.last_bo));
3123 start_time = get_time();
3124 }
3125
3126 struct brw_shader *shader = NULL;
3127 if (prog)
3128 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3129
3130 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3131 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3132
3133 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3134 */
3135 fs_visitor v(brw, c, prog, fp, 8);
3136 if (!v.run()) {
3137 if (prog) {
3138 prog->LinkStatus = false;
3139 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3140 }
3141
3142 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3143 v.fail_msg);
3144
3145 return NULL;
3146 }
3147
3148 exec_list *simd16_instructions = NULL;
3149 fs_visitor v2(brw, c, prog, fp, 16);
3150 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3151 if (!v.simd16_unsupported) {
3152 /* Try a SIMD16 compile */
3153 v2.import_uniforms(&v);
3154 if (!v2.run()) {
3155 perf_debug("SIMD16 shader failed to compile, falling back to "
3156 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3157 } else {
3158 simd16_instructions = &v2.instructions;
3159 }
3160 } else {
3161 perf_debug("SIMD16 shader unsupported, falling back to "
3162 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3163 }
3164 }
3165
3166 const unsigned *assembly = NULL;
3167 if (brw->gen >= 8) {
3168 gen8_fs_generator g(brw, c, prog, fp, v.do_dual_src);
3169 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3170 final_assembly_size);
3171 } else {
3172 fs_generator g(brw, c, prog, fp, v.do_dual_src);
3173 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3174 final_assembly_size);
3175 }
3176
3177 if (unlikely(brw->perf_debug) && shader) {
3178 if (shader->compiled_once)
3179 brw_wm_debug_recompile(brw, prog, &c->key);
3180 shader->compiled_once = true;
3181
3182 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3183 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3184 (get_time() - start_time) * 1000);
3185 }
3186 }
3187
3188 return assembly;
3189 }
3190
3191 bool
3192 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3193 {
3194 struct brw_context *brw = brw_context(ctx);
3195 struct brw_wm_prog_key key;
3196
3197 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3198 return true;
3199
3200 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3201 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3202 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3203 bool program_uses_dfdy = fp->UsesDFdy;
3204
3205 memset(&key, 0, sizeof(key));
3206
3207 if (brw->gen < 6) {
3208 if (fp->UsesKill)
3209 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3210
3211 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3212 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3213
3214 /* Just assume depth testing. */
3215 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3216 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3217 }
3218
3219 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3220 BRW_FS_VARYING_INPUT_MASK) > 16)
3221 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3222
3223 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3224
3225 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3226 for (unsigned i = 0; i < sampler_count; i++) {
3227 if (fp->Base.ShadowSamplers & (1 << i)) {
3228 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3229 key.tex.swizzles[i] =
3230 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3231 } else {
3232 /* Color sampler: assume no swizzling. */
3233 key.tex.swizzles[i] = SWIZZLE_XYZW;
3234 }
3235 }
3236
3237 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3238 key.drawable_height = ctx->DrawBuffer->Height;
3239 }
3240
3241 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3242 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3243 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3244
3245 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3246 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3247 key.nr_color_regions > 1;
3248 }
3249
3250 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3251 * quality of the derivatives is likely to be determined by the driconf
3252 * option.
3253 */
3254 key.high_quality_derivatives = brw->disable_derivative_optimization;
3255
3256 key.program_string_id = bfp->id;
3257
3258 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3259 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3260
3261 bool success = do_wm_prog(brw, prog, bfp, &key);
3262
3263 brw->wm.base.prog_offset = old_prog_offset;
3264 brw->wm.prog_data = old_prog_data;
3265
3266 return success;
3267 }