i965: Generalize fs_generator further
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_cfg.h"
50 #include "brw_dead_control_flow.h"
51 #include "main/uniforms.h"
52 #include "brw_fs_live_variables.h"
53 #include "glsl/glsl_types.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 fs_reg *src, int sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->opcode = opcode;
62 this->dst = dst;
63 this->src = src;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (int i = 0; i < sources; ++i) {
79 if (src[i].file != GRF)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (int i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 assert(this->src[i].width > 0);
101 if (this->src[i].width == 1) {
102 this->src[i].effective_width = this->exec_size;
103 } else {
104 this->src[i].effective_width = this->src[i].width;
105 }
106 break;
107 case IMM:
108 case UNIFORM:
109 this->src[i].effective_width = this->exec_size;
110 break;
111 default:
112 unreachable("Invalid source register file");
113 }
114 }
115 this->dst.effective_width = this->exec_size;
116
117 this->conditional_mod = BRW_CONDITIONAL_NONE;
118
119 /* This will be the case for almost all instructions. */
120 switch (dst.file) {
121 case GRF:
122 case HW_REG:
123 case MRF:
124 this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
125 break;
126 case BAD_FILE:
127 this->regs_written = 0;
128 break;
129 case IMM:
130 case UNIFORM:
131 unreachable("Invalid destination register file");
132 default:
133 unreachable("Invalid register file");
134 }
135
136 this->writes_accumulator = false;
137 }
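/* Worked example (added for clarity, not in the original source): constructing
 * a MOV with exec_size == 0 and a width-16, stride-1 float GRF destination
 * infers exec_size = 16 from the destination, and regs_written becomes
 * (16 * 1 * 4 + 31) / 32 = 2 GRFs.
 */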
138
139 fs_inst::fs_inst()
140 {
141 fs_reg *src = ralloc_array(this, fs_reg, 3);
142 init(BRW_OPCODE_NOP, 8, dst, src, 0);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
146 {
147 fs_reg *src = ralloc_array(this, fs_reg, 3);
148 init(opcode, exec_size, reg_undef, src, 0);
149 }
150
151 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
152 {
153 fs_reg *src = ralloc_array(this, fs_reg, 3);
154 init(opcode, 0, dst, src, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 fs_reg *src = ralloc_array(this, fs_reg, 3);
161 src[0] = src0;
162 init(opcode, exec_size, dst, src, 1);
163 }
164
165 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
166 {
167 fs_reg *src = ralloc_array(this, fs_reg, 3);
168 src[0] = src0;
169 init(opcode, 0, dst, src, 1);
170 }
171
172 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
173 const fs_reg &src0, const fs_reg &src1)
174 {
175 fs_reg *src = ralloc_array(this, fs_reg, 3);
176 src[0] = src0;
177 src[1] = src1;
178 init(opcode, exec_size, dst, src, 2);
179 }
180
181 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
182 const fs_reg &src1)
183 {
184 fs_reg *src = ralloc_array(this, fs_reg, 3);
185 src[0] = src0;
186 src[1] = src1;
187 init(opcode, 0, dst, src, 2);
188 }
189
190 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
191 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
192 {
193 fs_reg *src = ralloc_array(this, fs_reg, 3);
194 src[0] = src0;
195 src[1] = src1;
196 src[2] = src2;
197 init(opcode, exec_size, dst, src, 3);
198 }
199
200 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
201 const fs_reg &src1, const fs_reg &src2)
202 {
203 fs_reg *src = ralloc_array(this, fs_reg, 3);
204 src[0] = src0;
205 src[1] = src1;
206 src[2] = src2;
207 init(opcode, 0, dst, src, 3);
208 }
209
210 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
211 {
212 init(opcode, 0, dst, src, sources);
213 }
214
215 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
216 fs_reg src[], int sources)
217 {
218 init(opcode, exec_width, dst, src, sources);
219 }
220
221 fs_inst::fs_inst(const fs_inst &that)
222 {
223 memcpy(this, &that, sizeof(that));
224
225 this->src = ralloc_array(this, fs_reg, that.sources);
226
227 for (int i = 0; i < that.sources; i++)
228 this->src[i] = that.src[i];
229 }
230
231 void
232 fs_inst::resize_sources(uint8_t num_sources)
233 {
234 if (this->sources != num_sources) {
235 this->src = reralloc(this, this->src, fs_reg, num_sources);
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
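/* Note (added; not in the original source): each ALUn(op) line above expands
 * to an fs_visitor helper, e.g. ALU2(ADD) defines fs_visitor::ADD(dst, src0,
 * src1), which allocates a new fs_inst(BRW_OPCODE_ADD, ...) out of mem_ctx
 * but does not emit it -- callers pass the result to emit().
 */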
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 * gen5 does the comparison on the execution type (resolved source types),
341 * so dst type doesn't matter. gen6 does comparison and then uses the
342 * result as if it was the dst type with no conversion, which happens to
343 * mostly work out for float-interpreted-as-int since our comparisons are
344 * for >0, =0, <0.
345 */
346 if (brw->gen == 4) {
347 dst.type = src0.type;
348 if (dst.file == HW_REG)
349 dst.fixed_hw_reg.type = dst.type;
350 }
351
352 resolve_ud_negate(&src0);
353 resolve_ud_negate(&src1);
354
355 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
356 inst->conditional_mod = condition;
357
358 return inst;
359 }
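/* Usage sketch (illustrative, not from the original source; a and b are
 * stand-in fs_regs):
 *
 *    emit(CMP(reg_null_d, a, b, BRW_CONDITIONAL_GE));
 *    emit(IF(BRW_PREDICATE_NORMAL));
 *    ...
 *    emit(BRW_OPCODE_ENDIF);
 *
 * The CMP updates the flag register, and the IF then predicates on it.
 */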
360
361 fs_inst *
362 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
363 {
364 uint8_t exec_size = dst.width;
365 for (int i = 0; i < sources; ++i) {
366 assert(src[i].width % dst.width == 0);
367 if (src[i].width > exec_size)
368 exec_size = src[i].width;
369 }
370
371 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
372 dst, src, sources);
373 inst->regs_written = 0;
374 for (int i = 0; i < sources; ++i) {
375 /* The LOAD_PAYLOAD instruction only really makes sense if we are
376 * dealing with whole registers. If this ever changes, we can deal
377 * with it later.
378 */
379 int size = src[i].effective_width * type_sz(src[i].type);
380 assert(size % 32 == 0);
381 inst->regs_written += (size + 31) / 32;
382 }
383
384 return inst;
385 }
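/* Example (added, illustrative): with a width-16 destination and two width-16
 * float sources, exec_size stays 16 and regs_written is
 * (16 * 4) / 32 + (16 * 4) / 32 = 4, i.e. the payload fills four GRFs.
 */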
386
387 exec_list
388 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
389 const fs_reg &surf_index,
390 const fs_reg &varying_offset,
391 uint32_t const_offset)
392 {
393 exec_list instructions;
394 fs_inst *inst;
395
396 /* We have our constant surface use a pitch of 4 bytes, so our index can
397 * be any component of a vector, and then we load 4 contiguous
398 * components starting from that.
399 *
400 * We break down the const_offset to a portion added to the variable
401 * offset and a portion done using reg_offset, which means that if you
402 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
403 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
404 * CSE can later notice that those loads are all the same and eliminate
405 * the redundant ones.
406 */
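   /* For instance (illustrative numbers): with const_offset == 22 the ADD
    * below uses 22 & ~3 == 20, and the final MOV reads component
    * (22 & 3) == 2 of the vec4 result (times `scale' on gen4 SIMD8).
    */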
407 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
408 instructions.push_tail(ADD(vec4_offset,
409 varying_offset, fs_reg(const_offset & ~3)));
410
411 int scale = 1;
412 if (brw->gen == 4 && dst.width == 8) {
413 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
414 * u, v, r) as parameters, or we can just use the SIMD16 message
415 * consisting of (header, u). We choose the second, at the cost of a
416 * longer return length.
417 */
418 scale = 2;
419 }
420
421 enum opcode op;
422 if (brw->gen >= 7)
423 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
424 else
425 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
426
427 assert(dst.width % 8 == 0);
428 int regs_written = 4 * (dst.width / 8) * scale;
429 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
430 dst.type, dst.width);
431 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
432 inst->regs_written = regs_written;
433 instructions.push_tail(inst);
434
435 if (brw->gen < 7) {
436 inst->base_mrf = 13;
437 inst->header_present = true;
438 if (brw->gen == 4)
439 inst->mlen = 3;
440 else
441 inst->mlen = 1 + dispatch_width / 8;
442 }
443
444 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
445 instructions.push_tail(MOV(dst, result));
446
447 return instructions;
448 }
449
450 /**
451 * A helper for MOV generation for fixing up broken hardware SEND dependency
452 * handling.
453 */
454 fs_inst *
455 fs_visitor::DEP_RESOLVE_MOV(int grf)
456 {
457 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
458
459 inst->ir = NULL;
460 inst->annotation = "send dependency resolve";
461
462 /* The caller always wants uncompressed to emit the minimal extra
463 * dependencies, and to avoid having to deal with aligning its regs to 2.
464 */
465 inst->exec_size = 8;
466
467 return inst;
468 }
469
470 bool
471 fs_inst::equals(fs_inst *inst) const
472 {
473 return (opcode == inst->opcode &&
474 dst.equals(inst->dst) &&
475 src[0].equals(inst->src[0]) &&
476 src[1].equals(inst->src[1]) &&
477 src[2].equals(inst->src[2]) &&
478 saturate == inst->saturate &&
479 predicate == inst->predicate &&
480 conditional_mod == inst->conditional_mod &&
481 mlen == inst->mlen &&
482 base_mrf == inst->base_mrf &&
483 target == inst->target &&
484 eot == inst->eot &&
485 header_present == inst->header_present &&
486 shadow_compare == inst->shadow_compare &&
487 exec_size == inst->exec_size &&
488 offset == inst->offset);
489 }
490
491 bool
492 fs_inst::overwrites_reg(const fs_reg &reg) const
493 {
494 return (reg.file == dst.file &&
495 reg.reg == dst.reg &&
496 reg.reg_offset >= dst.reg_offset &&
497 reg.reg_offset < dst.reg_offset + regs_written);
498 }
499
500 bool
501 fs_inst::is_send_from_grf() const
502 {
503 switch (opcode) {
504 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
505 case SHADER_OPCODE_SHADER_TIME_ADD:
506 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
507 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
508 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
509 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
510 case SHADER_OPCODE_UNTYPED_ATOMIC:
511 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
512 return true;
513 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
514 return src[1].file == GRF;
515 case FS_OPCODE_FB_WRITE:
516 return src[0].file == GRF;
517 default:
518 if (is_tex())
519 return src[0].file == GRF;
520
521 return false;
522 }
523 }
524
525 bool
526 fs_inst::can_do_source_mods(struct brw_context *brw)
527 {
528 if (brw->gen == 6 && is_math())
529 return false;
530
531 if (is_send_from_grf())
532 return false;
533
534 if (!backend_instruction::can_do_source_mods())
535 return false;
536
537 return true;
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Vector float immediate value constructor. */
585 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
586 {
587 init();
588 this->file = IMM;
589 this->type = BRW_REGISTER_TYPE_VF;
590 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
591 (vf1 << 8) |
592 (vf2 << 16) |
593 (vf3 << 24);
594 }
595
596 /** Fixed brw_reg. */
597 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
598 {
599 init();
600 this->file = HW_REG;
601 this->fixed_hw_reg = fixed_hw_reg;
602 this->type = fixed_hw_reg.type;
603 this->width = 1 << fixed_hw_reg.width;
604 }
605
606 bool
607 fs_reg::equals(const fs_reg &r) const
608 {
609 return (file == r.file &&
610 reg == r.reg &&
611 reg_offset == r.reg_offset &&
612 subreg_offset == r.subreg_offset &&
613 type == r.type &&
614 negate == r.negate &&
615 abs == r.abs &&
616 !reladdr && !r.reladdr &&
617 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
618 width == r.width &&
619 stride == r.stride);
620 }
621
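/* Descriptive note (added): set_smear() restricts the register to a single
 * component -- it offsets to component `subreg' and forces stride 0 so every
 * channel reads the same dword.  get_timestamp() below uses it to pick out
 * individual timestamp fields.
 */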
622 fs_reg &
623 fs_reg::set_smear(unsigned subreg)
624 {
625 assert(file != HW_REG && file != IMM);
626 subreg_offset = subreg * type_sz(type);
627 stride = 0;
628 return *this;
629 }
630
631 bool
632 fs_reg::is_contiguous() const
633 {
634 return stride == 1;
635 }
636
637 int
638 fs_visitor::type_size(const struct glsl_type *type)
639 {
640 unsigned int size, i;
641
642 switch (type->base_type) {
643 case GLSL_TYPE_UINT:
644 case GLSL_TYPE_INT:
645 case GLSL_TYPE_FLOAT:
646 case GLSL_TYPE_BOOL:
647 return type->components();
648 case GLSL_TYPE_ARRAY:
649 return type_size(type->fields.array) * type->length;
650 case GLSL_TYPE_STRUCT:
651 size = 0;
652 for (i = 0; i < type->length; i++) {
653 size += type_size(type->fields.structure[i].type);
654 }
655 return size;
656 case GLSL_TYPE_SAMPLER:
657 /* Samplers take up no register space, since they're baked in at
658 * link time.
659 */
660 return 0;
661 case GLSL_TYPE_ATOMIC_UINT:
662 return 0;
663 case GLSL_TYPE_IMAGE:
664 case GLSL_TYPE_VOID:
665 case GLSL_TYPE_ERROR:
666 case GLSL_TYPE_INTERFACE:
667 unreachable("not reached");
668 }
669
670 return 0;
671 }
672
673 fs_reg
674 fs_visitor::get_timestamp()
675 {
676 assert(brw->gen >= 7);
677
678 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
679 BRW_ARF_TIMESTAMP,
680 0),
681 BRW_REGISTER_TYPE_UD));
682
683 fs_reg dst = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 4);
684
685 fs_inst *mov = emit(MOV(dst, ts));
686 /* We want to read the 3 fields we care about even if it's not enabled in
687 * the dispatch.
688 */
689 mov->force_writemask_all = true;
690
691 /* The caller wants the low 32 bits of the timestamp. Since it's running
692    * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
693 * which is plenty of time for our purposes. It is identical across the
694 * EUs, but since it's tracking GPU core speed it will increment at a
695 * varying rate as render P-states change.
696 *
697 * The caller could also check if render P-states have changed (or anything
698 * else that might disrupt timing) by setting smear to 2 and checking if
699 * that field is != 0.
700 */
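   /* Added note: 2^32 cycles / ~1.2e9 cycles per second is roughly 3.6
    * seconds, hence the ~3 second rollover mentioned above.
    */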
701 dst.set_smear(0);
702
703 return dst;
704 }
705
706 void
707 fs_visitor::emit_shader_time_begin()
708 {
709 current_annotation = "shader time start";
710 shader_start_time = get_timestamp();
711 }
712
713 void
714 fs_visitor::emit_shader_time_end()
715 {
716 current_annotation = "shader time end";
717
718 enum shader_time_shader_type type, written_type, reset_type;
719 if (dispatch_width == 8) {
720 type = ST_FS8;
721 written_type = ST_FS8_WRITTEN;
722 reset_type = ST_FS8_RESET;
723 } else {
724 assert(dispatch_width == 16);
725 type = ST_FS16;
726 written_type = ST_FS16_WRITTEN;
727 reset_type = ST_FS16_RESET;
728 }
729
730 fs_reg shader_end_time = get_timestamp();
731
732 /* Check that there weren't any timestamp reset events (assuming these
733 * were the only two timestamp reads that happened).
734 */
735 fs_reg reset = shader_end_time;
736 reset.set_smear(2);
737 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
738 test->conditional_mod = BRW_CONDITIONAL_Z;
739 emit(IF(BRW_PREDICATE_NORMAL));
740
741 fs_reg start = shader_start_time;
742 start.negate = true;
743 fs_reg diff = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 1);
744 emit(ADD(diff, start, shader_end_time));
745
746 /* If there were no instructions between the two timestamp gets, the diff
747    * is 2 cycles.  Subtract that overhead so it doesn't skew measurements of
748    * the time taken by individual instructions.
749 */
750 emit(ADD(diff, diff, fs_reg(-2u)));
751
752 emit_shader_time_write(type, diff);
753 emit_shader_time_write(written_type, fs_reg(1u));
754 emit(BRW_OPCODE_ELSE);
755 emit_shader_time_write(reset_type, fs_reg(1u));
756 emit(BRW_OPCODE_ENDIF);
757 }
758
759 void
760 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
761 fs_reg value)
762 {
763 int shader_time_index =
764 brw_get_shader_time_index(brw, shader_prog, prog, type);
765 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
766
767 fs_reg payload;
768 if (dispatch_width == 8)
769 payload = fs_reg(this, glsl_type::uvec2_type);
770 else
771 payload = fs_reg(this, glsl_type::uint_type);
772
773 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
774 fs_reg(), payload, offset, value));
775 }
776
777 void
778 fs_visitor::vfail(const char *format, va_list va)
779 {
780 char *msg;
781
782 if (failed)
783 return;
784
785 failed = true;
786
787 msg = ralloc_vasprintf(mem_ctx, format, va);
788 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
789
790 this->fail_msg = msg;
791
792 if (INTEL_DEBUG & DEBUG_WM) {
793 fprintf(stderr, "%s", msg);
794 }
795 }
796
797 void
798 fs_visitor::fail(const char *format, ...)
799 {
800 va_list va;
801
802 va_start(va, format);
803 vfail(format, va);
804 va_end(va);
805 }
806
807 /**
808 * Mark this program as impossible to compile in SIMD16 mode.
809 *
810 * During the SIMD8 compile (which happens first), we can detect and flag
811 * things that are unsupported in SIMD16 mode, so the compiler can skip
812 * the SIMD16 compile altogether.
813 *
814 * During a SIMD16 compile (if one happens anyway), this just calls fail().
815 */
816 void
817 fs_visitor::no16(const char *format, ...)
818 {
819 va_list va;
820
821 va_start(va, format);
822
823 if (dispatch_width == 16) {
824 vfail(format, va);
825 } else {
826 simd16_unsupported = true;
827
828 if (brw->perf_debug) {
829 if (no16_msg)
830 ralloc_vasprintf_append(&no16_msg, format, va);
831 else
832 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
833 }
834 }
835
836 va_end(va);
837 }
838
839 fs_inst *
840 fs_visitor::emit(enum opcode opcode)
841 {
842 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
843 }
844
845 fs_inst *
846 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
847 {
848 return emit(new(mem_ctx) fs_inst(opcode, dst));
849 }
850
851 fs_inst *
852 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
853 {
854 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
855 }
856
857 fs_inst *
858 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
859 const fs_reg &src1)
860 {
861 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
862 }
863
864 fs_inst *
865 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
866 const fs_reg &src1, const fs_reg &src2)
867 {
868 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
869 }
870
871 fs_inst *
872 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
873 fs_reg src[], int sources)
874 {
875 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
876 }
877
878 /**
879 * Returns true if the instruction has a flag that means it won't
880 * update an entire destination register.
881 *
882 * For example, dead code elimination and live variable analysis want to know
883 * when a write to a variable screens off any preceding values that were in
884 * it.
885 */
886 bool
887 fs_inst::is_partial_write() const
888 {
889 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
890 (this->dst.width * type_sz(this->dst.type)) < 32 ||
891 !this->dst.is_contiguous());
892 }
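/* Examples (added, illustrative): a MOV predicated on f0.0, a write to a
 * width-4 float destination (16 bytes < 32), or a destination with stride 2
 * all count as partial writes; an unpredicated SIMD8 float write with
 * stride 1 does not.
 */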
893
894 int
895 fs_inst::regs_read(fs_visitor *v, int arg) const
896 {
897 if (is_tex() && arg == 0 && src[0].file == GRF) {
898 return mlen;
899 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
900 return mlen;
901 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
902 return mlen;
903 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
904 return mlen;
905 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
906 return mlen;
907 }
908
909 switch (src[arg].file) {
910 case BAD_FILE:
911 case UNIFORM:
912 case IMM:
913 return 1;
914 case GRF:
915 case HW_REG:
916 if (src[arg].stride == 0) {
917 return 1;
918 } else {
919 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
920 return (size + 31) / 32;
921 }
922 case MRF:
923 unreachable("MRF registers are not allowed as sources");
924 default:
925 unreachable("Invalid register file");
926 }
927 }
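/* Example (added, illustrative): a width-16, stride-1 float GRF source reads
 * (16 * 1 * 4 + 31) / 32 = 2 registers, while any stride-0 (scalar) source
 * counts as a single register.
 */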
928
929 bool
930 fs_inst::reads_flag() const
931 {
932 return predicate;
933 }
934
935 bool
936 fs_inst::writes_flag() const
937 {
938 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
939 opcode != BRW_OPCODE_IF &&
940 opcode != BRW_OPCODE_WHILE)) ||
941 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
942 }
943
944 /**
945 * Returns how many MRFs an FS opcode will write over.
946 *
947 * Note that this is not the 0 or 1 implied writes in an actual gen
948 * instruction -- the FS opcodes often generate MOVs in addition.
949 */
950 int
951 fs_visitor::implied_mrf_writes(fs_inst *inst)
952 {
953 if (inst->mlen == 0)
954 return 0;
955
956 if (inst->base_mrf == -1)
957 return 0;
958
959 switch (inst->opcode) {
960 case SHADER_OPCODE_RCP:
961 case SHADER_OPCODE_RSQ:
962 case SHADER_OPCODE_SQRT:
963 case SHADER_OPCODE_EXP2:
964 case SHADER_OPCODE_LOG2:
965 case SHADER_OPCODE_SIN:
966 case SHADER_OPCODE_COS:
967 return 1 * dispatch_width / 8;
968 case SHADER_OPCODE_POW:
969 case SHADER_OPCODE_INT_QUOTIENT:
970 case SHADER_OPCODE_INT_REMAINDER:
971 return 2 * dispatch_width / 8;
972 case SHADER_OPCODE_TEX:
973 case FS_OPCODE_TXB:
974 case SHADER_OPCODE_TXD:
975 case SHADER_OPCODE_TXF:
976 case SHADER_OPCODE_TXF_CMS:
977 case SHADER_OPCODE_TXF_MCS:
978 case SHADER_OPCODE_TG4:
979 case SHADER_OPCODE_TG4_OFFSET:
980 case SHADER_OPCODE_TXL:
981 case SHADER_OPCODE_TXS:
982 case SHADER_OPCODE_LOD:
983 return 1;
984 case FS_OPCODE_FB_WRITE:
985 return 2;
986 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
987 case SHADER_OPCODE_GEN4_SCRATCH_READ:
988 return 1;
989 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
990 return inst->mlen;
991 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
992 return 2;
993 case SHADER_OPCODE_UNTYPED_ATOMIC:
994 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
995 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
996 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
997 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
998 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
999 return 0;
1000 default:
1001 unreachable("not reached");
1002 }
1003 }
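/* Example (added, illustrative): a SIMD16 SHADER_OPCODE_POW on Gen4/5, whose
 * payload lives in MRFs, reports 2 * 16 / 8 = 4 MRF registers written.
 */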
1004
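/** Allocate a new virtual GRF of the given size (in registers) and return
 *  its index.  (Descriptive comment added: the size array below grows by
 *  doubling as needed.)
 */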
1005 int
1006 fs_visitor::virtual_grf_alloc(int size)
1007 {
1008 if (virtual_grf_array_size <= virtual_grf_count) {
1009 if (virtual_grf_array_size == 0)
1010 virtual_grf_array_size = 16;
1011 else
1012 virtual_grf_array_size *= 2;
1013 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
1014 virtual_grf_array_size);
1015 }
1016 virtual_grf_sizes[virtual_grf_count] = size;
1017 return virtual_grf_count++;
1018 }
1019
1020 /** Fixed HW reg constructor. */
1021 fs_reg::fs_reg(enum register_file file, int reg)
1022 {
1023 init();
1024 this->file = file;
1025 this->reg = reg;
1026 this->type = BRW_REGISTER_TYPE_F;
1027
1028 switch (file) {
1029 case UNIFORM:
1030 this->width = 1;
1031 break;
1032 default:
1033 this->width = 8;
1034 }
1035 }
1036
1037 /** Fixed HW reg constructor. */
1038 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1039 {
1040 init();
1041 this->file = file;
1042 this->reg = reg;
1043 this->type = type;
1044
1045 switch (file) {
1046 case UNIFORM:
1047 this->width = 1;
1048 break;
1049 default:
1050 this->width = 8;
1051 }
1052 }
1053
1054 /** Fixed HW reg constructor. */
1055 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1056 uint8_t width)
1057 {
1058 init();
1059 this->file = file;
1060 this->reg = reg;
1061 this->type = type;
1062 this->width = width;
1063 }
1064
1065 /** Automatic reg constructor. */
1066 fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type)
1067 {
1068 init();
1069 int reg_width = v->dispatch_width / 8;
1070
1071 this->file = GRF;
1072 this->reg = v->virtual_grf_alloc(v->type_size(type) * reg_width);
1073 this->reg_offset = 0;
1074 this->type = brw_type_for_base_type(type);
1075 this->width = v->dispatch_width;
1076 assert(this->width == 8 || this->width == 16);
1077 }
1078
1079 fs_reg *
1080 fs_visitor::variable_storage(ir_variable *var)
1081 {
1082 return (fs_reg *)hash_table_find(this->variable_ht, var);
1083 }
1084
1085 void
1086 import_uniforms_callback(const void *key,
1087 void *data,
1088 void *closure)
1089 {
1090 struct hash_table *dst_ht = (struct hash_table *)closure;
1091 const fs_reg *reg = (const fs_reg *)data;
1092
1093 if (reg->file != UNIFORM)
1094 return;
1095
1096 hash_table_insert(dst_ht, data, key);
1097 }
1098
1099 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1100  * This brings in those uniform definitions.
1101 */
1102 void
1103 fs_visitor::import_uniforms(fs_visitor *v)
1104 {
1105 hash_table_call_foreach(v->variable_ht,
1106 import_uniforms_callback,
1107 variable_ht);
1108 this->push_constant_loc = v->push_constant_loc;
1109 this->pull_constant_loc = v->pull_constant_loc;
1110 this->uniforms = v->uniforms;
1111 this->param_size = v->param_size;
1112 }
1113
1114 /* Our support for uniforms is piggy-backed on the struct
1115 * gl_fragment_program, because that's where the values actually
1116 * get stored, rather than in some global gl_shader_program uniform
1117 * store.
1118 */
1119 void
1120 fs_visitor::setup_uniform_values(ir_variable *ir)
1121 {
1122 int namelen = strlen(ir->name);
1123
1124 /* The data for our (non-builtin) uniforms is stored in a series of
1125 * gl_uniform_driver_storage structs for each subcomponent that
1126 * glGetUniformLocation() could name. We know it's been set up in the same
1127 * order we'd walk the type, so walk the list of storage and find anything
1128 * with our name, or the prefix of a component that starts with our name.
1129 */
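   /* For example (added, illustrative): for a uniform named "s", this loop
    * matches storage entries named exactly "s" as well as "s.field" or
    * "s[0]", but skips an unrelated uniform such as "s2", thanks to the
    * terminator check below.
    */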
1130 unsigned params_before = uniforms;
1131 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1132 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1133
1134 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1135 (storage->name[namelen] != 0 &&
1136 storage->name[namelen] != '.' &&
1137 storage->name[namelen] != '[')) {
1138 continue;
1139 }
1140
1141 unsigned slots = storage->type->component_slots();
1142 if (storage->array_elements)
1143 slots *= storage->array_elements;
1144
1145 for (unsigned i = 0; i < slots; i++) {
1146 stage_prog_data->param[uniforms++] = &storage->storage[i];
1147 }
1148 }
1149
1150 /* Make sure we actually initialized the right amount of stuff here. */
1151 assert(params_before + ir->type->component_slots() == uniforms);
1152 (void)params_before;
1153 }
1154
1155
1156 /* Our support for builtin uniforms is even scarier than non-builtin.
1157 * It sits on top of the PROG_STATE_VAR parameters that are
1158 * automatically updated from GL context state.
1159 */
1160 void
1161 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1162 {
1163 const ir_state_slot *const slots = ir->get_state_slots();
1164 assert(slots != NULL);
1165
1166 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1167 /* This state reference has already been setup by ir_to_mesa, but we'll
1168 * get the same index back here.
1169 */
1170 int index = _mesa_add_state_reference(this->prog->Parameters,
1171 (gl_state_index *)slots[i].tokens);
1172
1173 /* Add each of the unique swizzles of the element as a parameter.
1174 * This'll end up matching the expected layout of the
1175 * array/matrix/structure we're trying to fill in.
1176 */
1177 int last_swiz = -1;
1178 for (unsigned int j = 0; j < 4; j++) {
1179 int swiz = GET_SWZ(slots[i].swizzle, j);
1180 if (swiz == last_swiz)
1181 break;
1182 last_swiz = swiz;
1183
1184 stage_prog_data->param[uniforms++] =
1185 &prog->Parameters->ParameterValues[index][swiz];
1186 }
1187 }
1188 }
1189
1190 fs_reg *
1191 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1192 {
1193 assert(stage == MESA_SHADER_FRAGMENT);
1194 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1195 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1196 fs_reg wpos = *reg;
1197 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1198
1199 /* gl_FragCoord.x */
1200 if (ir->data.pixel_center_integer) {
1201 emit(MOV(wpos, this->pixel_x));
1202 } else {
1203 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1204 }
1205 wpos = offset(wpos, 1);
1206
1207 /* gl_FragCoord.y */
1208 if (!flip && ir->data.pixel_center_integer) {
1209 emit(MOV(wpos, this->pixel_y));
1210 } else {
1211 fs_reg pixel_y = this->pixel_y;
1212 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1213
1214 if (flip) {
1215 pixel_y.negate = true;
1216 offset += key->drawable_height - 1.0;
1217 }
1218
1219 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1220 }
1221 wpos = offset(wpos, 1);
1222
1223 /* gl_FragCoord.z */
1224 if (brw->gen >= 6) {
1225 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1226 } else {
1227 emit(FS_OPCODE_LINTERP, wpos,
1228 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1229 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1230 interp_reg(VARYING_SLOT_POS, 2));
1231 }
1232 wpos = offset(wpos, 1);
1233
1234 /* gl_FragCoord.w: Already set up in emit_interpolation */
1235 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1236
1237 return reg;
1238 }
1239
1240 fs_inst *
1241 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1242 glsl_interp_qualifier interpolation_mode,
1243 bool is_centroid, bool is_sample)
1244 {
1245 brw_wm_barycentric_interp_mode barycoord_mode;
1246 if (brw->gen >= 6) {
1247 if (is_centroid) {
1248 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1249 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1250 else
1251 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1252 } else if (is_sample) {
1253 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1254 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1255 else
1256 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1257 } else {
1258 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1259 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1260 else
1261 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1262 }
1263 } else {
1264 /* On Ironlake and below, there is only one interpolation mode.
1265 * Centroid interpolation doesn't mean anything on this hardware --
1266 * there is no multisampling.
1267 */
1268 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1269 }
1270 return emit(FS_OPCODE_LINTERP, attr,
1271 this->delta_x[barycoord_mode],
1272 this->delta_y[barycoord_mode], interp);
1273 }
1274
1275 fs_reg *
1276 fs_visitor::emit_general_interpolation(ir_variable *ir)
1277 {
1278 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1279 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1280 fs_reg attr = *reg;
1281
1282 assert(stage == MESA_SHADER_FRAGMENT);
1283 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1284 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1285
1286 unsigned int array_elements;
1287 const glsl_type *type;
1288
1289 if (ir->type->is_array()) {
1290 array_elements = ir->type->length;
1291 if (array_elements == 0) {
1292 fail("dereferenced array '%s' has length 0\n", ir->name);
1293 }
1294 type = ir->type->fields.array;
1295 } else {
1296 array_elements = 1;
1297 type = ir->type;
1298 }
1299
1300 glsl_interp_qualifier interpolation_mode =
1301 ir->determine_interpolation_mode(key->flat_shade);
1302
1303 int location = ir->data.location;
1304 for (unsigned int i = 0; i < array_elements; i++) {
1305 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1306 if (prog_data->urb_setup[location] == -1) {
1307 /* If there's no incoming setup data for this slot, don't
1308 * emit interpolation for it.
1309 */
1310 attr = offset(attr, type->vector_elements);
1311 location++;
1312 continue;
1313 }
1314
1315 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1316 /* Constant interpolation (flat shading) case. The SF has
1317 * handed us defined values in only the constant offset
1318 * field of the setup reg.
1319 */
1320 for (unsigned int k = 0; k < type->vector_elements; k++) {
1321 struct brw_reg interp = interp_reg(location, k);
1322 interp = suboffset(interp, 3);
1323 interp.type = reg->type;
1324 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1325 attr = offset(attr, 1);
1326 }
1327 } else {
1328 /* Smooth/noperspective interpolation case. */
1329 for (unsigned int k = 0; k < type->vector_elements; k++) {
1330 struct brw_reg interp = interp_reg(location, k);
1331 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1332 /* Get the pixel/sample mask into f0 so that we know
1333 * which pixels are lit. Then, for each channel that is
1334 * unlit, replace the centroid data with non-centroid
1335 * data.
1336 */
1337 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1338
1339 fs_inst *inst;
1340 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1341 false, false);
1342 inst->predicate = BRW_PREDICATE_NORMAL;
1343 inst->predicate_inverse = true;
1344 if (brw->has_pln)
1345 inst->no_dd_clear = true;
1346
1347 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1348 ir->data.centroid && !key->persample_shading,
1349 ir->data.sample || key->persample_shading);
1350 inst->predicate = BRW_PREDICATE_NORMAL;
1351 inst->predicate_inverse = false;
1352 if (brw->has_pln)
1353 inst->no_dd_check = true;
1354
1355 } else {
1356 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1357 ir->data.centroid && !key->persample_shading,
1358 ir->data.sample || key->persample_shading);
1359 }
1360 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1361 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1362 }
1363 attr = offset(attr, 1);
1364 }
1365
1366 }
1367 location++;
1368 }
1369 }
1370
1371 return reg;
1372 }
1373
1374 fs_reg *
1375 fs_visitor::emit_frontfacing_interpolation()
1376 {
1377 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::bool_type);
1378
1379 if (brw->gen >= 6) {
1380 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1381 * a boolean result from this (~0/true or 0/false).
1382 *
1383 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1384 * this task in only one instruction:
1385 * - a negation source modifier will flip the bit; and
1386 * - a W -> D type conversion will sign extend the bit into the high
1387 * word of the destination.
1388 *
1389 * An ASR 15 fills the low word of the destination.
1390 */
1391 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1392 g0.negate = true;
1393
1394 emit(ASR(*reg, g0, fs_reg(15)));
1395 } else {
1396 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1397 * a boolean result from this (1/true or 0/false).
1398 *
1399 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1400 * the negation source modifier to flip it. Unfortunately the SHR
1401 * instruction only operates on UD (or D with an abs source modifier)
1402 * sources without negation.
1403 *
1404 * Instead, use ASR (which will give ~0/true or 0/false).
1405 */
1406 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1407 g1_6.negate = true;
1408
1409 emit(ASR(*reg, g1_6, fs_reg(31)));
1410 }
1411
1412 return reg;
1413 }
1414
1415 void
1416 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1417 {
1418 assert(stage == MESA_SHADER_FRAGMENT);
1419 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1420 assert(dst.type == BRW_REGISTER_TYPE_F);
1421
1422 if (key->compute_pos_offset) {
1423 /* Convert int_sample_pos to floating point */
1424 emit(MOV(dst, int_sample_pos));
1425 /* Scale to the range [0, 1] */
1426 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1427 }
1428 else {
1429 /* From ARB_sample_shading specification:
1430 * "When rendering to a non-multisample buffer, or if multisample
1431 * rasterization is disabled, gl_SamplePosition will always be
1432       *  (0.5, 0.5)."
1433 */
1434 emit(MOV(dst, fs_reg(0.5f)));
1435 }
1436 }
1437
1438 fs_reg *
1439 fs_visitor::emit_samplepos_setup()
1440 {
1441 assert(brw->gen >= 6);
1442
1443 this->current_annotation = "compute sample position";
1444 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec2_type);
1445 fs_reg pos = *reg;
1446 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1447 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1448
1449 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1450 * mode will be enabled.
1451 *
1452 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1453 * R31.1:0 Position Offset X/Y for Slot[3:0]
1454 * R31.3:2 Position Offset X/Y for Slot[7:4]
1455 * .....
1456 *
1457 * The X, Y sample positions come in as bytes in thread payload. So, read
1458 * the positions using vstride=16, width=8, hstride=2.
1459 */
1460 struct brw_reg sample_pos_reg =
1461 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1462 BRW_REGISTER_TYPE_B), 16, 8, 2);
1463
1464 if (dispatch_width == 8) {
1465 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1466 } else {
1467 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1468 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1469 ->force_sechalf = true;
1470 }
1471 /* Compute gl_SamplePosition.x */
1472 compute_sample_position(pos, int_sample_x);
1473 pos = offset(pos, 1);
1474 if (dispatch_width == 8) {
1475 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1476 } else {
1477 emit(MOV(half(int_sample_y, 0),
1478 fs_reg(suboffset(sample_pos_reg, 1))));
1479 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1480 ->force_sechalf = true;
1481 }
1482 /* Compute gl_SamplePosition.y */
1483 compute_sample_position(pos, int_sample_y);
1484 return reg;
1485 }
1486
1487 fs_reg *
1488 fs_visitor::emit_sampleid_setup()
1489 {
1490 assert(stage == MESA_SHADER_FRAGMENT);
1491 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1492 assert(brw->gen >= 6);
1493
1494 this->current_annotation = "compute sample id";
1495 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::int_type);
1496
1497 if (key->compute_sample_id) {
1498 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1499 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1500 t2.type = BRW_REGISTER_TYPE_UW;
1501
1502 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1503 * 8x multisampling, subspan 0 will represent sample N (where N
1504 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1505 * 7. We can find the value of N by looking at R0.0 bits 7:6
1506 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1507 * (since samples are always delivered in pairs). That is, we
1508 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1509 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1510 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1511 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1512 * populating a temporary variable with the sequence (0, 1, 2, 3),
1513 * and then reading from it using vstride=1, width=4, hstride=0.
1514 * These computations hold good for 4x multisampling as well.
1515 *
1516 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1517 * the first four slots are sample 0 of subspan 0; the next four
1518 * are sample 1 of subspan 0; the third group is sample 0 of
1519 * subspan 1, and finally sample 1 of subspan 1.
1520 */
1521 fs_inst *inst;
1522 inst = emit(BRW_OPCODE_AND, t1,
1523 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1524 fs_reg(0xc0));
1525 inst->force_writemask_all = true;
1526 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1527 inst->force_writemask_all = true;
1528 /* This works for both SIMD8 and SIMD16 */
1529 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1530 inst->force_writemask_all = true;
1531 /* This special instruction takes care of setting vstride=1,
1532 * width=4, hstride=0 of t2 during an ADD instruction.
1533 */
1534 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1535 } else {
1536 /* As per GL_ARB_sample_shading specification:
1537 * "When rendering to a non-multisample buffer, or if multisample
1538 * rasterization is disabled, gl_SampleID will always be zero."
1539 */
1540 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1541 }
1542
1543 return reg;
1544 }
1545
1546 fs_reg
1547 fs_visitor::fix_math_operand(fs_reg src)
1548 {
1549 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1550 * might be able to do better by doing execsize = 1 math and then
1551 * expanding that result out, but we would need to be careful with
1552 * masking.
1553 *
1554 * The hardware ignores source modifiers (negate and abs) on math
1555 * instructions, so we also move to a temp to set those up.
1556 */
1557 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1558 !src.abs && !src.negate)
1559 return src;
1560
1561 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1562    * operands to math instructions.
1563 */
1564 if (brw->gen >= 7 && src.file != IMM)
1565 return src;
1566
1567 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1568 expanded.type = src.type;
1569 emit(BRW_OPCODE_MOV, expanded, src);
1570 return expanded;
1571 }
1572
1573 fs_inst *
1574 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1575 {
1576 switch (opcode) {
1577 case SHADER_OPCODE_RCP:
1578 case SHADER_OPCODE_RSQ:
1579 case SHADER_OPCODE_SQRT:
1580 case SHADER_OPCODE_EXP2:
1581 case SHADER_OPCODE_LOG2:
1582 case SHADER_OPCODE_SIN:
1583 case SHADER_OPCODE_COS:
1584 break;
1585 default:
1586 unreachable("not reached: bad math opcode");
1587 }
1588
1589 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1590 * might be able to do better by doing execsize = 1 math and then
1591 * expanding that result out, but we would need to be careful with
1592 * masking.
1593 *
1594 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1595 * instructions, so we also move to a temp to set those up.
1596 */
1597 if (brw->gen == 6 || brw->gen == 7)
1598 src = fix_math_operand(src);
1599
1600 fs_inst *inst = emit(opcode, dst, src);
1601
1602 if (brw->gen < 6) {
1603 inst->base_mrf = 2;
1604 inst->mlen = dispatch_width / 8;
1605 }
1606
1607 return inst;
1608 }
1609
1610 fs_inst *
1611 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1612 {
1613 int base_mrf = 2;
1614 fs_inst *inst;
1615
1616 if (brw->gen >= 8) {
1617 inst = emit(opcode, dst, src0, src1);
1618 } else if (brw->gen >= 6) {
1619 src0 = fix_math_operand(src0);
1620 src1 = fix_math_operand(src1);
1621
1622 inst = emit(opcode, dst, src0, src1);
1623 } else {
1624 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1625 * "Message Payload":
1626 *
1627 * "Operand0[7]. For the INT DIV functions, this operand is the
1628 * denominator."
1629 * ...
1630 * "Operand1[7]. For the INT DIV functions, this operand is the
1631 * numerator."
1632 */
1633 bool is_int_div = opcode != SHADER_OPCODE_POW;
1634 fs_reg &op0 = is_int_div ? src1 : src0;
1635 fs_reg &op1 = is_int_div ? src0 : src1;
1636
1637 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1638 inst = emit(opcode, dst, op0, reg_null_f);
1639
1640 inst->base_mrf = base_mrf;
1641 inst->mlen = 2 * dispatch_width / 8;
1642 }
1643 return inst;
1644 }
1645
1646 void
1647 fs_visitor::assign_curb_setup()
1648 {
1649 if (dispatch_width == 8) {
1650 prog_data->dispatch_grf_start_reg = payload.num_regs;
1651 } else {
1652 assert(stage == MESA_SHADER_FRAGMENT);
1653 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1654 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1655 }
1656
1657 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1658
1659 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1660 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1661 for (unsigned int i = 0; i < inst->sources; i++) {
1662 if (inst->src[i].file == UNIFORM) {
1663 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1664 int constant_nr;
1665 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1666 constant_nr = push_constant_loc[uniform_nr];
1667 } else {
1668 /* Section 5.11 of the OpenGL 4.1 spec says:
1669 * "Out-of-bounds reads return undefined values, which include
1670 * values from other variables of the active program or zero."
1671 * Just return the first push constant.
1672 */
1673 constant_nr = 0;
1674 }
1675
1676 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1677 constant_nr / 8,
1678 constant_nr % 8);
1679
1680 inst->src[i].file = HW_REG;
1681 inst->src[i].fixed_hw_reg = byte_offset(
1682 retype(brw_reg, inst->src[i].type),
1683 inst->src[i].subreg_offset);
1684 }
1685 }
1686 }
1687 }
1688
1689 void
1690 fs_visitor::calculate_urb_setup()
1691 {
1692 assert(stage == MESA_SHADER_FRAGMENT);
1693 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1694 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1695
1696 memset(prog_data->urb_setup, -1,
1697 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1698
1699 int urb_next = 0;
1700 /* Figure out where each of the incoming setup attributes lands. */
1701 if (brw->gen >= 6) {
1702 if (_mesa_bitcount_64(prog->InputsRead &
1703 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1704 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1705 * first 16 varying inputs, so we can put them wherever we want.
1706 * Just put them in order.
1707 *
1708 * This is useful because it means that (a) inputs not used by the
1709 * fragment shader won't take up valuable register space, and (b) we
1710 * won't have to recompile the fragment shader if it gets paired with
1711 * a different vertex (or geometry) shader.
1712 */
1713 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1714 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1715 BITFIELD64_BIT(i)) {
1716 prog_data->urb_setup[i] = urb_next++;
1717 }
1718 }
1719 } else {
1720 /* We have enough input varyings that the SF/SBE pipeline stage can't
1721 * arbitrarily rearrange them to suit our whim; we have to put them
1722 * in an order that matches the output of the previous pipeline stage
1723 * (geometry or vertex shader).
1724 */
1725 struct brw_vue_map prev_stage_vue_map;
1726 brw_compute_vue_map(brw, &prev_stage_vue_map,
1727 key->input_slots_valid);
1728 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1729 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1730 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1731 slot++) {
1732 int varying = prev_stage_vue_map.slot_to_varying[slot];
1733 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1734 * unused.
1735 */
1736 if (varying != BRW_VARYING_SLOT_COUNT &&
1737 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1738 BITFIELD64_BIT(varying))) {
1739 prog_data->urb_setup[varying] = slot - first_slot;
1740 }
1741 }
1742 urb_next = prev_stage_vue_map.num_slots - first_slot;
1743 }
1744 } else {
1745 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1746 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1747 /* Point size is packed into the header, not as a general attribute */
1748 if (i == VARYING_SLOT_PSIZ)
1749 continue;
1750
1751 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1752 /* The back color slot is skipped when the front color is
1753 * also written to. In addition, some slots can be
1754 * written in the vertex shader and not read in the
1755 * fragment shader. So the register number must always be
1756 * incremented, mapped or not.
1757 */
1758 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1759 prog_data->urb_setup[i] = urb_next;
1760 urb_next++;
1761 }
1762 }
1763
1764 /*
1765      * It's an FS-only attribute, and we did interpolation for this attribute
1766      * in the SF thread. So, count it here, too.
1767 *
1768 * See compile_sf_prog() for more info.
1769 */
1770 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1771 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1772 }
1773
1774 prog_data->num_varying_inputs = urb_next;
1775 }
1776
1777 void
1778 fs_visitor::assign_urb_setup()
1779 {
1780 assert(stage == MESA_SHADER_FRAGMENT);
1781 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1782
1783 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1784
1785    /* Offset all the urb_setup[] indices by the actual position of the
1786 * setup regs, now that the location of the constants has been chosen.
1787 */
1788 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1789 if (inst->opcode == FS_OPCODE_LINTERP) {
1790 assert(inst->src[2].file == HW_REG);
1791 inst->src[2].fixed_hw_reg.nr += urb_start;
1792 }
1793
1794 if (inst->opcode == FS_OPCODE_CINTERP) {
1795 assert(inst->src[0].file == HW_REG);
1796 inst->src[0].fixed_hw_reg.nr += urb_start;
1797 }
1798 }
1799
1800 /* Each attribute is 4 setup channels, each of which is half a reg. */
1801 this->first_non_payload_grf =
1802 urb_start + prog_data->num_varying_inputs * 2;
1803 }
1804
1805 /**
1806 * Split large virtual GRFs into separate components if we can.
1807 *
1808 * This is mostly duplicated with what brw_fs_vector_splitting does,
1809 * but that's really conservative because it's afraid of doing
1810 * splitting that doesn't result in real progress after the rest of
1811 * the optimization phases, which would cause infinite looping in
1812 * optimization. We can do it once here, safely. This also has the
1813 * opportunity to split interpolated values, or maybe even uniforms,
1814 * which we don't have at the IR level.
1815 *
1816 * We want to split, because virtual GRFs are what we register
1817 * allocate and spill (due to contiguousness requirements for some
1818 * instructions), and they're what we naturally generate in the
1819 * codegen process, but most virtual GRFs don't actually need to be
1820 * contiguous sets of GRFs. If we split, we'll end up with reduced
1821 * live intervals and better dead code elimination and coalescing.
1822 */
1823 void
1824 fs_visitor::split_virtual_grfs()
1825 {
1826 int num_vars = this->virtual_grf_count;
1827
1828 /* Count the total number of registers */
1829 int reg_count = 0;
1830 int vgrf_to_reg[num_vars];
1831 for (int i = 0; i < num_vars; i++) {
1832 vgrf_to_reg[i] = reg_count;
1833 reg_count += virtual_grf_sizes[i];
1834 }
1835
1836 /* An array of "split points". For each register slot, this indicates
1837 * if this slot can be separated from the previous slot. Every time an
1838 * instruction uses multiple elements of a register (as a source or
1839 * destination), we mark the used slots as inseparable. Then we go
1840 * through and split the registers into the smallest pieces we can.
1841 */
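   /* Illustrative example (added): if a 4-slot VGRF is only ever accessed one
    * register at a time, all three interior split points stay set and it is
    * broken into four 1-register VGRFs; an instruction that writes slots 0-1
    * as a pair clears split point 1, keeping those two slots together.
    */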
1842 bool split_points[reg_count];
1843 memset(split_points, 0, sizeof(split_points));
1844
1845 /* Mark all used registers as fully splittable */
1846 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1847 if (inst->dst.file == GRF) {
1848 int reg = vgrf_to_reg[inst->dst.reg];
1849 for (int j = 1; j < this->virtual_grf_sizes[inst->dst.reg]; j++)
1850 split_points[reg + j] = true;
1851 }
1852
1853 for (int i = 0; i < inst->sources; i++) {
1854 if (inst->src[i].file == GRF) {
1855 int reg = vgrf_to_reg[inst->src[i].reg];
1856 for (int j = 1; j < this->virtual_grf_sizes[inst->src[i].reg]; j++)
1857 split_points[reg + j] = true;
1858 }
1859 }
1860 }
1861
1862 if (brw->has_pln &&
1863 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1864 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1865 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1866 * Gen6, that was the only supported interpolation mode, and since Gen6,
1867 * delta_x and delta_y are in fixed hardware registers.
1868 */
1869 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1870 split_points[vgrf_to_reg[vgrf] + 1] = false;
1871 }
1872
1873 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1874 if (inst->dst.file == GRF) {
1875 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1876 for (int j = 1; j < inst->regs_written; j++)
1877 split_points[reg + j] = false;
1878 }
1879 for (int i = 0; i < inst->sources; i++) {
1880 if (inst->src[i].file == GRF) {
1881 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1882 for (int j = 1; j < inst->regs_read(this, i); j++)
1883 split_points[reg + j] = false;
1884 }
1885 }
1886 }
1887
1888 int new_virtual_grf[reg_count];
1889 int new_reg_offset[reg_count];
1890
1891 int reg = 0;
1892 for (int i = 0; i < num_vars; i++) {
1893 /* As a quick sanity check, the first slot of a register can never be a split point. */
1894 assert(split_points[reg] == false);
1895
1896 /* j = 0 case */
1897 new_reg_offset[reg] = 0;
1898 reg++;
1899 int offset = 1;
1900
1901 /* j > 0 case */
1902 for (int j = 1; j < virtual_grf_sizes[i]; j++) {
1903 /* If this is a split point, reset the offset to 0 and allocate a
1904 * new virtual GRF for the 'offset' registers accumulated so far
1905 */
1906 if (split_points[reg]) {
1907 assert(offset <= MAX_VGRF_SIZE);
1908 int grf = virtual_grf_alloc(offset);
1909 for (int k = reg - offset; k < reg; k++)
1910 new_virtual_grf[k] = grf;
1911 offset = 0;
1912 }
1913 new_reg_offset[reg] = offset;
1914 offset++;
1915 reg++;
1916 }
1917
1918 /* The last one gets the original register number */
1919 assert(offset <= MAX_VGRF_SIZE);
1920 virtual_grf_sizes[i] = offset;
1921 for (int k = reg - offset; k < reg; k++)
1922 new_virtual_grf[k] = i;
1923 }
1924 assert(reg == reg_count);
1925
1926 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1927 if (inst->dst.file == GRF) {
1928 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1929 inst->dst.reg = new_virtual_grf[reg];
1930 inst->dst.reg_offset = new_reg_offset[reg];
1931 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
1932 }
1933 for (int i = 0; i < inst->sources; i++) {
1934 if (inst->src[i].file == GRF) {
1935 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1936 inst->src[i].reg = new_virtual_grf[reg];
1937 inst->src[i].reg_offset = new_reg_offset[reg];
1938 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
1939 }
1940 }
1941 }
1942 invalidate_live_intervals();
1943 }
1944
1945 /**
1946 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1947 *
1948 * During code generation, we create tons of temporary variables, many of
1949 * which get immediately killed and are never used again. Yet, in later
1950 * optimization and analysis passes, such as compute_live_intervals, we need
1951 * to loop over all the virtual GRFs. Compacting them can save a lot of
1952 * overhead.
1953 */
1954 bool
1955 fs_visitor::compact_virtual_grfs()
1956 {
1957 bool progress = false;
1958 int remap_table[this->virtual_grf_count];
1959 memset(remap_table, -1, sizeof(remap_table));
1960
1961 /* Mark which virtual GRFs are used. */
1962 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1963 if (inst->dst.file == GRF)
1964 remap_table[inst->dst.reg] = 0;
1965
1966 for (int i = 0; i < inst->sources; i++) {
1967 if (inst->src[i].file == GRF)
1968 remap_table[inst->src[i].reg] = 0;
1969 }
1970 }
1971
1972 /* Compact the GRF arrays. */
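/* Rough sketch with made-up numbers: if only vgrf0 and vgrf2 of four VGRFs
 * are referenced, remap_table becomes {0, -1, 1, -1}, virtual_grf_count
 * drops to 2, and the rewrite loop below points every use of vgrf2 at the
 * new vgrf1.
 */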
1973 int new_index = 0;
1974 for (int i = 0; i < this->virtual_grf_count; i++) {
1975 if (remap_table[i] == -1) {
1976 /* We just found an unused register. This means that we are
1977 * actually going to compact something.
1978 */
1979 progress = true;
1980 } else {
1981 remap_table[i] = new_index;
1982 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1983 invalidate_live_intervals();
1984 ++new_index;
1985 }
1986 }
1987
1988 this->virtual_grf_count = new_index;
1989
1990 /* Patch all the instructions to use the newly renumbered registers */
1991 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1992 if (inst->dst.file == GRF)
1993 inst->dst.reg = remap_table[inst->dst.reg];
1994
1995 for (int i = 0; i < inst->sources; i++) {
1996 if (inst->src[i].file == GRF)
1997 inst->src[i].reg = remap_table[inst->src[i].reg];
1998 }
1999 }
2000
2001 /* Patch all the references to delta_x/delta_y, since they're used in
2002 * register allocation. If they're unused, switch them to BAD_FILE so
2003 * we don't think some random VGRF is delta_x/delta_y.
2004 */
2005 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2006 if (delta_x[i].file == GRF) {
2007 if (remap_table[delta_x[i].reg] != -1) {
2008 delta_x[i].reg = remap_table[delta_x[i].reg];
2009 } else {
2010 delta_x[i].file = BAD_FILE;
2011 }
2012 }
2013 }
2014 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2015 if (delta_y[i].file == GRF) {
2016 if (remap_table[delta_y[i].reg] != -1) {
2017 delta_y[i].reg = remap_table[delta_y[i].reg];
2018 } else {
2019 delta_y[i].file = BAD_FILE;
2020 }
2021 }
2022 }
2023
2024 return progress;
2025 }
2026
2027 /*
2028 * Implements array access of uniforms by inserting a
2029 * PULL_CONSTANT_LOAD instruction.
2030 *
2031 * Unlike temporary GRF array access (which we don't support, due to
2032 * the difficulty of doing relative addressing on instruction
2033 * destinations), we could potentially do array access of uniforms
2034 * that were loaded in GRF space as push constants. In real-world
2035 * usage we've seen, though, the arrays being used are always larger
2036 * than we could load as push constants, so just always move all
2037 * uniform array access out to a pull constant buffer.
2038 */
2039 void
2040 fs_visitor::move_uniform_array_access_to_pull_constants()
2041 {
2042 if (dispatch_width != 8)
2043 return;
2044
2045 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2046 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2047
2048 /* Walk through and find array access of uniforms. Put a copy of that
2049 * uniform in the pull constant buffer.
2050 *
2051 * Note that we don't move constant-indexed accesses to arrays. No
2052 * testing has been done of the performance impact of this choice.
2053 */
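/* Hypothetical example: an access like "gain[i]" into a
 * "uniform float gain[128];" array shows up here as a UNIFORM source with
 * a reladdr, so all 128 elements get copied into pull_param[] and the
 * access is later rewritten by demote_pull_constants().
 */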
2054 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2055 for (int i = 0 ; i < inst->sources; i++) {
2056 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2057 continue;
2058
2059 int uniform = inst->src[i].reg;
2060
2061 /* If this array isn't already present in the pull constant buffer,
2062 * add it.
2063 */
2064 if (pull_constant_loc[uniform] == -1) {
2065 const gl_constant_value **values = &stage_prog_data->param[uniform];
2066
2067 assert(param_size[uniform]);
2068
2069 for (int j = 0; j < param_size[uniform]; j++) {
2070 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2071
2072 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2073 values[j];
2074 }
2075 }
2076 }
2077 }
2078 }
2079
2080 /**
2081 * Assign UNIFORM file registers to either push constants or pull constants.
2082 *
2083 * We allow a fragment shader to have more than the GL-specified minimum
2084 * for the maximum number of fragment shader uniform components (64). If
2085 * there are too many of these, they would fill up all of the register space.
2086 * So, this will push some of them out to the pull constant buffer and
2087 * update the program to load them.
2088 */
2089 void
2090 fs_visitor::assign_constant_locations()
2091 {
2092 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2093 if (dispatch_width != 8)
2094 return;
2095
2096 /* Find which UNIFORM registers are still in use. */
2097 bool is_live[uniforms];
2098 for (unsigned int i = 0; i < uniforms; i++) {
2099 is_live[i] = false;
2100 }
2101
2102 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2103 for (int i = 0; i < inst->sources; i++) {
2104 if (inst->src[i].file != UNIFORM)
2105 continue;
2106
2107 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2108 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2109 is_live[constant_nr] = true;
2110 }
2111 }
2112
2113 /* Only allow 16 registers (128 uniform components) as push constants.
2114 *
2115 * Just demote the end of the list. We could probably do better
2116 * here, demoting things that are rarely used in the program first.
2117 *
2118 * If changing this value, note the limitation about total_regs in
2119 * brw_curbe.c.
2120 */
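/* Worked example (illustrative): with 200 live uniform components and none
 * already demoted, the first 128 get push_constant_loc slots 0-127 and the
 * remaining 72 are appended to pull_param[] for run-time loads.
 */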
2121 unsigned int max_push_components = 16 * 8;
2122 unsigned int num_push_constants = 0;
2123
2124 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2125
2126 for (unsigned int i = 0; i < uniforms; i++) {
2127 if (!is_live[i] || pull_constant_loc[i] != -1) {
2128 /* This UNIFORM register is either dead, or has already been demoted
2129 * to a pull const. Mark it as no longer living in the param[] array.
2130 */
2131 push_constant_loc[i] = -1;
2132 continue;
2133 }
2134
2135 if (num_push_constants < max_push_components) {
2136 /* Retain as a push constant. Record the location in the params[]
2137 * array.
2138 */
2139 push_constant_loc[i] = num_push_constants++;
2140 } else {
2141 /* Demote to a pull constant. */
2142 push_constant_loc[i] = -1;
2143
2144 int pull_index = stage_prog_data->nr_pull_params++;
2145 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2146 pull_constant_loc[i] = pull_index;
2147 }
2148 }
2149
2150 stage_prog_data->nr_params = num_push_constants;
2151
2152 /* Up until now, the param[] array has been indexed by reg + reg_offset
2153 * of UNIFORM registers. Condense it to only contain the uniforms we
2154 * chose to upload as push constants.
2155 */
2156 for (unsigned int i = 0; i < uniforms; i++) {
2157 int remapped = push_constant_loc[i];
2158
2159 if (remapped == -1)
2160 continue;
2161
2162 assert(remapped <= (int)i);
2163 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2164 }
2165 }
2166
2167 /**
2168 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2169 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2170 */
2171 void
2172 fs_visitor::demote_pull_constants()
2173 {
2174 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2175 for (int i = 0; i < inst->sources; i++) {
2176 if (inst->src[i].file != UNIFORM)
2177 continue;
2178
2179 int pull_index = pull_constant_loc[inst->src[i].reg +
2180 inst->src[i].reg_offset];
2181 if (pull_index == -1)
2182 continue;
2183
2184 /* Set up the annotation tracking for newly generated instructions. */
2185 base_ir = inst->ir;
2186 current_annotation = inst->annotation;
2187
2188 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2189 fs_reg dst = fs_reg(this, glsl_type::float_type);
2190
2191 /* Generate a pull load into dst. */
2192 if (inst->src[i].reladdr) {
2193 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2194 surf_index,
2195 *inst->src[i].reladdr,
2196 pull_index);
2197 inst->insert_before(block, &list);
2198 inst->src[i].reladdr = NULL;
2199 } else {
2200 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2201 fs_inst *pull =
2202 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2203 dst, surf_index, offset);
2204 inst->insert_before(block, pull);
2205 inst->src[i].set_smear(pull_index & 3);
2206 }
2207
2208 /* Rewrite the instruction to use the temporary VGRF. */
2209 inst->src[i].file = GRF;
2210 inst->src[i].reg = dst.reg;
2211 inst->src[i].reg_offset = 0;
2212 inst->src[i].width = dispatch_width;
2213 }
2214 }
2215 invalidate_live_intervals();
2216 }
2217
2218 bool
2219 fs_visitor::opt_algebraic()
2220 {
2221 bool progress = false;
2222
2223 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2224 switch (inst->opcode) {
2225 case BRW_OPCODE_MUL:
2226 if (inst->src[1].file != IMM)
2227 continue;
2228
2229 /* a * 1.0 = a */
2230 if (inst->src[1].is_one()) {
2231 inst->opcode = BRW_OPCODE_MOV;
2232 inst->src[1] = reg_undef;
2233 progress = true;
2234 break;
2235 }
2236
2237 /* a * 0.0 = 0.0 */
2238 if (inst->src[1].is_zero()) {
2239 inst->opcode = BRW_OPCODE_MOV;
2240 inst->src[0] = inst->src[1];
2241 inst->src[1] = reg_undef;
2242 progress = true;
2243 break;
2244 }
2245
2246 break;
2247 case BRW_OPCODE_ADD:
2248 if (inst->src[1].file != IMM)
2249 continue;
2250
2251 /* a + 0.0 = a */
2252 if (inst->src[1].is_zero()) {
2253 inst->opcode = BRW_OPCODE_MOV;
2254 inst->src[1] = reg_undef;
2255 progress = true;
2256 break;
2257 }
2258 break;
2259 case BRW_OPCODE_OR:
2260 if (inst->src[0].equals(inst->src[1])) {
2261 inst->opcode = BRW_OPCODE_MOV;
2262 inst->src[1] = reg_undef;
2263 progress = true;
2264 break;
2265 }
2266 break;
2267 case BRW_OPCODE_LRP:
2268 if (inst->src[1].equals(inst->src[2])) {
2269 inst->opcode = BRW_OPCODE_MOV;
2270 inst->src[0] = inst->src[1];
2271 inst->src[1] = reg_undef;
2272 inst->src[2] = reg_undef;
2273 progress = true;
2274 break;
2275 }
2276 break;
2277 case BRW_OPCODE_SEL:
2278 if (inst->src[0].equals(inst->src[1])) {
2279 inst->opcode = BRW_OPCODE_MOV;
2280 inst->src[1] = reg_undef;
2281 inst->predicate = BRW_PREDICATE_NONE;
2282 inst->predicate_inverse = false;
2283 progress = true;
2284 } else if (inst->saturate && inst->src[1].file == IMM) {
2285 switch (inst->conditional_mod) {
2286 case BRW_CONDITIONAL_LE:
2287 case BRW_CONDITIONAL_L:
2288 switch (inst->src[1].type) {
2289 case BRW_REGISTER_TYPE_F:
2290 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2291 inst->opcode = BRW_OPCODE_MOV;
2292 inst->src[1] = reg_undef;
2293 progress = true;
2294 }
2295 break;
2296 default:
2297 break;
2298 }
2299 break;
2300 case BRW_CONDITIONAL_GE:
2301 case BRW_CONDITIONAL_G:
2302 switch (inst->src[1].type) {
2303 case BRW_REGISTER_TYPE_F:
2304 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2305 inst->opcode = BRW_OPCODE_MOV;
2306 inst->src[1] = reg_undef;
2307 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2308 progress = true;
2309 }
2310 break;
2311 default:
2312 break;
2313 }
2314 default:
2315 break;
2316 }
2317 }
2318 break;
2319 case SHADER_OPCODE_RCP: {
2320 fs_inst *prev = (fs_inst *)inst->prev;
2321 if (prev->opcode == SHADER_OPCODE_SQRT) {
2322 if (inst->src[0].equals(prev->dst)) {
2323 inst->opcode = SHADER_OPCODE_RSQ;
2324 inst->src[0] = prev->src[0];
2325 progress = true;
2326 }
2327 }
2328 break;
2329 }
2330 default:
2331 break;
2332 }
2333 }
2334
2335 return progress;
2336 }
2337
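/**
 * Give top-level definitions that completely overwrite a VGRF a fresh
 * register number and rewrite later reads to match.
 *
 * Roughly speaking, this is a lightweight renaming pass: reuses of the same
 * VGRF for unrelated values become separate registers, so later passes see
 * shorter, independent live ranges.
 */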
2338 bool
2339 fs_visitor::opt_register_renaming()
2340 {
2341 bool progress = false;
2342 int depth = 0;
2343
2344 int remap[virtual_grf_count];
2345 memset(remap, -1, sizeof(int) * virtual_grf_count);
2346
2347 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2348 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2349 depth++;
2350 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2351 inst->opcode == BRW_OPCODE_WHILE) {
2352 depth--;
2353 }
2354
2355 /* Rewrite instruction sources. */
2356 for (int i = 0; i < inst->sources; i++) {
2357 if (inst->src[i].file == GRF &&
2358 remap[inst->src[i].reg] != -1 &&
2359 remap[inst->src[i].reg] != inst->src[i].reg) {
2360 inst->src[i].reg = remap[inst->src[i].reg];
2361 progress = true;
2362 }
2363 }
2364
2365 const int dst = inst->dst.reg;
2366
2367 if (depth == 0 &&
2368 inst->dst.file == GRF &&
2369 virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
2370 !inst->is_partial_write()) {
2371 if (remap[dst] == -1) {
2372 remap[dst] = dst;
2373 } else {
2374 remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
2375 inst->dst.reg = remap[dst];
2376 progress = true;
2377 }
2378 } else if (inst->dst.file == GRF &&
2379 remap[dst] != -1 &&
2380 remap[dst] != dst) {
2381 inst->dst.reg = remap[dst];
2382 progress = true;
2383 }
2384 }
2385
2386 if (progress) {
2387 invalidate_live_intervals();
2388
2389 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2390 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2391 delta_x[i].reg = remap[delta_x[i].reg];
2392 }
2393 }
2394 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2395 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2396 delta_y[i].reg = remap[delta_y[i].reg];
2397 }
2398 }
2399 }
2400
2401 return progress;
2402 }
2403
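/**
 * Try to rewrite the instruction that computed a value so that it writes
 * directly into the MRF that a later MOV copies it to, then delete the MOV.
 */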
2404 bool
2405 fs_visitor::compute_to_mrf()
2406 {
2407 bool progress = false;
2408 int next_ip = 0;
2409
2410 /* No MRFs on Gen >= 7. */
2411 if (brw->gen >= 7)
2412 return false;
2413
2414 calculate_live_intervals();
2415
2416 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2417 int ip = next_ip;
2418 next_ip++;
2419
2420 if (inst->opcode != BRW_OPCODE_MOV ||
2421 inst->is_partial_write() ||
2422 inst->dst.file != MRF || inst->src[0].file != GRF ||
2423 inst->dst.type != inst->src[0].type ||
2424 inst->src[0].abs || inst->src[0].negate ||
2425 !inst->src[0].is_contiguous() ||
2426 inst->src[0].subreg_offset)
2427 continue;
2428
2429 /* Work out which hardware MRF registers are written by this
2430 * instruction.
2431 */
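/* Illustrative: a SIMD16 write to m2 is tracked as mrf_low = 2,
 * mrf_high = 3, while a COMPR4 write to m2 is tracked as mrf_low = 2,
 * mrf_high = 6 (the two compressed halves land four registers apart).
 */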
2432 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2433 int mrf_high;
2434 if (inst->dst.reg & BRW_MRF_COMPR4) {
2435 mrf_high = mrf_low + 4;
2436 } else if (inst->exec_size == 16) {
2437 mrf_high = mrf_low + 1;
2438 } else {
2439 mrf_high = mrf_low;
2440 }
2441
2442 /* Can't compute-to-MRF this GRF if someone else was going to
2443 * read it later.
2444 */
2445 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2446 continue;
2447
2448 /* Found a move of a GRF to a MRF. Let's see if we can go
2449 * rewrite the thing that made this GRF to write into the MRF.
2450 */
2451 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2452 if (scan_inst->dst.file == GRF &&
2453 scan_inst->dst.reg == inst->src[0].reg) {
2454 /* Found the last thing to write our reg we want to turn
2455 * into a compute-to-MRF.
2456 */
2457
2458 /* If this one instruction didn't populate all the
2459 * channels, bail. We might be able to rewrite everything
2460 * that writes that reg, but it would require smarter
2461 * tracking to delay the rewriting until complete success.
2462 */
2463 if (scan_inst->is_partial_write())
2464 break;
2465
2466 /* Things returning more than one register would need us to
2467 * understand coalescing out more than one MOV at a time.
2468 */
2469 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2470 break;
2471
2472 /* SEND instructions can't have MRF as a destination. */
2473 if (scan_inst->mlen)
2474 break;
2475
2476 if (brw->gen == 6) {
2477 /* gen6 math instructions must have the destination be
2478 * GRF, so no compute-to-MRF for them.
2479 */
2480 if (scan_inst->is_math()) {
2481 break;
2482 }
2483 }
2484
2485 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2486 /* Found the creator of our MRF's source value. */
2487 scan_inst->dst.file = MRF;
2488 scan_inst->dst.reg = inst->dst.reg;
2489 scan_inst->saturate |= inst->saturate;
2490 inst->remove(block);
2491 progress = true;
2492 }
2493 break;
2494 }
2495
2496 /* We don't handle control flow here. Most computation of
2497 * values that end up in MRFs happens shortly before the MRF
2498 * write anyway.
2499 */
2500 if (block->start() == scan_inst)
2501 break;
2502
2503 /* You can't read from an MRF, so if someone else reads our
2504 * MRF's source GRF that we wanted to rewrite, that stops us.
2505 */
2506 bool interfered = false;
2507 for (int i = 0; i < scan_inst->sources; i++) {
2508 if (scan_inst->src[i].file == GRF &&
2509 scan_inst->src[i].reg == inst->src[0].reg &&
2510 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2511 interfered = true;
2512 }
2513 }
2514 if (interfered)
2515 break;
2516
2517 if (scan_inst->dst.file == MRF) {
2518 /* If somebody else writes our MRF here, we can't
2519 * compute-to-MRF before that.
2520 */
2521 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2522 int scan_mrf_high;
2523
2524 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2525 scan_mrf_high = scan_mrf_low + 4;
2526 } else if (scan_inst->exec_size == 16) {
2527 scan_mrf_high = scan_mrf_low + 1;
2528 } else {
2529 scan_mrf_high = scan_mrf_low;
2530 }
2531
2532 if (mrf_low == scan_mrf_low ||
2533 mrf_low == scan_mrf_high ||
2534 mrf_high == scan_mrf_low ||
2535 mrf_high == scan_mrf_high) {
2536 break;
2537 }
2538 }
2539
2540 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2541 /* Found a SEND instruction, which means that there are
2542 * live values in MRFs from base_mrf to base_mrf +
2543 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2544 * above it.
2545 */
2546 if (mrf_low >= scan_inst->base_mrf &&
2547 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2548 break;
2549 }
2550 if (mrf_high >= scan_inst->base_mrf &&
2551 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2552 break;
2553 }
2554 }
2555 }
2556 }
2557
2558 if (progress)
2559 invalidate_live_intervals();
2560
2561 return progress;
2562 }
2563
2564 /**
2565 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2566 * instructions to FS_OPCODE_REP_FB_WRITE.
2567 */
2568 void
2569 fs_visitor::emit_repclear_shader()
2570 {
2571 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2572 int base_mrf = 1;
2573 int color_mrf = base_mrf + 2;
2574
2575 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2576 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2577 mov->force_writemask_all = true;
2578
2579 fs_inst *write;
2580 if (key->nr_color_regions == 1) {
2581 write = emit(FS_OPCODE_REP_FB_WRITE);
2582 write->saturate = key->clamp_fragment_color;
2583 write->base_mrf = color_mrf;
2584 write->target = 0;
2585 write->header_present = false;
2586 write->mlen = 1;
2587 } else {
2588 assume(key->nr_color_regions > 0);
2589 for (int i = 0; i < key->nr_color_regions; ++i) {
2590 write = emit(FS_OPCODE_REP_FB_WRITE);
2591 write->saturate = key->clamp_fragment_color;
2592 write->base_mrf = base_mrf;
2593 write->target = i;
2594 write->header_present = true;
2595 write->mlen = 3;
2596 }
2597 }
2598 write->eot = true;
2599
2600 calculate_cfg();
2601
2602 assign_constant_locations();
2603 assign_curb_setup();
2604
2605 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2606 assert(mov->src[0].file == HW_REG);
2607 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2608 }
2609
2610 /**
2611 * Walks through basic blocks, looking for repeated MRF writes and
2612 * removing the later ones.
2613 */
2614 bool
2615 fs_visitor::remove_duplicate_mrf_writes()
2616 {
2617 fs_inst *last_mrf_move[16];
2618 bool progress = false;
2619
2620 /* We would need to update the MRF tracking to handle compressed instructions. */
2621 if (dispatch_width == 16)
2622 return false;
2623
2624 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2625
2626 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2627 if (inst->is_control_flow()) {
2628 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2629 }
2630
2631 if (inst->opcode == BRW_OPCODE_MOV &&
2632 inst->dst.file == MRF) {
2633 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2634 if (prev_inst && inst->equals(prev_inst)) {
2635 inst->remove(block);
2636 progress = true;
2637 continue;
2638 }
2639 }
2640
2641 /* Clear out the last-write records for MRFs that were overwritten. */
2642 if (inst->dst.file == MRF) {
2643 last_mrf_move[inst->dst.reg] = NULL;
2644 }
2645
2646 if (inst->mlen > 0 && inst->base_mrf != -1) {
2647 /* Found a SEND instruction, which will include two or fewer
2648 * implied MRF writes. We could do better here.
2649 */
2650 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2651 last_mrf_move[inst->base_mrf + i] = NULL;
2652 }
2653 }
2654
2655 /* Clear out any MRF move records whose sources got overwritten. */
2656 if (inst->dst.file == GRF) {
2657 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2658 if (last_mrf_move[i] &&
2659 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2660 last_mrf_move[i] = NULL;
2661 }
2662 }
2663 }
2664
2665 if (inst->opcode == BRW_OPCODE_MOV &&
2666 inst->dst.file == MRF &&
2667 inst->src[0].file == GRF &&
2668 !inst->is_partial_write()) {
2669 last_mrf_move[inst->dst.reg] = inst;
2670 }
2671 }
2672
2673 if (progress)
2674 invalidate_live_intervals();
2675
2676 return progress;
2677 }
2678
2679 static void
2680 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2681 int first_grf, int grf_len)
2682 {
2683 /* Clear the flag for registers that actually got read (as expected). */
2684 for (int i = 0; i < inst->sources; i++) {
2685 int grf;
2686 if (inst->src[i].file == GRF) {
2687 grf = inst->src[i].reg;
2688 } else if (inst->src[i].file == HW_REG &&
2689 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2690 grf = inst->src[i].fixed_hw_reg.nr;
2691 } else {
2692 continue;
2693 }
2694
2695 if (grf >= first_grf &&
2696 grf < first_grf + grf_len) {
2697 deps[grf - first_grf] = false;
2698 if (inst->exec_size == 16)
2699 deps[grf - first_grf + 1] = false;
2700 }
2701 }
2702 }
2703
2704 /**
2705 * Implements this workaround for the original 965:
2706 *
2707 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2708 * check for post destination dependencies on this instruction, software
2709 * must ensure that there is no destination hazard for the case of ‘write
2710 * followed by a posted write’ shown in the following example.
2711 *
2712 * 1. mov r3 0
2713 * 2. send r3.xy <rest of send instruction>
2714 * 3. mov r2 r3
2715 *
2716 * Due to no post-destination dependency check on the ‘send’, the above
2717 * code sequence could have two instructions (1 and 2) in flight at the
2718 * same time that both consider ‘r3’ as the target of their final writes.
2719 */
2720 void
2721 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2722 fs_inst *inst)
2723 {
2724 int write_len = inst->regs_written;
2725 int first_write_grf = inst->dst.reg;
2726 bool needs_dep[BRW_MAX_MRF];
2727 assert(write_len < (int)sizeof(needs_dep) - 1);
2728
2729 memset(needs_dep, false, sizeof(needs_dep));
2730 memset(needs_dep, true, write_len);
2731
2732 clear_deps_for_inst_src(inst, dispatch_width,
2733 needs_dep, first_write_grf, write_len);
2734
2735 /* Walk backwards looking for writes to registers we're writing which
2736 * aren't read since being written. If we hit the start of the program,
2737 * we assume that there are no outstanding dependencies on entry to the
2738 * program.
2739 */
2740 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2741 /* If we hit control flow, assume that there *are* outstanding
2742 * dependencies, and force their cleanup before our instruction.
2743 */
2744 if (block->start() == scan_inst) {
2745 for (int i = 0; i < write_len; i++) {
2746 if (needs_dep[i]) {
2747 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2748 }
2749 }
2750 return;
2751 }
2752
2753 /* We insert our reads as late as possible on the assumption that any
2754 * instruction other than a MOV that might have left us an outstanding
2755 * dependency has more latency than a MOV.
2756 */
2757 if (scan_inst->dst.file == GRF) {
2758 for (int i = 0; i < scan_inst->regs_written; i++) {
2759 int reg = scan_inst->dst.reg + i;
2760
2761 if (reg >= first_write_grf &&
2762 reg < first_write_grf + write_len &&
2763 needs_dep[reg - first_write_grf]) {
2764 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2765 needs_dep[reg - first_write_grf] = false;
2766 if (scan_inst->exec_size == 16)
2767 needs_dep[reg - first_write_grf + 1] = false;
2768 }
2769 }
2770 }
2771
2772 /* Clear the flag for registers that actually got read (as expected). */
2773 clear_deps_for_inst_src(scan_inst, dispatch_width,
2774 needs_dep, first_write_grf, write_len);
2775
2776 /* Continue the loop only if we haven't resolved all the dependencies */
2777 int i;
2778 for (i = 0; i < write_len; i++) {
2779 if (needs_dep[i])
2780 break;
2781 }
2782 if (i == write_len)
2783 return;
2784 }
2785 }
2786
2787 /**
2788 * Implements this workaround for the original 965:
2789 *
2790 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2791 * used as a destination register until after it has been sourced by an
2792 * instruction with a different destination register.
2793 */
2794 void
2795 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2796 {
2797 int write_len = inst->regs_written;
2798 int first_write_grf = inst->dst.reg;
2799 bool needs_dep[BRW_MAX_MRF];
2800 assert(write_len < (int)sizeof(needs_dep) - 1);
2801
2802 memset(needs_dep, false, sizeof(needs_dep));
2803 memset(needs_dep, true, write_len);
2804 /* Walk forwards looking for writes to registers we're writing which aren't
2805 * read before being written.
2806 */
2807 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2808 /* If we hit control flow, force resolve all remaining dependencies. */
2809 if (block->end() == scan_inst) {
2810 for (int i = 0; i < write_len; i++) {
2811 if (needs_dep[i])
2812 scan_inst->insert_before(block,
2813 DEP_RESOLVE_MOV(first_write_grf + i));
2814 }
2815 return;
2816 }
2817
2818 /* Clear the flag for registers that actually got read (as expected). */
2819 clear_deps_for_inst_src(scan_inst, dispatch_width,
2820 needs_dep, first_write_grf, write_len);
2821
2822 /* We insert our reads as late as possible since they're reading the
2823 * result of a SEND, which has massive latency.
2824 */
2825 if (scan_inst->dst.file == GRF &&
2826 scan_inst->dst.reg >= first_write_grf &&
2827 scan_inst->dst.reg < first_write_grf + write_len &&
2828 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2829 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2830 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2831 }
2832
2833 /* Continue the loop only if we haven't resolved all the dependencies */
2834 int i;
2835 for (i = 0; i < write_len; i++) {
2836 if (needs_dep[i])
2837 break;
2838 }
2839 if (i == write_len)
2840 return;
2841 }
2842
2843 /* If we hit the end of the program, resolve all remaining dependencies out
2844 * of paranoia.
2845 */
2846 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2847 assert(last_inst->eot);
2848 for (int i = 0; i < write_len; i++) {
2849 if (needs_dep[i])
2850 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2851 }
2852 }
2853
2854 void
2855 fs_visitor::insert_gen4_send_dependency_workarounds()
2856 {
2857 if (brw->gen != 4 || brw->is_g4x)
2858 return;
2859
2860 bool progress = false;
2861
2862 /* Note that we're done with register allocation, so GRF fs_regs always
2863 * have a .reg_offset of 0.
2864 */
2865
2866 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2867 if (inst->mlen != 0 && inst->dst.file == GRF) {
2868 insert_gen4_pre_send_dependency_workarounds(block, inst);
2869 insert_gen4_post_send_dependency_workarounds(block, inst);
2870 progress = true;
2871 }
2872 }
2873
2874 if (progress)
2875 invalidate_live_intervals();
2876 }
2877
2878 /**
2879 * Turns the generic expression-style uniform pull constant load instruction
2880 * into a hardware-specific series of instructions for loading a pull
2881 * constant.
2882 *
2883 * The expression style allows the CSE pass before this to optimize out
2884 * repeated loads from the same offset, and gives the pre-register-allocation
2885 * scheduling full flexibility, while the conversion to native instructions
2886 * allows the post-register-allocation scheduler the best information
2887 * possible.
2888 *
2889 * Note that execution masking for setting up pull constant loads is special:
2890 * the channels that need to be written are unrelated to the current execution
2891 * mask, since a later instruction will use one of the result channels as a
2892 * source operand for all 8 or 16 of its channels.
2893 */
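/* On Gen7+ the loop below performs, roughly (illustrative IR, made-up
 * register names):
 *
 *    pull_const_load vgrf4, surf_index, 16      (byte offset, IMM)
 *
 * becomes
 *
 *    set_simd4x2_offset vgrf5, 4                (dword offset)
 *    pull_const_load_gen7 vgrf4, surf_index, vgrf5
 */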
2894 void
2895 fs_visitor::lower_uniform_pull_constant_loads()
2896 {
2897 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2898 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2899 continue;
2900
2901 if (brw->gen >= 7) {
2902 /* The offset arg before was a vec4-aligned byte offset. We need to
2903 * turn it into a dword offset.
2904 */
2905 fs_reg const_offset_reg = inst->src[1];
2906 assert(const_offset_reg.file == IMM &&
2907 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2908 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2909 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2910
2911 /* This is actually going to be a MOV, but since only the first dword
2912 * is accessed, we have a special opcode to do just that one. Note
2913 * that this needs to be an operation that will be considered a def
2914 * by live variable analysis, or register allocation will explode.
2915 */
2916 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2917 8, payload, const_offset_reg);
2918 setup->force_writemask_all = true;
2919
2920 setup->ir = inst->ir;
2921 setup->annotation = inst->annotation;
2922 inst->insert_before(block, setup);
2923
2924 /* Similarly, this will only populate the first 4 channels of the
2925 * result register (since we only use smear values from 0-3), but we
2926 * don't tell the optimizer.
2927 */
2928 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2929 inst->src[1] = payload;
2930
2931 invalidate_live_intervals();
2932 } else {
2933 /* Before register allocation, we didn't tell the scheduler about the
2934 * MRF we use. We know it's safe to use this MRF because nothing
2935 * else does except for register spill/unspill, which generates and
2936 * uses its MRF within a single IR instruction.
2937 */
2938 inst->base_mrf = 14;
2939 inst->mlen = 1;
2940 }
2941 }
2942 }
2943
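/**
 * Lower SHADER_OPCODE_LOAD_PAYLOAD into a sequence of MOVs, one per source,
 * into consecutive registers of the destination, carrying over
 * force_writemask_all / force_sechalf metadata from the instructions that
 * produced each source and using a COMPR4 MOV when a pair of MRF-destined
 * sources can be combined.
 */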
2944 bool
2945 fs_visitor::lower_load_payload()
2946 {
2947 bool progress = false;
2948
2949 int vgrf_to_reg[virtual_grf_count];
2950 int reg_count = 16; /* Leave room for MRF */
2951 for (int i = 0; i < virtual_grf_count; ++i) {
2952 vgrf_to_reg[i] = reg_count;
2953 reg_count += virtual_grf_sizes[i];
2954 }
2955
2956 struct {
2957 bool written:1; /* Whether this register has ever been written */
2958 bool force_writemask_all:1;
2959 bool force_sechalf:1;
2960 } metadata[reg_count];
2961 memset(metadata, 0, sizeof(metadata));
2962
2963 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2964 int dst_reg;
2965 if (inst->dst.file == GRF) {
2966 dst_reg = vgrf_to_reg[inst->dst.reg];
2967 } else {
2968 /* MRF */
2969 dst_reg = inst->dst.reg;
2970 }
2971
2972 if (inst->dst.file == MRF || inst->dst.file == GRF) {
2973 bool force_sechalf = inst->force_sechalf;
2974 bool toggle_sechalf = inst->dst.width == 16 &&
2975 type_sz(inst->dst.type) == 4;
2976 for (int i = 0; i < inst->regs_written; ++i) {
2977 metadata[dst_reg + i].written = true;
2978 metadata[dst_reg + i].force_sechalf = force_sechalf;
2979 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
2980 force_sechalf = (toggle_sechalf != force_sechalf);
2981 }
2982 }
2983
2984 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2985 assert(inst->dst.file == MRF || inst->dst.file == GRF);
2986 fs_reg dst = inst->dst;
2987
2988 for (int i = 0; i < inst->sources; i++) {
2989 dst.width = inst->src[i].effective_width;
2990 dst.type = inst->src[i].type;
2991
2992 if (inst->src[i].file == BAD_FILE) {
2993 /* Do nothing but otherwise increment as normal */
2994 } else if (dst.file == MRF &&
2995 dst.width == 8 &&
2996 brw->has_compr4 &&
2997 i + 4 < inst->sources &&
2998 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
2999 fs_reg compr4_dst = dst;
3000 compr4_dst.reg += BRW_MRF_COMPR4;
3001 compr4_dst.width = 16;
3002 fs_reg compr4_src = inst->src[i];
3003 compr4_src.width = 16;
3004 fs_inst *mov = MOV(compr4_dst, compr4_src);
3005 mov->force_writemask_all = true;
3006 inst->insert_before(block, mov);
3007 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3008 inst->src[i + 4].file = BAD_FILE;
3009 } else {
3010 fs_inst *mov = MOV(dst, inst->src[i]);
3011 if (inst->src[i].file == GRF) {
3012 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3013 inst->src[i].reg_offset;
3014 mov->force_sechalf = metadata[src_reg].force_sechalf;
3015 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3016 metadata[dst_reg] = metadata[src_reg];
3017 if (dst.width * type_sz(dst.type) > 32) {
3018 assert((!metadata[src_reg].written ||
3019 !metadata[src_reg].force_sechalf) &&
3020 (!metadata[src_reg + 1].written ||
3021 metadata[src_reg + 1].force_sechalf));
3022 metadata[dst_reg + 1] = metadata[src_reg + 1];
3023 }
3024 } else {
3025 metadata[dst_reg].force_writemask_all = false;
3026 metadata[dst_reg].force_sechalf = false;
3027 if (dst.width == 16) {
3028 metadata[dst_reg + 1].force_writemask_all = false;
3029 metadata[dst_reg + 1].force_sechalf = true;
3030 }
3031 }
3032 inst->insert_before(block, mov);
3033 }
3034
3035 dst = offset(dst, 1);
3036 }
3037
3038 inst->remove(block);
3039 progress = true;
3040 }
3041 }
3042
3043 if (progress)
3044 invalidate_live_intervals();
3045
3046 return progress;
3047 }
3048
3049 void
3050 fs_visitor::dump_instructions()
3051 {
3052 dump_instructions(NULL);
3053 }
3054
3055 void
3056 fs_visitor::dump_instructions(const char *name)
3057 {
3058 calculate_register_pressure();
3059 FILE *file = stderr;
3060 if (name && geteuid() != 0) {
3061 file = fopen(name, "w");
3062 if (!file)
3063 file = stderr;
3064 }
3065
3066 int ip = 0, max_pressure = 0;
3067 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3068 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3069 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3070 dump_instruction(inst, file);
3071 ++ip;
3072 }
3073 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3074
3075 if (file != stderr) {
3076 fclose(file);
3077 }
3078 }
3079
3080 void
3081 fs_visitor::dump_instruction(backend_instruction *be_inst)
3082 {
3083 dump_instruction(be_inst, stderr);
3084 }
3085
3086 void
3087 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3088 {
3089 fs_inst *inst = (fs_inst *)be_inst;
3090
3091 if (inst->predicate) {
3092 fprintf(file, "(%cf0.%d) ",
3093 inst->predicate_inverse ? '-' : '+',
3094 inst->flag_subreg);
3095 }
3096
3097 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3098 if (inst->saturate)
3099 fprintf(file, ".sat");
3100 if (inst->conditional_mod) {
3101 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3102 if (!inst->predicate &&
3103 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3104 inst->opcode != BRW_OPCODE_IF &&
3105 inst->opcode != BRW_OPCODE_WHILE))) {
3106 fprintf(file, ".f0.%d", inst->flag_subreg);
3107 }
3108 }
3109 fprintf(file, "(%d) ", inst->exec_size);
3110
3111
3112 switch (inst->dst.file) {
3113 case GRF:
3114 fprintf(file, "vgrf%d", inst->dst.reg);
3115 if (inst->dst.width != dispatch_width)
3116 fprintf(file, "@%d", inst->dst.width);
3117 if (virtual_grf_sizes[inst->dst.reg] != inst->dst.width / 8 ||
3118 inst->dst.subreg_offset)
3119 fprintf(file, "+%d.%d",
3120 inst->dst.reg_offset, inst->dst.subreg_offset);
3121 break;
3122 case MRF:
3123 fprintf(file, "m%d", inst->dst.reg);
3124 break;
3125 case BAD_FILE:
3126 fprintf(file, "(null)");
3127 break;
3128 case UNIFORM:
3129 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3130 break;
3131 case HW_REG:
3132 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3133 switch (inst->dst.fixed_hw_reg.nr) {
3134 case BRW_ARF_NULL:
3135 fprintf(file, "null");
3136 break;
3137 case BRW_ARF_ADDRESS:
3138 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3139 break;
3140 case BRW_ARF_ACCUMULATOR:
3141 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3142 break;
3143 case BRW_ARF_FLAG:
3144 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3145 inst->dst.fixed_hw_reg.subnr);
3146 break;
3147 default:
3148 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3149 inst->dst.fixed_hw_reg.subnr);
3150 break;
3151 }
3152 } else {
3153 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3154 }
3155 if (inst->dst.fixed_hw_reg.subnr)
3156 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3157 break;
3158 default:
3159 fprintf(file, "???");
3160 break;
3161 }
3162 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3163
3164 for (int i = 0; i < inst->sources; i++) {
3165 if (inst->src[i].negate)
3166 fprintf(file, "-");
3167 if (inst->src[i].abs)
3168 fprintf(file, "|");
3169 switch (inst->src[i].file) {
3170 case GRF:
3171 fprintf(file, "vgrf%d", inst->src[i].reg);
3172 if (inst->src[i].width != dispatch_width)
3173 fprintf(file, "@%d", inst->src[i].width);
3174 if (virtual_grf_sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3175 inst->src[i].subreg_offset)
3176 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3177 inst->src[i].subreg_offset);
3178 break;
3179 case MRF:
3180 fprintf(file, "***m%d***", inst->src[i].reg);
3181 break;
3182 case UNIFORM:
3183 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3184 if (inst->src[i].reladdr) {
3185 fprintf(file, "+reladdr");
3186 } else if (inst->src[i].subreg_offset) {
3187 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3188 inst->src[i].subreg_offset);
3189 }
3190 break;
3191 case BAD_FILE:
3192 fprintf(file, "(null)");
3193 break;
3194 case IMM:
3195 switch (inst->src[i].type) {
3196 case BRW_REGISTER_TYPE_F:
3197 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3198 break;
3199 case BRW_REGISTER_TYPE_D:
3200 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3201 break;
3202 case BRW_REGISTER_TYPE_UD:
3203 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3204 break;
3205 case BRW_REGISTER_TYPE_VF:
3206 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3207 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3208 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3209 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3210 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3211 break;
3212 default:
3213 fprintf(file, "???");
3214 break;
3215 }
3216 break;
3217 case HW_REG:
3218 if (inst->src[i].fixed_hw_reg.negate)
3219 fprintf(file, "-");
3220 if (inst->src[i].fixed_hw_reg.abs)
3221 fprintf(file, "|");
3222 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3223 switch (inst->src[i].fixed_hw_reg.nr) {
3224 case BRW_ARF_NULL:
3225 fprintf(file, "null");
3226 break;
3227 case BRW_ARF_ADDRESS:
3228 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3229 break;
3230 case BRW_ARF_ACCUMULATOR:
3231 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3232 break;
3233 case BRW_ARF_FLAG:
3234 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3235 inst->src[i].fixed_hw_reg.subnr);
3236 break;
3237 default:
3238 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3239 inst->src[i].fixed_hw_reg.subnr);
3240 break;
3241 }
3242 } else {
3243 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3244 }
3245 if (inst->src[i].fixed_hw_reg.subnr)
3246 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3247 if (inst->src[i].fixed_hw_reg.abs)
3248 fprintf(file, "|");
3249 break;
3250 default:
3251 fprintf(file, "???");
3252 break;
3253 }
3254 if (inst->src[i].abs)
3255 fprintf(file, "|");
3256
3257 if (inst->src[i].file != IMM) {
3258 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3259 }
3260
3261 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3262 fprintf(file, ", ");
3263 }
3264
3265 fprintf(file, " ");
3266
3267 if (dispatch_width == 16 && inst->exec_size == 8) {
3268 if (inst->force_sechalf)
3269 fprintf(file, "2ndhalf ");
3270 else
3271 fprintf(file, "1sthalf ");
3272 }
3273
3274 fprintf(file, "\n");
3275 }
3276
3277 /**
3278 * Possibly returns an instruction that set up @param reg.
3279 *
3280 * Sometimes we want to take the result of some expression/variable
3281 * dereference tree and rewrite the instruction generating the result
3282 * of the tree. When processing the tree, we know that the
3283 * instructions generated are all writing temporaries that are dead
3284 * outside of this tree. So, if we have some instructions that write
3285 * a temporary, we're free to point that temp write somewhere else.
3286 *
3287 * Note that this doesn't guarantee that the returned instruction wrote
3288 * only reg -- it might be the size=4 destination of a texture instruction.
3289 */
3290 fs_inst *
3291 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3292 fs_inst *end,
3293 const fs_reg &reg)
3294 {
3295 if (end == start ||
3296 end->is_partial_write() ||
3297 reg.reladdr ||
3298 !reg.equals(end->dst)) {
3299 return NULL;
3300 } else {
3301 return end;
3302 }
3303 }
3304
3305 void
3306 fs_visitor::setup_payload_gen6()
3307 {
3308 bool uses_depth =
3309 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3310 unsigned barycentric_interp_modes =
3311 (stage == MESA_SHADER_FRAGMENT) ?
3312 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3313
3314 assert(brw->gen >= 6);
3315
3316 /* R0-1: masks, pixel X/Y coordinates. */
3317 payload.num_regs = 2;
3318 /* R2: only for 32-pixel dispatch. */
3319
3320 /* R3-26: barycentric interpolation coordinates. These appear in the
3321 * same order that they appear in the brw_wm_barycentric_interp_mode
3322 * enum. Each set of coordinates occupies 2 registers if dispatch width
3323 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3324 * appear if they were enabled using the "Barycentric Interpolation
3325 * Mode" bits in WM_STATE.
3326 */
3327 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3328 if (barycentric_interp_modes & (1 << i)) {
3329 payload.barycentric_coord_reg[i] = payload.num_regs;
3330 payload.num_regs += 2;
3331 if (dispatch_width == 16) {
3332 payload.num_regs += 2;
3333 }
3334 }
3335 }
3336
3337 /* R27: interpolated depth if uses source depth */
3338 if (uses_depth) {
3339 payload.source_depth_reg = payload.num_regs;
3340 payload.num_regs++;
3341 if (dispatch_width == 16) {
3342 /* R28: interpolated depth if not SIMD8. */
3343 payload.num_regs++;
3344 }
3345 }
3346 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3347 if (uses_depth) {
3348 payload.source_w_reg = payload.num_regs;
3349 payload.num_regs++;
3350 if (dispatch_width == 16) {
3351 /* R30: interpolated W if not SIMD8. */
3352 payload.num_regs++;
3353 }
3354 }
3355
3356 if (stage == MESA_SHADER_FRAGMENT) {
3357 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3358 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3359 prog_data->uses_pos_offset = key->compute_pos_offset;
3360 /* R31: MSAA position offsets. */
3361 if (prog_data->uses_pos_offset) {
3362 payload.sample_pos_reg = payload.num_regs;
3363 payload.num_regs++;
3364 }
3365 }
3366
3367 /* R32: MSAA input coverage mask */
3368 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3369 assert(brw->gen >= 7);
3370 payload.sample_mask_in_reg = payload.num_regs;
3371 payload.num_regs++;
3372 if (dispatch_width == 16) {
3373 /* R33: input coverage mask if not SIMD8. */
3374 payload.num_regs++;
3375 }
3376 }
3377
3378 /* R34-: bary for 32-pixel. */
3379 /* R58-59: interp W for 32-pixel. */
3380
3381 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3382 source_depth_to_render_target = true;
3383 }
3384 }
3385
3386 void
3387 fs_visitor::assign_binding_table_offsets()
3388 {
3389 assert(stage == MESA_SHADER_FRAGMENT);
3390 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3391 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3392 uint32_t next_binding_table_offset = 0;
3393
3394 /* If there are no color regions, we still perform an FB write to a null
3395 * renderbuffer, which we place at surface index 0.
3396 */
3397 prog_data->binding_table.render_target_start = next_binding_table_offset;
3398 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3399
3400 assign_common_binding_table_offsets(next_binding_table_offset);
3401 }
3402
3403 void
3404 fs_visitor::calculate_register_pressure()
3405 {
3406 invalidate_live_intervals();
3407 calculate_live_intervals();
3408
3409 unsigned num_instructions = 0;
3410 foreach_block(block, cfg)
3411 num_instructions += block->instructions.length();
3412
3413 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3414
3415 for (int reg = 0; reg < virtual_grf_count; reg++) {
3416 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3417 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3418 }
3419 }
3420
3421 void
3422 fs_visitor::optimize()
3423 {
3424 calculate_cfg();
3425
3426 split_virtual_grfs();
3427
3428 move_uniform_array_access_to_pull_constants();
3429 assign_constant_locations();
3430 demote_pull_constants();
3431
3432 #define OPT(pass, args...) do { \
3433 pass_num++; \
3434 bool this_progress = pass(args); \
3435 \
3436 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3437 char filename[64]; \
3438 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3439 dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3440 \
3441 backend_visitor::dump_instructions(filename); \
3442 } \
3443 \
3444 progress = progress || this_progress; \
3445 } while (false)
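/* The OPT() wrapper runs a single pass; when DEBUG_OPTIMIZER is enabled it
 * dumps the instruction list after every pass that reported progress, with
 * the dump file named after the dispatch width, shader name, iteration and
 * pass number.
 */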
3446
3447 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3448 char filename[64];
3449 snprintf(filename, 64, "fs%d-%04d-00-start",
3450 dispatch_width, shader_prog ? shader_prog->Name : 0);
3451
3452 backend_visitor::dump_instructions(filename);
3453 }
3454
3455 bool progress;
3456 int iteration = 0;
3457 do {
3458 progress = false;
3459 iteration++;
3460 int pass_num = 0;
3461
3462 OPT(remove_duplicate_mrf_writes);
3463
3464 OPT(opt_algebraic);
3465 OPT(opt_cse);
3466 OPT(opt_copy_propagate);
3467 OPT(opt_peephole_predicated_break);
3468 OPT(dead_code_eliminate);
3469 OPT(opt_peephole_sel);
3470 OPT(dead_control_flow_eliminate, this);
3471 OPT(opt_register_renaming);
3472 OPT(opt_saturate_propagation);
3473 OPT(register_coalesce);
3474 OPT(compute_to_mrf);
3475
3476 OPT(compact_virtual_grfs);
3477 } while (progress);
3478
3479 if (lower_load_payload()) {
3480 split_virtual_grfs();
3481 register_coalesce();
3482 compute_to_mrf();
3483 dead_code_eliminate();
3484 }
3485
3486 lower_uniform_pull_constant_loads();
3487 }
3488
3489 void
3490 fs_visitor::allocate_registers()
3491 {
3492 bool allocated_without_spills;
3493
3494 static enum instruction_scheduler_mode pre_modes[] = {
3495 SCHEDULE_PRE,
3496 SCHEDULE_PRE_NON_LIFO,
3497 SCHEDULE_PRE_LIFO,
3498 };
3499
3500 /* Try each scheduling heuristic to see if it can successfully register
3501 * allocate without spilling. They should be ordered by decreasing
3502 * performance but increasing likelihood of allocating.
3503 */
3504 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3505 schedule_instructions(pre_modes[i]);
3506
3507 if (0) {
3508 assign_regs_trivial();
3509 allocated_without_spills = true;
3510 } else {
3511 allocated_without_spills = assign_regs(false);
3512 }
3513 if (allocated_without_spills)
3514 break;
3515 }
3516
3517 if (!allocated_without_spills) {
3518 /* We assume that any spilling is worse than just dropping back to
3519 * SIMD8. There's probably actually some intermediate point where
3520 * SIMD16 with a couple of spills is still better.
3521 */
3522 if (dispatch_width == 16) {
3523 fail("Failure to register allocate. Reduce number of "
3524 "live scalar values to avoid this.");
3525 } else {
3526 perf_debug("Fragment shader triggered register spilling. "
3527 "Try reducing the number of live scalar values to "
3528 "improve performance.\n");
3529 }
3530
3531 /* Since we're out of heuristics, just go spill registers until we
3532 * get an allocation.
3533 */
3534 while (!assign_regs(true)) {
3535 if (failed)
3536 break;
3537 }
3538 }
3539
3540 /* This must come after all optimization and register allocation, since
3541 * it inserts dead code that happens to have side effects, and it does
3542 * so based on the actual physical registers in use.
3543 */
3544 insert_gen4_send_dependency_workarounds();
3545
3546 if (failed)
3547 return;
3548
3549 if (!allocated_without_spills)
3550 schedule_instructions(SCHEDULE_POST);
3551
3552 if (last_scratch > 0)
3553 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3554 }
3555
3556 bool
3557 fs_visitor::run()
3558 {
3559 sanity_param_count = prog->Parameters->NumParameters;
3560
3561 assign_binding_table_offsets();
3562
3563 if (brw->gen >= 6)
3564 setup_payload_gen6();
3565 else
3566 setup_payload_gen4();
3567
3568 if (0) {
3569 emit_dummy_fs();
3570 } else if (brw->use_rep_send && dispatch_width == 16) {
3571 emit_repclear_shader();
3572 } else {
3573 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3574 emit_shader_time_begin();
3575
3576 calculate_urb_setup();
3577 if (prog->InputsRead > 0) {
3578 if (brw->gen < 6)
3579 emit_interpolation_setup_gen4();
3580 else
3581 emit_interpolation_setup_gen6();
3582 }
3583
3584 /* We handle discards by keeping track of the still-live pixels in f0.1.
3585 * Initialize it with the dispatched pixels.
3586 */
3587 bool uses_kill =
3588 (stage == MESA_SHADER_FRAGMENT) &&
3589 ((brw_wm_prog_data*) this->prog_data)->uses_kill;
3590 bool alpha_test_func =
3591 (stage == MESA_SHADER_FRAGMENT) &&
3592 ((brw_wm_prog_key*) this->key)->alpha_test_func;
3593 if (uses_kill) {
3594 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3595 discard_init->flag_subreg = 1;
3596 }
3597
3598 /* Generate FS IR for main().  (The visitor only descends into
3599 * functions called "main".)
3600 */
3601 if (shader) {
3602 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3603 base_ir = ir;
3604 this->result = reg_undef;
3605 ir->accept(this);
3606 }
3607 } else {
3608 emit_fragment_program_code();
3609 }
3610 base_ir = NULL;
3611 if (failed)
3612 return false;
3613
3614 emit(FS_OPCODE_PLACEHOLDER_HALT);
3615
3616 if (alpha_test_func)
3617 emit_alpha_test();
3618
3619 emit_fb_writes();
3620
3621 optimize();
3622
3623 assign_curb_setup();
3624 assign_urb_setup();
3625
3626 allocate_registers();
3627
3628 if (failed)
3629 return false;
3630 }
3631
3632 if (stage == MESA_SHADER_FRAGMENT) {
3633 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3634 if (dispatch_width == 8)
3635 prog_data->reg_blocks = brw_register_blocks(grf_used);
3636 else
3637 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3638 }
3639
3640 /* If any state parameters were appended, then ParameterValues could have
3641 * been realloced, in which case the driver uniform storage set up by
3642 * _mesa_associate_uniform_storage() would point to freed memory. Make
3643 * sure that didn't happen.
3644 */
3645 assert(sanity_param_count == prog->Parameters->NumParameters);
3646
3647 return !failed;
3648 }
3649
3650 const unsigned *
3651 brw_wm_fs_emit(struct brw_context *brw,
3652 void *mem_ctx,
3653 const struct brw_wm_prog_key *key,
3654 struct brw_wm_prog_data *prog_data,
3655 struct gl_fragment_program *fp,
3656 struct gl_shader_program *prog,
3657 unsigned *final_assembly_size)
3658 {
3659 bool start_busy = false;
3660 double start_time = 0;
3661
3662 if (unlikely(brw->perf_debug)) {
3663 start_busy = (brw->batch.last_bo &&
3664 drm_intel_bo_busy(brw->batch.last_bo));
3665 start_time = get_time();
3666 }
3667
3668 struct brw_shader *shader = NULL;
3669 if (prog)
3670 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3671
3672 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3673 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3674
3675 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3676 */
3677 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3678 if (!v.run()) {
3679 if (prog) {
3680 prog->LinkStatus = false;
3681 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3682 }
3683
3684 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3685 v.fail_msg);
3686
3687 return NULL;
3688 }
3689
3690 cfg_t *simd16_cfg = NULL;
3691 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3692 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3693 brw->use_rep_send)) {
3694 if (!v.simd16_unsupported) {
3695 /* Try a SIMD16 compile */
3696 v2.import_uniforms(&v);
3697 if (!v2.run()) {
3698 perf_debug("SIMD16 shader failed to compile, falling back to "
3699 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3700 } else {
3701 simd16_cfg = v2.cfg;
3702 }
3703 } else {
3704 perf_debug("SIMD16 shader unsupported, falling back to "
3705 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3706 }
3707 }
3708
3709 cfg_t *simd8_cfg;
3710 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3711 if (no_simd8 && simd16_cfg) {
3712 simd8_cfg = NULL;
3713 prog_data->no_8 = true;
3714 } else {
3715 simd8_cfg = v.cfg;
3716 prog_data->no_8 = false;
3717 }
3718
3719 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base, prog, &fp->Base,
3720 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3721 if (simd8_cfg)
3722 g.generate_code(simd8_cfg, 8);
3723 if (simd16_cfg)
3724 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
3725
3726 if (unlikely(brw->perf_debug) && shader) {
3727 if (shader->compiled_once)
3728 brw_wm_debug_recompile(brw, prog, key);
3729 shader->compiled_once = true;
3730
3731 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3732 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3733 (get_time() - start_time) * 1000);
3734 }
3735 }
3736
3737 return g.get_assembly(final_assembly_size);
3738 }
3739
3740 extern "C" bool
3741 brw_fs_precompile(struct gl_context *ctx,
3742 struct gl_shader_program *shader_prog,
3743 struct gl_program *prog)
3744 {
3745 struct brw_context *brw = brw_context(ctx);
3746 struct brw_wm_prog_key key;
3747
3748 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
3749 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3750 bool program_uses_dfdy = fp->UsesDFdy;
3751
3752 memset(&key, 0, sizeof(key));
3753
3754 if (brw->gen < 6) {
3755 if (fp->UsesKill)
3756 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3757
3758 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3759 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3760
3761 /* Just assume depth testing. */
3762 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3763 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3764 }
3765
3766 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3767 BRW_FS_VARYING_INPUT_MASK) > 16)
3768 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3769
3770 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3771 for (unsigned i = 0; i < sampler_count; i++) {
3772 if (fp->Base.ShadowSamplers & (1 << i)) {
3773 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3774 key.tex.swizzles[i] =
3775 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3776 } else {
3777 /* Color sampler: assume no swizzling. */
3778 key.tex.swizzles[i] = SWIZZLE_XYZW;
3779 }
3780 }
3781
3782 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3783 key.drawable_height = ctx->DrawBuffer->Height;
3784 }
3785
3786 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3787 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3788 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3789
3790 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3791 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3792 key.nr_color_regions > 1;
3793 }
3794
3795 key.program_string_id = bfp->id;
3796
3797 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3798 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3799
3800 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
3801
3802 brw->wm.base.prog_offset = old_prog_offset;
3803 brw->wm.prog_data = old_prog_data;
3804
3805 return success;
3806 }