i965: Set prog_data->uses_kill if simulating alpha test via discards.
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_cfg.h"
50 #include "brw_dead_control_flow.h"
51 #include "main/uniforms.h"
52 #include "brw_fs_live_variables.h"
53 #include "glsl/glsl_types.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 fs_reg *src, int sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->opcode = opcode;
62 this->dst = dst;
63 this->src = src;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (int i = 0; i < sources; ++i) {
79 if (src[i].file != GRF)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (int i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 assert(this->src[i].width > 0);
101 if (this->src[i].width == 1) {
102 this->src[i].effective_width = this->exec_size;
103 } else {
104 this->src[i].effective_width = this->src[i].width;
105 }
106 break;
107 case IMM:
108 case UNIFORM:
109 this->src[i].effective_width = this->exec_size;
110 break;
111 default:
112 unreachable("Invalid source register file");
113 }
114 }
115 this->dst.effective_width = this->exec_size;
116
117 this->conditional_mod = BRW_CONDITIONAL_NONE;
118
119 /* This will be the case for almost all instructions. */
120 switch (dst.file) {
121 case GRF:
122 case HW_REG:
123 case MRF:
124 this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
125 break;
126 case BAD_FILE:
127 this->regs_written = 0;
128 break;
129 case IMM:
130 case UNIFORM:
131 unreachable("Invalid destination register file");
132 default:
133 unreachable("Invalid register file");
134 }
135
136 this->writes_accumulator = false;
137 }
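/* Editor's note, an illustrative example that is not part of the original
 * source: given a GRF destination of width 16, stride 1 and float type,
 * passing exec_size == 0 to init() infers exec_size = 16 from dst.width,
 * and regs_written = (16 * 1 * 4 + 31) / 32 = 2 full registers.
 */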
138
139 fs_inst::fs_inst()
140 {
141 fs_reg *src = ralloc_array(this, fs_reg, 3);
142 init(BRW_OPCODE_NOP, 8, dst, src, 0);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
146 {
147 fs_reg *src = ralloc_array(this, fs_reg, 3);
148 init(opcode, exec_size, reg_undef, src, 0);
149 }
150
151 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
152 {
153 fs_reg *src = ralloc_array(this, fs_reg, 3);
154 init(opcode, 0, dst, src, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 fs_reg *src = ralloc_array(this, fs_reg, 3);
161 src[0] = src0;
162 init(opcode, exec_size, dst, src, 1);
163 }
164
165 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
166 {
167 fs_reg *src = ralloc_array(this, fs_reg, 3);
168 src[0] = src0;
169 init(opcode, 0, dst, src, 1);
170 }
171
172 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
173 const fs_reg &src0, const fs_reg &src1)
174 {
175 fs_reg *src = ralloc_array(this, fs_reg, 3);
176 src[0] = src0;
177 src[1] = src1;
178 init(opcode, exec_size, dst, src, 2);
179 }
180
181 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
182 const fs_reg &src1)
183 {
184 fs_reg *src = ralloc_array(this, fs_reg, 3);
185 src[0] = src0;
186 src[1] = src1;
187 init(opcode, 0, dst, src, 2);
188 }
189
190 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
191 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
192 {
193 fs_reg *src = ralloc_array(this, fs_reg, 3);
194 src[0] = src0;
195 src[1] = src1;
196 src[2] = src2;
197 init(opcode, exec_size, dst, src, 3);
198 }
199
200 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
201 const fs_reg &src1, const fs_reg &src2)
202 {
203 fs_reg *src = ralloc_array(this, fs_reg, 3);
204 src[0] = src0;
205 src[1] = src1;
206 src[2] = src2;
207 init(opcode, 0, dst, src, 3);
208 }
209
210 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
211 {
212 init(opcode, 0, dst, src, sources);
213 }
214
215 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
216 fs_reg src[], int sources)
217 {
218 init(opcode, exec_width, dst, src, sources);
219 }
220
221 fs_inst::fs_inst(const fs_inst &that)
222 {
223 memcpy(this, &that, sizeof(that));
224
225 this->src = ralloc_array(this, fs_reg, that.sources);
226
227 for (int i = 0; i < that.sources; i++)
228 this->src[i] = that.src[i];
229 }
230
231 void
232 fs_inst::resize_sources(uint8_t num_sources)
233 {
234 if (this->sources != num_sources) {
235 this->src = reralloc(this, this->src, fs_reg, num_sources);
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
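/* Editor's sketch, assuming ordinary preprocessor expansion (not original
 * source text): ALU2(ADD) above expands to roughly
 *
 *    fs_inst *
 *    fs_visitor::ADD(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * so each of these one-line invocations defines a small emitter helper that
 * constructs the corresponding opcode.
 */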
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 * gen5 does the comparison on the execution type (resolved source types),
341 * so dst type doesn't matter. gen6 does comparison and then uses the
342 * result as if it was the dst type with no conversion, which happens to
343 * mostly work out for float-interpreted-as-int since our comparisons are
344 * for >0, =0, <0.
345 */
346 if (brw->gen == 4) {
347 dst.type = src0.type;
348 if (dst.file == HW_REG)
349 dst.fixed_hw_reg.type = dst.type;
350 }
351
352 resolve_ud_negate(&src0);
353 resolve_ud_negate(&src1);
354
355 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
356 inst->conditional_mod = condition;
357
358 return inst;
359 }
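/* Editor's note, a hypothetical usage sketch (not original source): a typical
 * caller emits the comparison only for its flag side effect, e.g.
 *
 *    emit(CMP(reg_null_f, value, fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 *
 * and then predicates a following instruction on BRW_PREDICATE_NORMAL, which
 * consumes the per-channel bits CMP packed into the flag register.
 */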
360
361 fs_inst *
362 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
363 {
364 uint8_t exec_size = dst.width;
365 for (int i = 0; i < sources; ++i) {
366 assert(src[i].width % dst.width == 0);
367 if (src[i].width > exec_size)
368 exec_size = src[i].width;
369 }
370
371 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
372 dst, src, sources);
373 inst->regs_written = 0;
374 for (int i = 0; i < sources; ++i) {
375 /* The LOAD_PAYLOAD instruction only really makes sense if we are
376 * dealing with whole registers. If this ever changes, we can deal
377 * with it later.
378 */
379 int size = src[i].effective_width * type_sz(src[i].type);
380 assert(size % 32 == 0);
381 inst->regs_written += (size + 31) / 32;
382 }
383
384 return inst;
385 }
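/* Editor's note (illustrative, not original source): for two SIMD8 float
 * sources, each has effective_width 8, so size = 8 * 4 = 32 bytes per source
 * and the loop above accumulates regs_written = 1 + 1 = 2, i.e. the payload
 * occupies two whole registers, as the assert(size % 32 == 0) requires.
 */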
386
387 exec_list
388 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
389 const fs_reg &surf_index,
390 const fs_reg &varying_offset,
391 uint32_t const_offset)
392 {
393 exec_list instructions;
394 fs_inst *inst;
395
396 /* We have our constant surface use a pitch of 4 bytes, so our index can
397 * be any component of a vector, and then we load 4 contiguous
398 * components starting from that.
399 *
400 * We break down the const_offset to a portion added to the variable
401 * offset and a portion done using reg_offset, which means that if you
402 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
403 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
404 * CSE can later notice that those loads are all the same and eliminate
405 * the redundant ones.
406 */
407 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
408 instructions.push_tail(ADD(vec4_offset,
409 varying_offset, fs_reg(const_offset & ~3)));
410
411 int scale = 1;
412 if (brw->gen == 4 && dst.width == 8) {
413 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
414 * u, v, r) as parameters, or we can just use the SIMD16 message
415 * consisting of (header, u). We choose the second, at the cost of a
416 * longer return length.
417 */
418 scale = 2;
419 }
420
421 enum opcode op;
422 if (brw->gen >= 7)
423 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
424 else
425 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
426
427 assert(dst.width % 8 == 0);
428 int regs_written = 4 * (dst.width / 8) * scale;
429 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
430 dst.type, dst.width);
431 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
432 inst->regs_written = regs_written;
433 instructions.push_tail(inst);
434
435 if (brw->gen < 7) {
436 inst->base_mrf = 13;
437 inst->header_present = true;
438 if (brw->gen == 4)
439 inst->mlen = 3;
440 else
441 inst->mlen = 1 + dispatch_width / 8;
442 }
443
444 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
445 instructions.push_tail(MOV(dst, result));
446
447 return instructions;
448 }
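/* Editor's note (worked example, not original source): with const_offset = 14,
 * the ADD above folds (14 & ~3) = 12 into the variable offset, the pull
 * constant load fetches four contiguous components from there, and the
 * trailing MOV selects component (14 & 3) = 2 of the returned vec4 (scaled by
 * 'scale' in the gen4 SIMD8 case).
 */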
449
450 /**
451 * A helper for MOV generation for fixing up broken hardware SEND dependency
452 * handling.
453 */
454 fs_inst *
455 fs_visitor::DEP_RESOLVE_MOV(int grf)
456 {
457 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
458
459 inst->ir = NULL;
460 inst->annotation = "send dependency resolve";
461
462 /* The caller always wants an uncompressed (SIMD8) MOV, to emit the minimal
463 * extra dependencies and to avoid having to deal with aligning its regs to 2.
464 */
465 inst->exec_size = 8;
466
467 return inst;
468 }
469
470 bool
471 fs_inst::equals(fs_inst *inst) const
472 {
473 return (opcode == inst->opcode &&
474 dst.equals(inst->dst) &&
475 src[0].equals(inst->src[0]) &&
476 src[1].equals(inst->src[1]) &&
477 src[2].equals(inst->src[2]) &&
478 saturate == inst->saturate &&
479 predicate == inst->predicate &&
480 conditional_mod == inst->conditional_mod &&
481 mlen == inst->mlen &&
482 base_mrf == inst->base_mrf &&
483 target == inst->target &&
484 eot == inst->eot &&
485 header_present == inst->header_present &&
486 shadow_compare == inst->shadow_compare &&
487 exec_size == inst->exec_size &&
488 offset == inst->offset);
489 }
490
491 bool
492 fs_inst::overwrites_reg(const fs_reg &reg) const
493 {
494 return (reg.file == dst.file &&
495 reg.reg == dst.reg &&
496 reg.reg_offset >= dst.reg_offset &&
497 reg.reg_offset < dst.reg_offset + regs_written);
498 }
499
500 bool
501 fs_inst::is_send_from_grf() const
502 {
503 switch (opcode) {
504 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
505 case SHADER_OPCODE_SHADER_TIME_ADD:
506 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
507 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
508 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
509 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
510 case SHADER_OPCODE_UNTYPED_ATOMIC:
511 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
512 return true;
513 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
514 return src[1].file == GRF;
515 case FS_OPCODE_FB_WRITE:
516 return src[0].file == GRF;
517 default:
518 if (is_tex())
519 return src[0].file == GRF;
520
521 return false;
522 }
523 }
524
525 bool
526 fs_inst::can_do_source_mods(struct brw_context *brw)
527 {
528 if (brw->gen == 6 && is_math())
529 return false;
530
531 if (is_send_from_grf())
532 return false;
533
534 if (!backend_instruction::can_do_source_mods())
535 return false;
536
537 return true;
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Vector float immediate value constructor. */
585 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
586 {
587 init();
588 this->file = IMM;
589 this->type = BRW_REGISTER_TYPE_VF;
590 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
591 (vf1 << 8) |
592 (vf2 << 16) |
593 (vf3 << 24);
594 }
595
596 /** Fixed brw_reg. */
597 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
598 {
599 init();
600 this->file = HW_REG;
601 this->fixed_hw_reg = fixed_hw_reg;
602 this->type = fixed_hw_reg.type;
603 this->width = 1 << fixed_hw_reg.width;
604 }
605
606 bool
607 fs_reg::equals(const fs_reg &r) const
608 {
609 return (file == r.file &&
610 reg == r.reg &&
611 reg_offset == r.reg_offset &&
612 subreg_offset == r.subreg_offset &&
613 type == r.type &&
614 negate == r.negate &&
615 abs == r.abs &&
616 !reladdr && !r.reladdr &&
617 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
618 width == r.width &&
619 stride == r.stride);
620 }
621
622 fs_reg &
623 fs_reg::set_smear(unsigned subreg)
624 {
625 assert(file != HW_REG && file != IMM);
626 subreg_offset = subreg * type_sz(type);
627 stride = 0;
628 return *this;
629 }
630
631 bool
632 fs_reg::is_contiguous() const
633 {
634 return stride == 1;
635 }
636
637 int
638 fs_visitor::type_size(const struct glsl_type *type)
639 {
640 unsigned int size, i;
641
642 switch (type->base_type) {
643 case GLSL_TYPE_UINT:
644 case GLSL_TYPE_INT:
645 case GLSL_TYPE_FLOAT:
646 case GLSL_TYPE_BOOL:
647 return type->components();
648 case GLSL_TYPE_ARRAY:
649 return type_size(type->fields.array) * type->length;
650 case GLSL_TYPE_STRUCT:
651 size = 0;
652 for (i = 0; i < type->length; i++) {
653 size += type_size(type->fields.structure[i].type);
654 }
655 return size;
656 case GLSL_TYPE_SAMPLER:
657 /* Samplers take up no register space, since they're baked in at
658 * link time.
659 */
660 return 0;
661 case GLSL_TYPE_ATOMIC_UINT:
662 return 0;
663 case GLSL_TYPE_IMAGE:
664 case GLSL_TYPE_VOID:
665 case GLSL_TYPE_ERROR:
666 case GLSL_TYPE_INTERFACE:
667 unreachable("not reached");
668 }
669
670 return 0;
671 }
672
673 fs_reg
674 fs_visitor::get_timestamp()
675 {
676 assert(brw->gen >= 7);
677
678 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
679 BRW_ARF_TIMESTAMP,
680 0),
681 BRW_REGISTER_TYPE_UD));
682
683 fs_reg dst = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 4);
684
685 fs_inst *mov = emit(MOV(dst, ts));
686 /* We want to read the 3 fields we care about even if it's not enabled in
687 * the dispatch.
688 */
689 mov->force_writemask_all = true;
690
691 /* The caller wants the low 32 bits of the timestamp. Since it's running
692 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
693 * which is plenty of time for our purposes. It is identical across the
694 * EUs, but since it's tracking GPU core speed it will increment at a
695 * varying rate as render P-states change.
696 *
697 * The caller could also check if render P-states have changed (or anything
698 * else that might disrupt timing) by setting smear to 2 and checking if
699 * that field is != 0.
700 */
701 dst.set_smear(0);
702
703 return dst;
704 }
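/* Editor's note, a back-of-the-envelope check that is not original source:
 * a 32-bit counter ticking at ~1.2 GHz wraps after 2^32 / 1.2e9, about 3.6
 * seconds, which matches the "~3 seconds" estimate in the comment above.
 */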
705
706 void
707 fs_visitor::emit_shader_time_begin()
708 {
709 current_annotation = "shader time start";
710 shader_start_time = get_timestamp();
711 }
712
713 void
714 fs_visitor::emit_shader_time_end()
715 {
716 current_annotation = "shader time end";
717
718 enum shader_time_shader_type type, written_type, reset_type;
719 if (dispatch_width == 8) {
720 type = ST_FS8;
721 written_type = ST_FS8_WRITTEN;
722 reset_type = ST_FS8_RESET;
723 } else {
724 assert(dispatch_width == 16);
725 type = ST_FS16;
726 written_type = ST_FS16_WRITTEN;
727 reset_type = ST_FS16_RESET;
728 }
729
730 fs_reg shader_end_time = get_timestamp();
731
732 /* Check that there weren't any timestamp reset events (assuming these
733 * were the only two timestamp reads that happened).
734 */
735 fs_reg reset = shader_end_time;
736 reset.set_smear(2);
737 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
738 test->conditional_mod = BRW_CONDITIONAL_Z;
739 emit(IF(BRW_PREDICATE_NORMAL));
740
741 fs_reg start = shader_start_time;
742 start.negate = true;
743 fs_reg diff = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 1);
744 emit(ADD(diff, start, shader_end_time));
745
746 /* If there were no instructions between the two timestamp gets, the diff
747 * is 2 cycles. Remove that overhead, so I can forget about that when
748 * trying to determine the time taken for single instructions.
749 */
750 emit(ADD(diff, diff, fs_reg(-2u)));
751
752 emit_shader_time_write(type, diff);
753 emit_shader_time_write(written_type, fs_reg(1u));
754 emit(BRW_OPCODE_ELSE);
755 emit_shader_time_write(reset_type, fs_reg(1u));
756 emit(BRW_OPCODE_ENDIF);
757 }
758
759 void
760 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
761 fs_reg value)
762 {
763 int shader_time_index =
764 brw_get_shader_time_index(brw, shader_prog, prog, type);
765 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
766
767 fs_reg payload;
768 if (dispatch_width == 8)
769 payload = fs_reg(this, glsl_type::uvec2_type);
770 else
771 payload = fs_reg(this, glsl_type::uint_type);
772
773 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
774 fs_reg(), payload, offset, value));
775 }
776
777 void
778 fs_visitor::vfail(const char *format, va_list va)
779 {
780 char *msg;
781
782 if (failed)
783 return;
784
785 failed = true;
786
787 msg = ralloc_vasprintf(mem_ctx, format, va);
788 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
789
790 this->fail_msg = msg;
791
792 if (INTEL_DEBUG & DEBUG_WM) {
793 fprintf(stderr, "%s", msg);
794 }
795 }
796
797 void
798 fs_visitor::fail(const char *format, ...)
799 {
800 va_list va;
801
802 va_start(va, format);
803 vfail(format, va);
804 va_end(va);
805 }
806
807 /**
808 * Mark this program as impossible to compile in SIMD16 mode.
809 *
810 * During the SIMD8 compile (which happens first), we can detect and flag
811 * things that are unsupported in SIMD16 mode, so the compiler can skip
812 * the SIMD16 compile altogether.
813 *
814 * During a SIMD16 compile (if one happens anyway), this just calls fail().
815 */
816 void
817 fs_visitor::no16(const char *format, ...)
818 {
819 va_list va;
820
821 va_start(va, format);
822
823 if (dispatch_width == 16) {
824 vfail(format, va);
825 } else {
826 simd16_unsupported = true;
827
828 if (brw->perf_debug) {
829 if (no16_msg)
830 ralloc_vasprintf_append(&no16_msg, format, va);
831 else
832 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
833 }
834 }
835
836 va_end(va);
837 }
838
839 fs_inst *
840 fs_visitor::emit(enum opcode opcode)
841 {
842 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
843 }
844
845 fs_inst *
846 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
847 {
848 return emit(new(mem_ctx) fs_inst(opcode, dst));
849 }
850
851 fs_inst *
852 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
853 {
854 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
855 }
856
857 fs_inst *
858 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
859 const fs_reg &src1)
860 {
861 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
862 }
863
864 fs_inst *
865 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
866 const fs_reg &src1, const fs_reg &src2)
867 {
868 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
869 }
870
871 fs_inst *
872 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
873 fs_reg src[], int sources)
874 {
875 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
876 }
877
878 /**
879 * Returns true if the instruction has a flag that means it won't
880 * update an entire destination register.
881 *
882 * For example, dead code elimination and live variable analysis want to know
883 * when a write to a variable screens off any preceding values that were in
884 * it.
885 */
886 bool
887 fs_inst::is_partial_write() const
888 {
889 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
890 (this->dst.width * type_sz(this->dst.type)) < 32 ||
891 !this->dst.is_contiguous());
892 }
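/* Editor's note (illustrative, not original source): a plain SIMD8 float MOV
 * writes dst.width * type_sz = 8 * 4 = 32 bytes, a full register, so it is
 * not a partial write; the same MOV under a non-SEL predicate, or with a
 * destination stride other than 1, is partial and therefore does not screen
 * off earlier definitions for dead code elimination.
 */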
893
894 int
895 fs_inst::regs_read(fs_visitor *v, int arg) const
896 {
897 if (is_tex() && arg == 0 && src[0].file == GRF) {
898 return mlen;
899 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
900 return mlen;
901 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
902 return mlen;
903 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
904 return mlen;
905 }
906
907 switch (src[arg].file) {
908 case BAD_FILE:
909 case UNIFORM:
910 case IMM:
911 return 1;
912 case GRF:
913 case HW_REG:
914 if (src[arg].stride == 0) {
915 return 1;
916 } else {
917 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
918 return (size + 31) / 32;
919 }
920 case MRF:
921 unreachable("MRF registers are not allowed as sources");
922 default:
923 unreachable("Invalid register file");
924 }
925 }
926
927 bool
928 fs_inst::reads_flag() const
929 {
930 return predicate;
931 }
932
933 bool
934 fs_inst::writes_flag() const
935 {
936 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
937 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
938 }
939
940 /**
941 * Returns how many MRFs an FS opcode will write over.
942 *
943 * Note that this is not the 0 or 1 implied writes in an actual gen
944 * instruction -- the FS opcodes often generate MOVs in addition.
945 */
946 int
947 fs_visitor::implied_mrf_writes(fs_inst *inst)
948 {
949 if (inst->mlen == 0)
950 return 0;
951
952 if (inst->base_mrf == -1)
953 return 0;
954
955 switch (inst->opcode) {
956 case SHADER_OPCODE_RCP:
957 case SHADER_OPCODE_RSQ:
958 case SHADER_OPCODE_SQRT:
959 case SHADER_OPCODE_EXP2:
960 case SHADER_OPCODE_LOG2:
961 case SHADER_OPCODE_SIN:
962 case SHADER_OPCODE_COS:
963 return 1 * dispatch_width / 8;
964 case SHADER_OPCODE_POW:
965 case SHADER_OPCODE_INT_QUOTIENT:
966 case SHADER_OPCODE_INT_REMAINDER:
967 return 2 * dispatch_width / 8;
968 case SHADER_OPCODE_TEX:
969 case FS_OPCODE_TXB:
970 case SHADER_OPCODE_TXD:
971 case SHADER_OPCODE_TXF:
972 case SHADER_OPCODE_TXF_CMS:
973 case SHADER_OPCODE_TXF_MCS:
974 case SHADER_OPCODE_TG4:
975 case SHADER_OPCODE_TG4_OFFSET:
976 case SHADER_OPCODE_TXL:
977 case SHADER_OPCODE_TXS:
978 case SHADER_OPCODE_LOD:
979 return 1;
980 case FS_OPCODE_FB_WRITE:
981 return 2;
982 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
983 case SHADER_OPCODE_GEN4_SCRATCH_READ:
984 return 1;
985 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
986 return inst->mlen;
987 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
988 return 2;
989 case SHADER_OPCODE_UNTYPED_ATOMIC:
990 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
991 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
992 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
993 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
994 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
995 return 0;
996 default:
997 unreachable("not reached");
998 }
999 }
1000
1001 int
1002 fs_visitor::virtual_grf_alloc(int size)
1003 {
1004 if (virtual_grf_array_size <= virtual_grf_count) {
1005 if (virtual_grf_array_size == 0)
1006 virtual_grf_array_size = 16;
1007 else
1008 virtual_grf_array_size *= 2;
1009 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
1010 virtual_grf_array_size);
1011 }
1012 virtual_grf_sizes[virtual_grf_count] = size;
1013 return virtual_grf_count++;
1014 }
1015
1016 /** Fixed HW reg constructor. */
1017 fs_reg::fs_reg(enum register_file file, int reg)
1018 {
1019 init();
1020 this->file = file;
1021 this->reg = reg;
1022 this->type = BRW_REGISTER_TYPE_F;
1023
1024 switch (file) {
1025 case UNIFORM:
1026 this->width = 1;
1027 break;
1028 default:
1029 this->width = 8;
1030 }
1031 }
1032
1033 /** Fixed HW reg constructor. */
1034 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1035 {
1036 init();
1037 this->file = file;
1038 this->reg = reg;
1039 this->type = type;
1040
1041 switch (file) {
1042 case UNIFORM:
1043 this->width = 1;
1044 break;
1045 default:
1046 this->width = 8;
1047 }
1048 }
1049
1050 /** Fixed HW reg constructor. */
1051 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1052 uint8_t width)
1053 {
1054 init();
1055 this->file = file;
1056 this->reg = reg;
1057 this->type = type;
1058 this->width = width;
1059 }
1060
1061 /** Automatic reg constructor. */
1062 fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type)
1063 {
1064 init();
1065 int reg_width = v->dispatch_width / 8;
1066
1067 this->file = GRF;
1068 this->reg = v->virtual_grf_alloc(v->type_size(type) * reg_width);
1069 this->reg_offset = 0;
1070 this->type = brw_type_for_base_type(type);
1071 this->width = v->dispatch_width;
1072 assert(this->width == 8 || this->width == 16);
1073 }
1074
1075 fs_reg *
1076 fs_visitor::variable_storage(ir_variable *var)
1077 {
1078 return (fs_reg *)hash_table_find(this->variable_ht, var);
1079 }
1080
1081 void
1082 import_uniforms_callback(const void *key,
1083 void *data,
1084 void *closure)
1085 {
1086 struct hash_table *dst_ht = (struct hash_table *)closure;
1087 const fs_reg *reg = (const fs_reg *)data;
1088
1089 if (reg->file != UNIFORM)
1090 return;
1091
1092 hash_table_insert(dst_ht, data, key);
1093 }
1094
1095 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1096 * This brings in those uniform definitions.
1097 */
1098 void
1099 fs_visitor::import_uniforms(fs_visitor *v)
1100 {
1101 hash_table_call_foreach(v->variable_ht,
1102 import_uniforms_callback,
1103 variable_ht);
1104 this->push_constant_loc = v->push_constant_loc;
1105 this->pull_constant_loc = v->pull_constant_loc;
1106 this->uniforms = v->uniforms;
1107 this->param_size = v->param_size;
1108 }
1109
1110 /* Our support for uniforms is piggy-backed on the struct
1111 * gl_fragment_program, because that's where the values actually
1112 * get stored, rather than in some global gl_shader_program uniform
1113 * store.
1114 */
1115 void
1116 fs_visitor::setup_uniform_values(ir_variable *ir)
1117 {
1118 int namelen = strlen(ir->name);
1119
1120 /* The data for our (non-builtin) uniforms is stored in a series of
1121 * gl_uniform_driver_storage structs for each subcomponent that
1122 * glGetUniformLocation() could name. We know it's been set up in the same
1123 * order we'd walk the type, so walk the list of storage and find anything
1124 * with our name, or the prefix of a component that starts with our name.
1125 */
1126 unsigned params_before = uniforms;
1127 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1128 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1129
1130 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1131 (storage->name[namelen] != 0 &&
1132 storage->name[namelen] != '.' &&
1133 storage->name[namelen] != '[')) {
1134 continue;
1135 }
1136
1137 unsigned slots = storage->type->component_slots();
1138 if (storage->array_elements)
1139 slots *= storage->array_elements;
1140
1141 for (unsigned i = 0; i < slots; i++) {
1142 stage_prog_data->param[uniforms++] = &storage->storage[i];
1143 }
1144 }
1145
1146 /* Make sure we actually initialized the right amount of stuff here. */
1147 assert(params_before + ir->type->component_slots() == uniforms);
1148 (void)params_before;
1149 }
1150
1151
1152 /* Our support for builtin uniforms is even scarier than non-builtin.
1153 * It sits on top of the PROG_STATE_VAR parameters that are
1154 * automatically updated from GL context state.
1155 */
1156 void
1157 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1158 {
1159 const ir_state_slot *const slots = ir->get_state_slots();
1160 assert(slots != NULL);
1161
1162 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1163 /* This state reference has already been setup by ir_to_mesa, but we'll
1164 * get the same index back here.
1165 */
1166 int index = _mesa_add_state_reference(this->prog->Parameters,
1167 (gl_state_index *)slots[i].tokens);
1168
1169 /* Add each of the unique swizzles of the element as a parameter.
1170 * This'll end up matching the expected layout of the
1171 * array/matrix/structure we're trying to fill in.
1172 */
1173 int last_swiz = -1;
1174 for (unsigned int j = 0; j < 4; j++) {
1175 int swiz = GET_SWZ(slots[i].swizzle, j);
1176 if (swiz == last_swiz)
1177 break;
1178 last_swiz = swiz;
1179
1180 stage_prog_data->param[uniforms++] =
1181 &prog->Parameters->ParameterValues[index][swiz];
1182 }
1183 }
1184 }
1185
1186 fs_reg *
1187 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1188 {
1189 assert(stage == MESA_SHADER_FRAGMENT);
1190 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1191 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1192 fs_reg wpos = *reg;
1193 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1194
1195 /* gl_FragCoord.x */
1196 if (ir->data.pixel_center_integer) {
1197 emit(MOV(wpos, this->pixel_x));
1198 } else {
1199 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1200 }
1201 wpos = offset(wpos, 1);
1202
1203 /* gl_FragCoord.y */
1204 if (!flip && ir->data.pixel_center_integer) {
1205 emit(MOV(wpos, this->pixel_y));
1206 } else {
1207 fs_reg pixel_y = this->pixel_y;
1208 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1209
1210 if (flip) {
1211 pixel_y.negate = true;
1212 offset += key->drawable_height - 1.0;
1213 }
1214
1215 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1216 }
1217 wpos = offset(wpos, 1);
1218
1219 /* gl_FragCoord.z */
1220 if (brw->gen >= 6) {
1221 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1222 } else {
1223 emit(FS_OPCODE_LINTERP, wpos,
1224 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1225 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1226 interp_reg(VARYING_SLOT_POS, 2));
1227 }
1228 wpos = offset(wpos, 1);
1229
1230 /* gl_FragCoord.w: Already set up in emit_interpolation */
1231 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1232
1233 return reg;
1234 }
1235
1236 fs_inst *
1237 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1238 glsl_interp_qualifier interpolation_mode,
1239 bool is_centroid, bool is_sample)
1240 {
1241 brw_wm_barycentric_interp_mode barycoord_mode;
1242 if (brw->gen >= 6) {
1243 if (is_centroid) {
1244 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1245 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1246 else
1247 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1248 } else if (is_sample) {
1249 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1250 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1251 else
1252 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1253 } else {
1254 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1255 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1256 else
1257 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1258 }
1259 } else {
1260 /* On Ironlake and below, there is only one interpolation mode.
1261 * Centroid interpolation doesn't mean anything on this hardware --
1262 * there is no multisampling.
1263 */
1264 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1265 }
1266 return emit(FS_OPCODE_LINTERP, attr,
1267 this->delta_x[barycoord_mode],
1268 this->delta_y[barycoord_mode], interp);
1269 }
1270
1271 fs_reg *
1272 fs_visitor::emit_general_interpolation(ir_variable *ir)
1273 {
1274 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1275 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1276 fs_reg attr = *reg;
1277
1278 assert(stage == MESA_SHADER_FRAGMENT);
1279 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1280 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1281
1282 unsigned int array_elements;
1283 const glsl_type *type;
1284
1285 if (ir->type->is_array()) {
1286 array_elements = ir->type->length;
1287 if (array_elements == 0) {
1288 fail("dereferenced array '%s' has length 0\n", ir->name);
1289 }
1290 type = ir->type->fields.array;
1291 } else {
1292 array_elements = 1;
1293 type = ir->type;
1294 }
1295
1296 glsl_interp_qualifier interpolation_mode =
1297 ir->determine_interpolation_mode(key->flat_shade);
1298
1299 int location = ir->data.location;
1300 for (unsigned int i = 0; i < array_elements; i++) {
1301 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1302 if (prog_data->urb_setup[location] == -1) {
1303 /* If there's no incoming setup data for this slot, don't
1304 * emit interpolation for it.
1305 */
1306 attr = offset(attr, type->vector_elements);
1307 location++;
1308 continue;
1309 }
1310
1311 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1312 /* Constant interpolation (flat shading) case. The SF has
1313 * handed us defined values in only the constant offset
1314 * field of the setup reg.
1315 */
1316 for (unsigned int k = 0; k < type->vector_elements; k++) {
1317 struct brw_reg interp = interp_reg(location, k);
1318 interp = suboffset(interp, 3);
1319 interp.type = reg->type;
1320 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1321 attr = offset(attr, 1);
1322 }
1323 } else {
1324 /* Smooth/noperspective interpolation case. */
1325 for (unsigned int k = 0; k < type->vector_elements; k++) {
1326 struct brw_reg interp = interp_reg(location, k);
1327 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1328 /* Get the pixel/sample mask into f0 so that we know
1329 * which pixels are lit. Then, for each channel that is
1330 * unlit, replace the centroid data with non-centroid
1331 * data.
1332 */
1333 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1334
1335 fs_inst *inst;
1336 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1337 false, false);
1338 inst->predicate = BRW_PREDICATE_NORMAL;
1339 inst->predicate_inverse = true;
1340 if (brw->has_pln)
1341 inst->no_dd_clear = true;
1342
1343 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1344 ir->data.centroid && !key->persample_shading,
1345 ir->data.sample || key->persample_shading);
1346 inst->predicate = BRW_PREDICATE_NORMAL;
1347 inst->predicate_inverse = false;
1348 if (brw->has_pln)
1349 inst->no_dd_check = true;
1350
1351 } else {
1352 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1353 ir->data.centroid && !key->persample_shading,
1354 ir->data.sample || key->persample_shading);
1355 }
1356 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1357 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1358 }
1359 attr = offset(attr, 1);
1360 }
1361
1362 }
1363 location++;
1364 }
1365 }
1366
1367 return reg;
1368 }
1369
1370 fs_reg *
1371 fs_visitor::emit_frontfacing_interpolation()
1372 {
1373 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::bool_type);
1374
1375 if (brw->gen >= 6) {
1376 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1377 * a boolean result from this (~0/true or 0/false).
1378 *
1379 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1380 * this task in only one instruction:
1381 * - a negation source modifier will flip the bit; and
1382 * - a W -> D type conversion will sign extend the bit into the high
1383 * word of the destination.
1384 *
1385 * An ASR 15 fills the low word of the destination.
1386 */
1387 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1388 g0.negate = true;
1389
1390 emit(ASR(*reg, g0, fs_reg(15)));
1391 } else {
1392 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1393 * a boolean result from this (1/true or 0/false).
1394 *
1395 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1396 * the negation source modifier to flip it. Unfortunately the SHR
1397 * instruction only operates on UD (or D with an abs source modifier)
1398 * sources without negation.
1399 *
1400 * Instead, use ASR (which will give ~0/true or 0/false) followed by an
1401 * AND 1.
1402 */
1403 fs_reg asr = fs_reg(this, glsl_type::bool_type);
1404 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1405 g1_6.negate = true;
1406
1407 emit(ASR(asr, g1_6, fs_reg(31)));
1408 emit(AND(*reg, asr, fs_reg(1)));
1409 }
1410
1411 return reg;
1412 }
1413
1414 void
1415 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1416 {
1417 assert(stage == MESA_SHADER_FRAGMENT);
1418 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1419 assert(dst.type == BRW_REGISTER_TYPE_F);
1420
1421 if (key->compute_pos_offset) {
1422 /* Convert int_sample_pos to floating point */
1423 emit(MOV(dst, int_sample_pos));
1424 /* Scale to the range [0, 1] */
1425 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1426 }
1427 else {
1428 /* From ARB_sample_shading specification:
1429 * "When rendering to a non-multisample buffer, or if multisample
1430 * rasterization is disabled, gl_SamplePosition will always be
1431 * (0.5, 0.5)."
1432 */
1433 emit(MOV(dst, fs_reg(0.5f)));
1434 }
1435 }
1436
1437 fs_reg *
1438 fs_visitor::emit_samplepos_setup()
1439 {
1440 assert(brw->gen >= 6);
1441
1442 this->current_annotation = "compute sample position";
1443 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec2_type);
1444 fs_reg pos = *reg;
1445 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1446 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1447
1448 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1449 * mode will be enabled.
1450 *
1451 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1452 * R31.1:0 Position Offset X/Y for Slot[3:0]
1453 * R31.3:2 Position Offset X/Y for Slot[7:4]
1454 * .....
1455 *
1456 * The X, Y sample positions come in as bytes in thread payload. So, read
1457 * the positions using vstride=16, width=8, hstride=2.
1458 */
1459 struct brw_reg sample_pos_reg =
1460 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1461 BRW_REGISTER_TYPE_B), 16, 8, 2);
1462
1463 if (dispatch_width == 8) {
1464 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1465 } else {
1466 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1467 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1468 ->force_sechalf = true;
1469 }
1470 /* Compute gl_SamplePosition.x */
1471 compute_sample_position(pos, int_sample_x);
1472 pos = offset(pos, 1);
1473 if (dispatch_width == 8) {
1474 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1475 } else {
1476 emit(MOV(half(int_sample_y, 0),
1477 fs_reg(suboffset(sample_pos_reg, 1))));
1478 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1479 ->force_sechalf = true;
1480 }
1481 /* Compute gl_SamplePosition.y */
1482 compute_sample_position(pos, int_sample_y);
1483 return reg;
1484 }
1485
1486 fs_reg *
1487 fs_visitor::emit_sampleid_setup()
1488 {
1489 assert(stage == MESA_SHADER_FRAGMENT);
1490 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1491 assert(brw->gen >= 6);
1492
1493 this->current_annotation = "compute sample id";
1494 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::int_type);
1495
1496 if (key->compute_sample_id) {
1497 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1498 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1499 t2.type = BRW_REGISTER_TYPE_UW;
1500
1501 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1502 * 8x multisampling, subspan 0 will represent sample N (where N
1503 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1504 * 7. We can find the value of N by looking at R0.0 bits 7:6
1505 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1506 * (since samples are always delivered in pairs). That is, we
1507 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1508 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1509 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1510 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1511 * populating a temporary variable with the sequence (0, 1, 2, 3),
1512 * and then reading from it using vstride=1, width=4, hstride=0.
1513 * These computations hold good for 4x multisampling as well.
1514 *
1515 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1516 * the first four slots are sample 0 of subspan 0; the next four
1517 * are sample 1 of subspan 0; the third group is sample 0 of
1518 * subspan 1, and finally sample 1 of subspan 1.
1519 */
1520 fs_inst *inst;
1521 inst = emit(BRW_OPCODE_AND, t1,
1522 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1523 fs_reg(0xc0));
1524 inst->force_writemask_all = true;
1525 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1526 inst->force_writemask_all = true;
1527 /* This works for both SIMD8 and SIMD16 */
1528 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1529 inst->force_writemask_all = true;
1530 /* This special instruction takes care of setting vstride=1,
1531 * width=4, hstride=0 of t2 during an ADD instruction.
1532 */
1533 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1534 } else {
1535 /* As per GL_ARB_sample_shading specification:
1536 * "When rendering to a non-multisample buffer, or if multisample
1537 * rasterization is disabled, gl_SampleID will always be zero."
1538 */
1539 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1540 }
1541
1542 return reg;
1543 }
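/* Editor's note (worked example, not original source): if R0.0 bits 7:6 hold
 * SSPI = 2, then (R0.0 & 0xc0) >> 5 = 0x80 >> 5 = 4 = 2 * SSPI.  Adding the
 * 0x3210 vector, read so that each element repeats four times, gives sample
 * ids 4 and 5 for the two SIMD8 subspans (4, 5, 6, 7 across a SIMD16 run),
 * matching the description in the comment above.
 */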
1544
1545 fs_reg
1546 fs_visitor::fix_math_operand(fs_reg src)
1547 {
1548 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1549 * might be able to do better by doing execsize = 1 math and then
1550 * expanding that result out, but we would need to be careful with
1551 * masking.
1552 *
1553 * The hardware ignores source modifiers (negate and abs) on math
1554 * instructions, so we also move to a temp to set those up.
1555 */
1556 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1557 !src.abs && !src.negate)
1558 return src;
1559
1560 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1561 * operands to math
1562 */
1563 if (brw->gen >= 7 && src.file != IMM)
1564 return src;
1565
1566 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1567 expanded.type = src.type;
1568 emit(BRW_OPCODE_MOV, expanded, src);
1569 return expanded;
1570 }
1571
1572 fs_inst *
1573 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1574 {
1575 switch (opcode) {
1576 case SHADER_OPCODE_RCP:
1577 case SHADER_OPCODE_RSQ:
1578 case SHADER_OPCODE_SQRT:
1579 case SHADER_OPCODE_EXP2:
1580 case SHADER_OPCODE_LOG2:
1581 case SHADER_OPCODE_SIN:
1582 case SHADER_OPCODE_COS:
1583 break;
1584 default:
1585 unreachable("not reached: bad math opcode");
1586 }
1587
1588 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1589 * might be able to do better by doing execsize = 1 math and then
1590 * expanding that result out, but we would need to be careful with
1591 * masking.
1592 *
1593 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1594 * instructions, so we also move to a temp to set those up.
1595 */
1596 if (brw->gen == 6 || brw->gen == 7)
1597 src = fix_math_operand(src);
1598
1599 fs_inst *inst = emit(opcode, dst, src);
1600
1601 if (brw->gen < 6) {
1602 inst->base_mrf = 2;
1603 inst->mlen = dispatch_width / 8;
1604 }
1605
1606 return inst;
1607 }
1608
1609 fs_inst *
1610 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1611 {
1612 int base_mrf = 2;
1613 fs_inst *inst;
1614
1615 if (brw->gen >= 8) {
1616 inst = emit(opcode, dst, src0, src1);
1617 } else if (brw->gen >= 6) {
1618 src0 = fix_math_operand(src0);
1619 src1 = fix_math_operand(src1);
1620
1621 inst = emit(opcode, dst, src0, src1);
1622 } else {
1623 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1624 * "Message Payload":
1625 *
1626 * "Operand0[7]. For the INT DIV functions, this operand is the
1627 * denominator."
1628 * ...
1629 * "Operand1[7]. For the INT DIV functions, this operand is the
1630 * numerator."
1631 */
1632 bool is_int_div = opcode != SHADER_OPCODE_POW;
1633 fs_reg &op0 = is_int_div ? src1 : src0;
1634 fs_reg &op1 = is_int_div ? src0 : src1;
1635
1636 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1637 inst = emit(opcode, dst, op0, reg_null_f);
1638
1639 inst->base_mrf = base_mrf;
1640 inst->mlen = 2 * dispatch_width / 8;
1641 }
1642 return inst;
1643 }
1644
1645 void
1646 fs_visitor::assign_curb_setup()
1647 {
1648 if (dispatch_width == 8) {
1649 prog_data->dispatch_grf_start_reg = payload.num_regs;
1650 } else {
1651 assert(stage == MESA_SHADER_FRAGMENT);
1652 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1653 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1654 }
1655
1656 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1657
1658 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1659 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1660 for (unsigned int i = 0; i < inst->sources; i++) {
1661 if (inst->src[i].file == UNIFORM) {
1662 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1663 int constant_nr;
1664 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1665 constant_nr = push_constant_loc[uniform_nr];
1666 } else {
1667 /* Section 5.11 of the OpenGL 4.1 spec says:
1668 * "Out-of-bounds reads return undefined values, which include
1669 * values from other variables of the active program or zero."
1670 * Just return the first push constant.
1671 */
1672 constant_nr = 0;
1673 }
1674
1675 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1676 constant_nr / 8,
1677 constant_nr % 8);
1678
1679 inst->src[i].file = HW_REG;
1680 inst->src[i].fixed_hw_reg = byte_offset(
1681 retype(brw_reg, inst->src[i].type),
1682 inst->src[i].subreg_offset);
1683 }
1684 }
1685 }
1686 }
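/* Editor's note (illustrative, not original source): with the mapping above,
 * a push constant at constant_nr = 10 lands in GRF (payload.num_regs + 1),
 * component 2, since 10 / 8 = 1 and 10 % 8 = 2.
 */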
1687
1688 void
1689 fs_visitor::calculate_urb_setup()
1690 {
1691 assert(stage == MESA_SHADER_FRAGMENT);
1692 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1693 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1694
1695 memset(prog_data->urb_setup, -1,
1696 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1697
1698 int urb_next = 0;
1699 /* Figure out where each of the incoming setup attributes lands. */
1700 if (brw->gen >= 6) {
1701 if (_mesa_bitcount_64(prog->InputsRead &
1702 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1703 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1704 * first 16 varying inputs, so we can put them wherever we want.
1705 * Just put them in order.
1706 *
1707 * This is useful because it means that (a) inputs not used by the
1708 * fragment shader won't take up valuable register space, and (b) we
1709 * won't have to recompile the fragment shader if it gets paired with
1710 * a different vertex (or geometry) shader.
1711 */
1712 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1713 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1714 BITFIELD64_BIT(i)) {
1715 prog_data->urb_setup[i] = urb_next++;
1716 }
1717 }
1718 } else {
1719 /* We have enough input varyings that the SF/SBE pipeline stage can't
1720 * arbitrarily rearrange them to suit our whim; we have to put them
1721 * in an order that matches the output of the previous pipeline stage
1722 * (geometry or vertex shader).
1723 */
1724 struct brw_vue_map prev_stage_vue_map;
1725 brw_compute_vue_map(brw, &prev_stage_vue_map,
1726 key->input_slots_valid);
1727 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1728 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1729 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1730 slot++) {
1731 int varying = prev_stage_vue_map.slot_to_varying[slot];
1732 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1733 * unused.
1734 */
1735 if (varying != BRW_VARYING_SLOT_COUNT &&
1736 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1737 BITFIELD64_BIT(varying))) {
1738 prog_data->urb_setup[varying] = slot - first_slot;
1739 }
1740 }
1741 urb_next = prev_stage_vue_map.num_slots - first_slot;
1742 }
1743 } else {
1744 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1745 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1746 /* Point size is packed into the header, not as a general attribute */
1747 if (i == VARYING_SLOT_PSIZ)
1748 continue;
1749
1750 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1751 /* The back color slot is skipped when the front color is
1752 * also written to. In addition, some slots can be
1753 * written in the vertex shader and not read in the
1754 * fragment shader. So the register number must always be
1755 * incremented, mapped or not.
1756 */
1757 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1758 prog_data->urb_setup[i] = urb_next;
1759 urb_next++;
1760 }
1761 }
1762
1763 /*
1764 * It's an FS-only attribute, and the SF thread did the interpolation for
1765 * it, so count it here too.
1766 *
1767 * See compile_sf_prog() for more info.
1768 */
1769 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1770 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1771 }
1772
1773 prog_data->num_varying_inputs = urb_next;
1774 }
1775
1776 void
1777 fs_visitor::assign_urb_setup()
1778 {
1779 assert(stage == MESA_SHADER_FRAGMENT);
1780 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1781
1782 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1783
1784 /* Offset all the urb_setup[] index by the actual position of the
1785 * setup regs, now that the location of the constants has been chosen.
1786 */
1787 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1788 if (inst->opcode == FS_OPCODE_LINTERP) {
1789 assert(inst->src[2].file == HW_REG);
1790 inst->src[2].fixed_hw_reg.nr += urb_start;
1791 }
1792
1793 if (inst->opcode == FS_OPCODE_CINTERP) {
1794 assert(inst->src[0].file == HW_REG);
1795 inst->src[0].fixed_hw_reg.nr += urb_start;
1796 }
1797 }
1798
1799 /* Each attribute is 4 setup channels, each of which is half a reg. */
1800 this->first_non_payload_grf =
1801 urb_start + prog_data->num_varying_inputs * 2;
1802 }
1803
1804 /**
1805 * Split large virtual GRFs into separate components if we can.
1806 *
1807 * This is mostly duplicated with what brw_fs_vector_splitting does,
1808 * but that's really conservative because it's afraid of doing
1809 * splitting that doesn't result in real progress after the rest of
1810 * the optimization phases, which would cause infinite looping in
1811 * optimization. We can do it once here, safely. This also has the
1812 * opportunity to split interpolated values, or maybe even uniforms,
1813 * which we don't have at the IR level.
1814 *
1815 * We want to split, because virtual GRFs are what we register
1816 * allocate and spill (due to contiguousness requirements for some
1817 * instructions), and they're what we naturally generate in the
1818 * codegen process, but most virtual GRFs don't actually need to be
1819 * contiguous sets of GRFs. If we split, we'll end up with reduced
1820 * live intervals and better dead code elimination and coalescing.
1821 */
1822 void
1823 fs_visitor::split_virtual_grfs()
1824 {
1825 int num_vars = this->virtual_grf_count;
1826
1827 /* Count the total number of registers */
1828 int reg_count = 0;
1829 int vgrf_to_reg[num_vars];
1830 for (int i = 0; i < num_vars; i++) {
1831 vgrf_to_reg[i] = reg_count;
1832 reg_count += virtual_grf_sizes[i];
1833 }
1834
1835 /* An array of "split points". For each register slot, this indicates
1836 * if this slot can be separated from the previous slot. Every time an
1837 * instruction uses multiple elements of a register (as a source or
1838 * destination), we mark the used slots as inseparable. Then we go
1839 * through and split the registers into the smallest pieces we can.
1840 */
1841 bool split_points[reg_count];
1842 memset(split_points, 0, sizeof(split_points));
1843
1844 /* Mark all used registers as fully splittable */
1845 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1846 if (inst->dst.file == GRF) {
1847 int reg = vgrf_to_reg[inst->dst.reg];
1848 for (int j = 1; j < this->virtual_grf_sizes[inst->dst.reg]; j++)
1849 split_points[reg + j] = true;
1850 }
1851
1852 for (int i = 0; i < inst->sources; i++) {
1853 if (inst->src[i].file == GRF) {
1854 int reg = vgrf_to_reg[inst->src[i].reg];
1855 for (int j = 1; j < this->virtual_grf_sizes[inst->src[i].reg]; j++)
1856 split_points[reg + j] = true;
1857 }
1858 }
1859 }
1860
1861 if (brw->has_pln &&
1862 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1863 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1864 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1865 * Gen6, that was the only supported interpolation mode, and since Gen6,
1866 * delta_x and delta_y are in fixed hardware registers.
1867 */
1868 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1869 split_points[vgrf_to_reg[vgrf] + 1] = false;
1870 }
1871
1872 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1873 if (inst->dst.file == GRF) {
1874 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1875 for (int j = 1; j < inst->regs_written; j++)
1876 split_points[reg + j] = false;
1877 }
1878 for (int i = 0; i < inst->sources; i++) {
1879 if (inst->src[i].file == GRF) {
1880 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1881 for (int j = 1; j < inst->regs_read(this, i); j++)
1882 split_points[reg + j] = false;
1883 }
1884 }
1885 }
1886
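/* Per-slot remap tables: new_virtual_grf[] records which VGRF each
 * original register slot lands in, and new_reg_offset[] its offset within
 * that VGRF.  The final piece of each original VGRF keeps its old number;
 * earlier pieces get freshly allocated VGRFs.
 */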
1887 int new_virtual_grf[reg_count];
1888 int new_reg_offset[reg_count];
1889
1890 int reg = 0;
1891 for (int i = 0; i < num_vars; i++) {
1892 /* The first one should always be 0 as a quick sanity check. */
1893 assert(split_points[reg] == false);
1894
1895 /* j = 0 case */
1896 new_reg_offset[reg] = 0;
1897 reg++;
1898 int offset = 1;
1899
1900 /* j > 0 case */
1901 for (int j = 1; j < virtual_grf_sizes[i]; j++) {
1902 /* If this is a split point, reset the offset to 0 and allocate a
1903 * new virtual GRF to hold the previous 'offset' registers.
1904 */
1905 if (split_points[reg]) {
1906 assert(offset <= MAX_VGRF_SIZE);
1907 int grf = virtual_grf_alloc(offset);
1908 for (int k = reg - offset; k < reg; k++)
1909 new_virtual_grf[k] = grf;
1910 offset = 0;
1911 }
1912 new_reg_offset[reg] = offset;
1913 offset++;
1914 reg++;
1915 }
1916
1917 /* The last one gets the original register number */
1918 assert(offset <= MAX_VGRF_SIZE);
1919 virtual_grf_sizes[i] = offset;
1920 for (int k = reg - offset; k < reg; k++)
1921 new_virtual_grf[k] = i;
1922 }
1923 assert(reg == reg_count);
1924
1925 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1926 if (inst->dst.file == GRF) {
1927 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1928 inst->dst.reg = new_virtual_grf[reg];
1929 inst->dst.reg_offset = new_reg_offset[reg];
1930 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
1931 }
1932 for (int i = 0; i < inst->sources; i++) {
1933 if (inst->src[i].file == GRF) {
1934 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1935 inst->src[i].reg = new_virtual_grf[reg];
1936 inst->src[i].reg_offset = new_reg_offset[reg];
1937 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
1938 }
1939 }
1940 }
1941 invalidate_live_intervals();
1942 }
1943
1944 /**
1945 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1946 *
1947 * During code generation, we create tons of temporary variables, many of
1948 * which get immediately killed and are never used again. Yet, in later
1949 * optimization and analysis passes, such as compute_live_intervals, we need
1950 * to loop over all the virtual GRFs. Compacting them can save a lot of
1951 * overhead.
1952 */
1953 bool
1954 fs_visitor::compact_virtual_grfs()
1955 {
1956 bool progress = false;
1957 int remap_table[this->virtual_grf_count];
1958 memset(remap_table, -1, sizeof(remap_table));
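/* remap_table[i] stays -1 for virtual GRFs that are never referenced;
 * for live ones it will end up holding the new, densely packed index.
 */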
1959
1960 /* Mark which virtual GRFs are used. */
1961 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1962 if (inst->dst.file == GRF)
1963 remap_table[inst->dst.reg] = 0;
1964
1965 for (int i = 0; i < inst->sources; i++) {
1966 if (inst->src[i].file == GRF)
1967 remap_table[inst->src[i].reg] = 0;
1968 }
1969 }
1970
1971 /* Compact the GRF arrays. */
1972 int new_index = 0;
1973 for (int i = 0; i < this->virtual_grf_count; i++) {
1974 if (remap_table[i] == -1) {
1975 /* We just found an unused register. This means that we are
1976 * actually going to compact something.
1977 */
1978 progress = true;
1979 } else {
1980 remap_table[i] = new_index;
1981 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1982 invalidate_live_intervals();
1983 ++new_index;
1984 }
1985 }
1986
1987 this->virtual_grf_count = new_index;
1988
1989 /* Patch all the instructions to use the newly renumbered registers */
1990 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1991 if (inst->dst.file == GRF)
1992 inst->dst.reg = remap_table[inst->dst.reg];
1993
1994 for (int i = 0; i < inst->sources; i++) {
1995 if (inst->src[i].file == GRF)
1996 inst->src[i].reg = remap_table[inst->src[i].reg];
1997 }
1998 }
1999
2000 /* Patch all the references to delta_x/delta_y, since they're used in
2001 * register allocation. If they're unused, switch them to BAD_FILE so
2002 * we don't think some random VGRF is delta_x/delta_y.
2003 */
2004 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2005 if (delta_x[i].file == GRF) {
2006 if (remap_table[delta_x[i].reg] != -1) {
2007 delta_x[i].reg = remap_table[delta_x[i].reg];
2008 } else {
2009 delta_x[i].file = BAD_FILE;
2010 }
2011 }
2012 }
2013 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2014 if (delta_y[i].file == GRF) {
2015 if (remap_table[delta_y[i].reg] != -1) {
2016 delta_y[i].reg = remap_table[delta_y[i].reg];
2017 } else {
2018 delta_y[i].file = BAD_FILE;
2019 }
2020 }
2021 }
2022
2023 return progress;
2024 }
2025
2026 /*
2027 * Implements array access of uniforms by inserting a
2028 * PULL_CONSTANT_LOAD instruction.
2029 *
2030 * Unlike temporary GRF array access (where we don't support it due to
2031 * the difficulty of doing relative addressing on instruction
2032 * destinations), we could potentially do array access of uniforms
2033 * that were loaded in GRF space as push constants. In real-world
2034 * usage we've seen, though, the arrays being used are always larger
2035 * than we could load as push constants, so just always move all
2036 * uniform array access out to a pull constant buffer.
2037 */
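/* Illustrative example (GLSL, not from any particular shader):
 *
 *    uniform vec4 colors[64];
 *    ...
 *    gl_FragColor = colors[index];
 *
 * The variably-indexed read reaches us as a UNIFORM source carrying a
 * reladdr, and it is that access which gets copied out to the pull
 * constant buffer below.
 */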
2038 void
2039 fs_visitor::move_uniform_array_access_to_pull_constants()
2040 {
2041 if (dispatch_width != 8)
2042 return;
2043
2044 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2045 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2046
2047 /* Walk through and find array access of uniforms. Put a copy of that
2048 * uniform in the pull constant buffer.
2049 *
2050 * Note that we don't move constant-indexed accesses to arrays. No
2051 * testing has been done of the performance impact of this choice.
2052 */
2053 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2054 for (int i = 0 ; i < inst->sources; i++) {
2055 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2056 continue;
2057
2058 int uniform = inst->src[i].reg;
2059
2060 /* If this array isn't already present in the pull constant buffer,
2061 * add it.
2062 */
2063 if (pull_constant_loc[uniform] == -1) {
2064 const gl_constant_value **values = &stage_prog_data->param[uniform];
2065
2066 assert(param_size[uniform]);
2067
2068 for (int j = 0; j < param_size[uniform]; j++) {
2069 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2070
2071 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2072 values[j];
2073 }
2074 }
2075 }
2076 }
2077 }
2078
2079 /**
2080 * Assign UNIFORM file registers to either push constants or pull constants.
2081 *
2082 * We allow a fragment shader to have more than the specified
2083 * minimum-maximum number of fragment shader uniform components (64). If
2084 * there are too many of these, they'd fill up all of register space.
2085 * So, this will push some of them out to the pull constant buffer and
2086 * update the program to load them.
2087 */
2088 void
2089 fs_visitor::assign_constant_locations()
2090 {
2091 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2092 if (dispatch_width != 8)
2093 return;
2094
2095 /* Find which UNIFORM registers are still in use. */
2096 bool is_live[uniforms];
2097 for (unsigned int i = 0; i < uniforms; i++) {
2098 is_live[i] = false;
2099 }
2100
2101 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2102 for (int i = 0; i < inst->sources; i++) {
2103 if (inst->src[i].file != UNIFORM)
2104 continue;
2105
2106 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2107 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2108 is_live[constant_nr] = true;
2109 }
2110 }
2111
2112 /* Only allow 16 registers (128 uniform components) as push constants.
2113 *
2114 * Just demote the end of the list. We could probably do better
2115 * here, demoting things that are rarely used in the program first.
2116 *
2117 * If changing this value, note the limitation about total_regs in
2118 * brw_curbe.c.
2119 */
2120 unsigned int max_push_components = 16 * 8;
2121 unsigned int num_push_constants = 0;
2122
2123 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2124
2125 for (unsigned int i = 0; i < uniforms; i++) {
2126 if (!is_live[i] || pull_constant_loc[i] != -1) {
2127 /* This UNIFORM register is either dead, or has already been demoted
2128 * to a pull const. Mark it as no longer living in the param[] array.
2129 */
2130 push_constant_loc[i] = -1;
2131 continue;
2132 }
2133
2134 if (num_push_constants < max_push_components) {
2135 /* Retain as a push constant. Record the location in the params[]
2136 * array.
2137 */
2138 push_constant_loc[i] = num_push_constants++;
2139 } else {
2140 /* Demote to a pull constant. */
2141 push_constant_loc[i] = -1;
2142
2143 int pull_index = stage_prog_data->nr_pull_params++;
2144 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2145 pull_constant_loc[i] = pull_index;
2146 }
2147 }
2148
2149 stage_prog_data->nr_params = num_push_constants;
2150
2151 /* Up until now, the param[] array has been indexed by reg + reg_offset
2152 * of UNIFORM registers. Condense it to only contain the uniforms we
2153 * chose to upload as push constants.
2154 */
2155 for (unsigned int i = 0; i < uniforms; i++) {
2156 int remapped = push_constant_loc[i];
2157
2158 if (remapped == -1)
2159 continue;
2160
2161 assert(remapped <= (int)i);
2162 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2163 }
2164 }
2165
2166 /**
2167 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2168 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2169 */
2170 void
2171 fs_visitor::demote_pull_constants()
2172 {
2173 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2174 for (int i = 0; i < inst->sources; i++) {
2175 if (inst->src[i].file != UNIFORM)
2176 continue;
2177
2178 int pull_index = pull_constant_loc[inst->src[i].reg +
2179 inst->src[i].reg_offset];
2180 if (pull_index == -1)
2181 continue;
2182
2183 /* Set up the annotation tracking for newly generated instructions. */
2184 base_ir = inst->ir;
2185 current_annotation = inst->annotation;
2186
2187 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2188 fs_reg dst = fs_reg(this, glsl_type::float_type);
2189
2190 /* Generate a pull load into dst. */
2191 if (inst->src[i].reladdr) {
2192 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2193 surf_index,
2194 *inst->src[i].reladdr,
2195 pull_index);
2196 inst->insert_before(block, &list);
2197 inst->src[i].reladdr = NULL;
2198 } else {
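/* The SIMD8 uniform pull load fetches an aligned vec4: round the byte
 * offset (4 bytes per constant) down to a 16-byte boundary, and use
 * set_smear() to select which of the four dwords within it this source
 * actually reads.
 */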
2199 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2200 fs_inst *pull =
2201 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2202 dst, surf_index, offset);
2203 inst->insert_before(block, pull);
2204 inst->src[i].set_smear(pull_index & 3);
2205 }
2206
2207 /* Rewrite the instruction to use the temporary VGRF. */
2208 inst->src[i].file = GRF;
2209 inst->src[i].reg = dst.reg;
2210 inst->src[i].reg_offset = 0;
2211 inst->src[i].width = dispatch_width;
2212 }
2213 }
2214 invalidate_live_intervals();
2215 }
2216
2217 bool
2218 fs_visitor::opt_algebraic()
2219 {
2220 bool progress = false;
2221
2222 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2223 switch (inst->opcode) {
2224 case BRW_OPCODE_MUL:
2225 if (inst->src[1].file != IMM)
2226 continue;
2227
2228 /* a * 1.0 = a */
2229 if (inst->src[1].is_one()) {
2230 inst->opcode = BRW_OPCODE_MOV;
2231 inst->src[1] = reg_undef;
2232 progress = true;
2233 break;
2234 }
2235
2236 /* a * 0.0 = 0.0 */
2237 if (inst->src[1].is_zero()) {
2238 inst->opcode = BRW_OPCODE_MOV;
2239 inst->src[0] = inst->src[1];
2240 inst->src[1] = reg_undef;
2241 progress = true;
2242 break;
2243 }
2244
2245 break;
2246 case BRW_OPCODE_ADD:
2247 if (inst->src[1].file != IMM)
2248 continue;
2249
2250 /* a + 0.0 = a */
2251 if (inst->src[1].is_zero()) {
2252 inst->opcode = BRW_OPCODE_MOV;
2253 inst->src[1] = reg_undef;
2254 progress = true;
2255 break;
2256 }
2257 break;
2258 case BRW_OPCODE_OR:
2259 if (inst->src[0].equals(inst->src[1])) {
2260 inst->opcode = BRW_OPCODE_MOV;
2261 inst->src[1] = reg_undef;
2262 progress = true;
2263 break;
2264 }
2265 break;
2266 case BRW_OPCODE_LRP:
2267 if (inst->src[1].equals(inst->src[2])) {
2268 inst->opcode = BRW_OPCODE_MOV;
2269 inst->src[0] = inst->src[1];
2270 inst->src[1] = reg_undef;
2271 inst->src[2] = reg_undef;
2272 progress = true;
2273 break;
2274 }
2275 break;
2276 case BRW_OPCODE_SEL:
2277 if (inst->src[0].equals(inst->src[1])) {
2278 inst->opcode = BRW_OPCODE_MOV;
2279 inst->src[1] = reg_undef;
2280 inst->predicate = BRW_PREDICATE_NONE;
2281 inst->predicate_inverse = false;
2282 progress = true;
2283 } else if (inst->saturate && inst->src[1].file == IMM) {
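/* With saturate set, a sel.l/sel.le against an immediate >= 1.0 (or
 * sel.g/sel.ge against one <= 0.0) can never affect the result, since
 * the other operand gets clamped to [0, 1] anyway.  Reduce the SEL to a
 * saturating MOV of src0.
 */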
2284 switch (inst->conditional_mod) {
2285 case BRW_CONDITIONAL_LE:
2286 case BRW_CONDITIONAL_L:
2287 switch (inst->src[1].type) {
2288 case BRW_REGISTER_TYPE_F:
2289 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2290 inst->opcode = BRW_OPCODE_MOV;
2291 inst->src[1] = reg_undef;
2292 progress = true;
2293 }
2294 break;
2295 default:
2296 break;
2297 }
2298 break;
2299 case BRW_CONDITIONAL_GE:
2300 case BRW_CONDITIONAL_G:
2301 switch (inst->src[1].type) {
2302 case BRW_REGISTER_TYPE_F:
2303 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2304 inst->opcode = BRW_OPCODE_MOV;
2305 inst->src[1] = reg_undef;
2306 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2307 progress = true;
2308 }
2309 break;
2310 default:
2311 break;
2312 }
2313 default:
2314 break;
2315 }
2316 }
2317 break;
2318 case SHADER_OPCODE_RCP: {
2319 fs_inst *prev = (fs_inst *)inst->prev;
2320 if (prev->opcode == SHADER_OPCODE_SQRT) {
2321 if (inst->src[0].equals(prev->dst)) {
2322 inst->opcode = SHADER_OPCODE_RSQ;
2323 inst->src[0] = prev->src[0];
2324 progress = true;
2325 }
2326 }
2327 break;
2328 }
2329 default:
2330 break;
2331 }
2332 }
2333
2334 return progress;
2335 }
2336
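/**
 * Give values that completely redefine a virtual GRF a fresh register number.
 *
 * Outside of control flow, when an instruction fully overwrites a VGRF that
 * has already been assigned once, the new value is moved to a newly
 * allocated VGRF.  This breaks false dependencies between unrelated values
 * that happen to reuse the same register, shortening live ranges for the
 * register allocator and later coalescing.
 */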
2337 bool
2338 fs_visitor::opt_register_renaming()
2339 {
2340 bool progress = false;
2341 int depth = 0;
2342
2343 int remap[virtual_grf_count];
2344 memset(remap, -1, sizeof(int) * virtual_grf_count);
2345
2346 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2347 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2348 depth++;
2349 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2350 inst->opcode == BRW_OPCODE_WHILE) {
2351 depth--;
2352 }
2353
2354 /* Rewrite instruction sources. */
2355 for (int i = 0; i < inst->sources; i++) {
2356 if (inst->src[i].file == GRF &&
2357 remap[inst->src[i].reg] != -1 &&
2358 remap[inst->src[i].reg] != inst->src[i].reg) {
2359 inst->src[i].reg = remap[inst->src[i].reg];
2360 progress = true;
2361 }
2362 }
2363
2364 const int dst = inst->dst.reg;
2365
2366 if (depth == 0 &&
2367 inst->dst.file == GRF &&
2368 virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
2369 !inst->is_partial_write()) {
2370 if (remap[dst] == -1) {
2371 remap[dst] = dst;
2372 } else {
2373 remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
2374 inst->dst.reg = remap[dst];
2375 progress = true;
2376 }
2377 } else if (inst->dst.file == GRF &&
2378 remap[dst] != -1 &&
2379 remap[dst] != dst) {
2380 inst->dst.reg = remap[dst];
2381 progress = true;
2382 }
2383 }
2384
2385 if (progress) {
2386 invalidate_live_intervals();
2387
2388 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2389 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2390 delta_x[i].reg = remap[delta_x[i].reg];
2391 }
2392 }
2393 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2394 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2395 delta_y[i].reg = remap[delta_y[i].reg];
2396 }
2397 }
2398 }
2399
2400 return progress;
2401 }
2402
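/**
 * Fold MOVs from a GRF into an MRF back into the instruction that produced
 * the GRF value, so the producer writes the MRF directly.
 *
 * Only relevant on Gen4-6; Gen7+ has no MRFs.
 */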
2403 bool
2404 fs_visitor::compute_to_mrf()
2405 {
2406 bool progress = false;
2407 int next_ip = 0;
2408
2409 /* No MRFs on Gen >= 7. */
2410 if (brw->gen >= 7)
2411 return false;
2412
2413 calculate_live_intervals();
2414
2415 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2416 int ip = next_ip;
2417 next_ip++;
2418
2419 if (inst->opcode != BRW_OPCODE_MOV ||
2420 inst->is_partial_write() ||
2421 inst->dst.file != MRF || inst->src[0].file != GRF ||
2422 inst->dst.type != inst->src[0].type ||
2423 inst->src[0].abs || inst->src[0].negate ||
2424 !inst->src[0].is_contiguous() ||
2425 inst->src[0].subreg_offset)
2426 continue;
2427
2428 /* Work out which hardware MRF registers are written by this
2429 * instruction.
2430 */
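/* A COMPR4 destination writes MRF m and m + 4 (one GRF's worth each);
 * an ordinary SIMD16 write covers two consecutive MRFs; a SIMD8 write
 * covers a single MRF.
 */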
2431 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2432 int mrf_high;
2433 if (inst->dst.reg & BRW_MRF_COMPR4) {
2434 mrf_high = mrf_low + 4;
2435 } else if (inst->exec_size == 16) {
2436 mrf_high = mrf_low + 1;
2437 } else {
2438 mrf_high = mrf_low;
2439 }
2440
2441 /* Can't compute-to-MRF this GRF if someone else was going to
2442 * read it later.
2443 */
2444 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2445 continue;
2446
2447 /* Found a move of a GRF to a MRF. Let's see if we can go
2448 * rewrite the thing that made this GRF to write into the MRF.
2449 */
2450 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2451 if (scan_inst->dst.file == GRF &&
2452 scan_inst->dst.reg == inst->src[0].reg) {
2453 /* Found the last instruction to write the reg we want to turn
2454 * into a compute-to-MRF.
2455 */
2456
2457 /* If this one instruction didn't populate all the
2458 * channels, bail. We might be able to rewrite everything
2459 * that writes that reg, but it would require smarter
2460 * tracking to delay the rewriting until complete success.
2461 */
2462 if (scan_inst->is_partial_write())
2463 break;
2464
2465 /* Things returning more than one register would need us to
2466 * understand coalescing out more than one MOV at a time.
2467 */
2468 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2469 break;
2470
2471 /* SEND instructions can't have MRF as a destination. */
2472 if (scan_inst->mlen)
2473 break;
2474
2475 if (brw->gen == 6) {
2476 /* gen6 math instructions must have the destination be
2477 * GRF, so no compute-to-MRF for them.
2478 */
2479 if (scan_inst->is_math()) {
2480 break;
2481 }
2482 }
2483
2484 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2485 /* Found the creator of our MRF's source value. */
2486 scan_inst->dst.file = MRF;
2487 scan_inst->dst.reg = inst->dst.reg;
2488 scan_inst->saturate |= inst->saturate;
2489 inst->remove(block);
2490 progress = true;
2491 }
2492 break;
2493 }
2494
2495 /* We don't handle control flow here. Most computation of
2496 * values that end up in MRFs are shortly before the MRF
2497 * write anyway.
2498 */
2499 if (block->start() == scan_inst)
2500 break;
2501
2502 /* You can't read from an MRF, so if someone else reads our
2503 * MRF's source GRF that we wanted to rewrite, that stops us.
2504 */
2505 bool interfered = false;
2506 for (int i = 0; i < scan_inst->sources; i++) {
2507 if (scan_inst->src[i].file == GRF &&
2508 scan_inst->src[i].reg == inst->src[0].reg &&
2509 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2510 interfered = true;
2511 }
2512 }
2513 if (interfered)
2514 break;
2515
2516 if (scan_inst->dst.file == MRF) {
2517 /* If somebody else writes our MRF here, we can't
2518 * compute-to-MRF before that.
2519 */
2520 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2521 int scan_mrf_high;
2522
2523 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2524 scan_mrf_high = scan_mrf_low + 4;
2525 } else if (scan_inst->exec_size == 16) {
2526 scan_mrf_high = scan_mrf_low + 1;
2527 } else {
2528 scan_mrf_high = scan_mrf_low;
2529 }
2530
2531 if (mrf_low == scan_mrf_low ||
2532 mrf_low == scan_mrf_high ||
2533 mrf_high == scan_mrf_low ||
2534 mrf_high == scan_mrf_high) {
2535 break;
2536 }
2537 }
2538
2539 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2540 /* Found a SEND instruction, which means that there are
2541 * live values in MRFs from base_mrf to base_mrf +
2542 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2543 * above it.
2544 */
2545 if (mrf_low >= scan_inst->base_mrf &&
2546 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2547 break;
2548 }
2549 if (mrf_high >= scan_inst->base_mrf &&
2550 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2551 break;
2552 }
2553 }
2554 }
2555 }
2556
2557 if (progress)
2558 invalidate_live_intervals();
2559
2560 return progress;
2561 }
2562
2563 /**
2564 * Emit a minimal "replicated data" clear shader: a single uniform color is
2565 * broadcast to every enabled render target using FS_OPCODE_REP_FB_WRITE.
2566 */
2567 void
2568 fs_visitor::emit_repclear_shader()
2569 {
2570 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2571 int base_mrf = 1;
2572 int color_mrf = base_mrf + 2;
2573
2574 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2575 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2576 mov->force_writemask_all = true;
2577
2578 fs_inst *write;
2579 if (key->nr_color_regions == 1) {
2580 write = emit(FS_OPCODE_REP_FB_WRITE);
2581 write->saturate = key->clamp_fragment_color;
2582 write->base_mrf = color_mrf;
2583 write->target = 0;
2584 write->header_present = false;
2585 write->mlen = 1;
2586 } else {
2587 assume(key->nr_color_regions > 0);
2588 for (int i = 0; i < key->nr_color_regions; ++i) {
2589 write = emit(FS_OPCODE_REP_FB_WRITE);
2590 write->saturate = key->clamp_fragment_color;
2591 write->base_mrf = base_mrf;
2592 write->target = i;
2593 write->header_present = true;
2594 write->mlen = 3;
2595 }
2596 }
2597 write->eot = true;
2598
2599 calculate_cfg();
2600
2601 assign_constant_locations();
2602 assign_curb_setup();
2603
2604 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2605 assert(mov->src[0].file == HW_REG);
2606 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2607 }
2608
2609 /**
2610 * Walks through basic blocks, looking for repeated MRF writes and
2611 * removing the later ones.
2612 */
2613 bool
2614 fs_visitor::remove_duplicate_mrf_writes()
2615 {
2616 fs_inst *last_mrf_move[16];
2617 bool progress = false;
2618
2619 /* Need to update the MRF tracking for compressed instructions. */
2620 if (dispatch_width == 16)
2621 return false;
2622
2623 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2624
2625 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2626 if (inst->is_control_flow()) {
2627 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2628 }
2629
2630 if (inst->opcode == BRW_OPCODE_MOV &&
2631 inst->dst.file == MRF) {
2632 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2633 if (prev_inst && inst->equals(prev_inst)) {
2634 inst->remove(block);
2635 progress = true;
2636 continue;
2637 }
2638 }
2639
2640 /* Clear out the last-write records for MRFs that were overwritten. */
2641 if (inst->dst.file == MRF) {
2642 last_mrf_move[inst->dst.reg] = NULL;
2643 }
2644
2645 if (inst->mlen > 0 && inst->base_mrf != -1) {
2646 /* Found a SEND instruction, which will include two or fewer
2647 * implied MRF writes. We could do better here.
2648 */
2649 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2650 last_mrf_move[inst->base_mrf + i] = NULL;
2651 }
2652 }
2653
2654 /* Clear out any MRF move records whose sources got overwritten. */
2655 if (inst->dst.file == GRF) {
2656 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2657 if (last_mrf_move[i] &&
2658 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2659 last_mrf_move[i] = NULL;
2660 }
2661 }
2662 }
2663
2664 if (inst->opcode == BRW_OPCODE_MOV &&
2665 inst->dst.file == MRF &&
2666 inst->src[0].file == GRF &&
2667 !inst->is_partial_write()) {
2668 last_mrf_move[inst->dst.reg] = inst;
2669 }
2670 }
2671
2672 if (progress)
2673 invalidate_live_intervals();
2674
2675 return progress;
2676 }
2677
2678 static void
2679 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2680 int first_grf, int grf_len)
2681 {
2682 /* Clear the flag for registers that actually got read (as expected). */
2683 for (int i = 0; i < inst->sources; i++) {
2684 int grf;
2685 if (inst->src[i].file == GRF) {
2686 grf = inst->src[i].reg;
2687 } else if (inst->src[i].file == HW_REG &&
2688 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2689 grf = inst->src[i].fixed_hw_reg.nr;
2690 } else {
2691 continue;
2692 }
2693
2694 if (grf >= first_grf &&
2695 grf < first_grf + grf_len) {
2696 deps[grf - first_grf] = false;
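/* A SIMD16 access reads two consecutive GRFs, so clear the flag for the
 * second register as well.
 */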
2697 if (inst->exec_size == 16)
2698 deps[grf - first_grf + 1] = false;
2699 }
2700 }
2701 }
2702
2703 /**
2704 * Implements this workaround for the original 965:
2705 *
2706 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2707 * check for post destination dependencies on this instruction, software
2708 * must ensure that there is no destination hazard for the case of ‘write
2709 * followed by a posted write’ shown in the following example.
2710 *
2711 * 1. mov r3 0
2712 * 2. send r3.xy <rest of send instruction>
2713 * 3. mov r2 r3
2714 *
2715 * Due to no post-destination dependency check on the ‘send’, the above
2716 * code sequence could have two instructions (1 and 2) in flight at the
2717 * same time that both consider ‘r3’ as the target of their final writes.
2718 */
2719 void
2720 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2721 fs_inst *inst)
2722 {
2723 int write_len = inst->regs_written;
2724 int first_write_grf = inst->dst.reg;
2725 bool needs_dep[BRW_MAX_MRF];
2726 assert(write_len < (int)sizeof(needs_dep) - 1);
2727
2728 memset(needs_dep, false, sizeof(needs_dep));
2729 memset(needs_dep, true, write_len);
2730
2731 clear_deps_for_inst_src(inst, dispatch_width,
2732 needs_dep, first_write_grf, write_len);
2733
2734 /* Walk backwards looking for writes to registers we're writing which
2735 * aren't read since being written. If we hit the start of the program,
2736 * we assume that there are no outstanding dependencies on entry to the
2737 * program.
2738 */
2739 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2740 /* If we hit control flow, assume that there *are* outstanding
2741 * dependencies, and force their cleanup before our instruction.
2742 */
2743 if (block->start() == scan_inst) {
2744 for (int i = 0; i < write_len; i++) {
2745 if (needs_dep[i]) {
2746 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2747 }
2748 }
2749 return;
2750 }
2751
2752 /* We insert our reads as late as possible on the assumption that any
2753 * instruction but a MOV that might have left us an outstanding
2754 * dependency has more latency than a MOV.
2755 */
2756 if (scan_inst->dst.file == GRF) {
2757 for (int i = 0; i < scan_inst->regs_written; i++) {
2758 int reg = scan_inst->dst.reg + i;
2759
2760 if (reg >= first_write_grf &&
2761 reg < first_write_grf + write_len &&
2762 needs_dep[reg - first_write_grf]) {
2763 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2764 needs_dep[reg - first_write_grf] = false;
2765 if (scan_inst->exec_size == 16)
2766 needs_dep[reg - first_write_grf + 1] = false;
2767 }
2768 }
2769 }
2770
2771 /* Clear the flag for registers that actually got read (as expected). */
2772 clear_deps_for_inst_src(scan_inst, dispatch_width,
2773 needs_dep, first_write_grf, write_len);
2774
2775 /* Continue the loop only if we haven't resolved all the dependencies */
2776 int i;
2777 for (i = 0; i < write_len; i++) {
2778 if (needs_dep[i])
2779 break;
2780 }
2781 if (i == write_len)
2782 return;
2783 }
2784 }
2785
2786 /**
2787 * Implements this workaround for the original 965:
2788 *
2789 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2790 * used as a destination register until after it has been sourced by an
2791 * instruction with a different destination register.
2792 */
2793 void
2794 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2795 {
2796 int write_len = inst->regs_written;
2797 int first_write_grf = inst->dst.reg;
2798 bool needs_dep[BRW_MAX_MRF];
2799 assert(write_len < (int)sizeof(needs_dep) - 1);
2800
2801 memset(needs_dep, false, sizeof(needs_dep));
2802 memset(needs_dep, true, write_len);
2803 /* Walk forwards looking for writes to registers we're writing which aren't
2804 * read before being written.
2805 */
2806 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2807 /* If we hit control flow, force resolve all remaining dependencies. */
2808 if (block->end() == scan_inst) {
2809 for (int i = 0; i < write_len; i++) {
2810 if (needs_dep[i])
2811 scan_inst->insert_before(block,
2812 DEP_RESOLVE_MOV(first_write_grf + i));
2813 }
2814 return;
2815 }
2816
2817 /* Clear the flag for registers that actually got read (as expected). */
2818 clear_deps_for_inst_src(scan_inst, dispatch_width,
2819 needs_dep, first_write_grf, write_len);
2820
2821 /* We insert our reads as late as possible since they're reading the
2822 * result of a SEND, which has massive latency.
2823 */
2824 if (scan_inst->dst.file == GRF &&
2825 scan_inst->dst.reg >= first_write_grf &&
2826 scan_inst->dst.reg < first_write_grf + write_len &&
2827 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2828 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2829 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2830 }
2831
2832 /* Continue the loop only if we haven't resolved all the dependencies */
2833 int i;
2834 for (i = 0; i < write_len; i++) {
2835 if (needs_dep[i])
2836 break;
2837 }
2838 if (i == write_len)
2839 return;
2840 }
2841
2842 /* If we hit the end of the program, resolve all remaining dependencies out
2843 * of paranoia.
2844 */
2845 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2846 assert(last_inst->eot);
2847 for (int i = 0; i < write_len; i++) {
2848 if (needs_dep[i])
2849 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2850 }
2851 }
2852
2853 void
2854 fs_visitor::insert_gen4_send_dependency_workarounds()
2855 {
2856 if (brw->gen != 4 || brw->is_g4x)
2857 return;
2858
2859 bool progress = false;
2860
2861 /* Note that we're done with register allocation, so GRF fs_regs always
2862 * have a .reg_offset of 0.
2863 */
2864
2865 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2866 if (inst->mlen != 0 && inst->dst.file == GRF) {
2867 insert_gen4_pre_send_dependency_workarounds(block, inst);
2868 insert_gen4_post_send_dependency_workarounds(block, inst);
2869 progress = true;
2870 }
2871 }
2872
2873 if (progress)
2874 invalidate_live_intervals();
2875 }
2876
2877 /**
2878 * Turns the generic expression-style uniform pull constant load instruction
2879 * into a hardware-specific series of instructions for loading a pull
2880 * constant.
2881 *
2882 * The expression style allows the CSE pass before this to optimize out
2883 * repeated loads from the same offset, and gives the pre-register-allocation
2884 * scheduling full flexibility, while the conversion to native instructions
2885 * allows the post-register-allocation scheduler the best information
2886 * possible.
2887 *
2888 * Note that execution masking for setting up pull constant loads is special:
2889 * the channels that need to be written are unrelated to the current execution
2890 * mask, since a later instruction will use one of the result channels as a
2891 * source operand for all 8 or 16 of its channels.
2892 */
2893 void
2894 fs_visitor::lower_uniform_pull_constant_loads()
2895 {
2896 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2897 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2898 continue;
2899
2900 if (brw->gen >= 7) {
2901 /* The offset arg before was a vec4-aligned byte offset. We need to
2902 * turn it into a dword offset.
2903 */
2904 fs_reg const_offset_reg = inst->src[1];
2905 assert(const_offset_reg.file == IMM &&
2906 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2907 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2908 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2909
2910 /* This is actually going to be a MOV, but since only the first dword
2911 * is accessed, we have a special opcode to do just that one. Note
2912 * that this needs to be an operation that will be considered a def
2913 * by live variable analysis, or register allocation will explode.
2914 */
2915 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2916 8, payload, const_offset_reg);
2917 setup->force_writemask_all = true;
2918
2919 setup->ir = inst->ir;
2920 setup->annotation = inst->annotation;
2921 inst->insert_before(block, setup);
2922
2923 /* Similarly, this will only populate the first 4 channels of the
2924 * result register (since we only use smear values from 0-3), but we
2925 * don't tell the optimizer.
2926 */
2927 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2928 inst->src[1] = payload;
2929
2930 invalidate_live_intervals();
2931 } else {
2932 /* Before register allocation, we didn't tell the scheduler about the
2933 * MRF we use. We know it's safe to use this MRF because nothing
2934 * else does except for register spill/unspill, which generates and
2935 * uses its MRF within a single IR instruction.
2936 */
2937 inst->base_mrf = 14;
2938 inst->mlen = 1;
2939 }
2940 }
2941 }
2942
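/**
 * Lower SHADER_OPCODE_LOAD_PAYLOAD into a sequence of MOVs that copy each
 * source into consecutive registers of the destination payload.
 *
 * Per-register metadata (force_writemask_all / force_sechalf) is tracked so
 * that every generated MOV inherits the execution controls of whatever wrote
 * its source.  When the destination is an MRF and the hardware has COMPR4,
 * two matching SIMD8 halves are fused into one compressed SIMD16 MOV.
 */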
2943 bool
2944 fs_visitor::lower_load_payload()
2945 {
2946 bool progress = false;
2947
2948 int vgrf_to_reg[virtual_grf_count];
2949 int reg_count = 16; /* Leave room for MRF */
2950 for (int i = 0; i < virtual_grf_count; ++i) {
2951 vgrf_to_reg[i] = reg_count;
2952 reg_count += virtual_grf_sizes[i];
2953 }
2954
2955 struct {
2956 bool written:1; /* Whether this register has ever been written */
2957 bool force_writemask_all:1;
2958 bool force_sechalf:1;
2959 } metadata[reg_count];
2960 memset(metadata, 0, sizeof(metadata));
2961
2962 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2963 int dst_reg;
2964 if (inst->dst.file == GRF) {
2965 dst_reg = vgrf_to_reg[inst->dst.reg];
2966 } else {
2967 /* MRF */
2968 dst_reg = inst->dst.reg;
2969 }
2970
2971 if (inst->dst.file == MRF || inst->dst.file == GRF) {
2972 bool force_sechalf = inst->force_sechalf;
2973 bool toggle_sechalf = inst->dst.width == 16 &&
2974 type_sz(inst->dst.type) == 4;
2975 for (int i = 0; i < inst->regs_written; ++i) {
2976 metadata[dst_reg + i].written = true;
2977 metadata[dst_reg + i].force_sechalf = force_sechalf;
2978 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
2979 force_sechalf = (toggle_sechalf != force_sechalf);
2980 }
2981 }
2982
2983 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2984 assert(inst->dst.file == MRF || inst->dst.file == GRF);
2985 fs_reg dst = inst->dst;
2986
2987 for (int i = 0; i < inst->sources; i++) {
2988 dst.width = inst->src[i].effective_width;
2989 dst.type = inst->src[i].type;
2990
2991 if (inst->src[i].file == BAD_FILE) {
2992 /* Do nothing but otherwise increment as normal */
2993 } else if (dst.file == MRF &&
2994 dst.width == 8 &&
2995 brw->has_compr4 &&
2996 i + 4 < inst->sources &&
2997 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
2998 fs_reg compr4_dst = dst;
2999 compr4_dst.reg += BRW_MRF_COMPR4;
3000 compr4_dst.width = 16;
3001 fs_reg compr4_src = inst->src[i];
3002 compr4_src.width = 16;
3003 fs_inst *mov = MOV(compr4_dst, compr4_src);
3004 mov->force_writemask_all = true;
3005 inst->insert_before(block, mov);
3006 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3007 inst->src[i + 4].file = BAD_FILE;
3008 } else {
3009 fs_inst *mov = MOV(dst, inst->src[i]);
3010 if (inst->src[i].file == GRF) {
3011 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3012 inst->src[i].reg_offset;
3013 mov->force_sechalf = metadata[src_reg].force_sechalf;
3014 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3015 metadata[dst_reg] = metadata[src_reg];
3016 if (dst.width * type_sz(dst.type) > 32) {
3017 assert((!metadata[src_reg].written ||
3018 !metadata[src_reg].force_sechalf) &&
3019 (!metadata[src_reg + 1].written ||
3020 metadata[src_reg + 1].force_sechalf));
3021 metadata[dst_reg + 1] = metadata[src_reg + 1];
3022 }
3023 } else {
3024 metadata[dst_reg].force_writemask_all = false;
3025 metadata[dst_reg].force_sechalf = false;
3026 if (dst.width == 16) {
3027 metadata[dst_reg + 1].force_writemask_all = false;
3028 metadata[dst_reg + 1].force_sechalf = true;
3029 }
3030 }
3031 inst->insert_before(block, mov);
3032 }
3033
3034 dst = offset(dst, 1);
3035 }
3036
3037 inst->remove(block);
3038 progress = true;
3039 }
3040 }
3041
3042 if (progress)
3043 invalidate_live_intervals();
3044
3045 return progress;
3046 }
3047
3048 void
3049 fs_visitor::dump_instructions()
3050 {
3051 dump_instructions(NULL);
3052 }
3053
3054 void
3055 fs_visitor::dump_instructions(const char *name)
3056 {
3057 calculate_register_pressure();
3058 FILE *file = stderr;
3059 if (name && geteuid() != 0) {
3060 file = fopen(name, "w");
3061 if (!file)
3062 file = stderr;
3063 }
3064
3065 int ip = 0, max_pressure = 0;
3066 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3067 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3068 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3069 dump_instruction(inst, file);
3070 ++ip;
3071 }
3072 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3073
3074 if (file != stderr) {
3075 fclose(file);
3076 }
3077 }
3078
3079 void
3080 fs_visitor::dump_instruction(backend_instruction *be_inst)
3081 {
3082 dump_instruction(be_inst, stderr);
3083 }
3084
3085 void
3086 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3087 {
3088 fs_inst *inst = (fs_inst *)be_inst;
3089
3090 if (inst->predicate) {
3091 fprintf(file, "(%cf0.%d) ",
3092 inst->predicate_inverse ? '-' : '+',
3093 inst->flag_subreg);
3094 }
3095
3096 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3097 if (inst->saturate)
3098 fprintf(file, ".sat");
3099 if (inst->conditional_mod) {
3100 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3101 if (!inst->predicate &&
3102 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3103 inst->opcode != BRW_OPCODE_IF &&
3104 inst->opcode != BRW_OPCODE_WHILE))) {
3105 fprintf(file, ".f0.%d", inst->flag_subreg);
3106 }
3107 }
3108 fprintf(file, "(%d) ", inst->exec_size);
3109
3110
3111 switch (inst->dst.file) {
3112 case GRF:
3113 fprintf(file, "vgrf%d", inst->dst.reg);
3114 if (inst->dst.width != dispatch_width)
3115 fprintf(file, "@%d", inst->dst.width);
3116 if (virtual_grf_sizes[inst->dst.reg] != inst->dst.width / 8 ||
3117 inst->dst.subreg_offset)
3118 fprintf(file, "+%d.%d",
3119 inst->dst.reg_offset, inst->dst.subreg_offset);
3120 break;
3121 case MRF:
3122 fprintf(file, "m%d", inst->dst.reg);
3123 break;
3124 case BAD_FILE:
3125 fprintf(file, "(null)");
3126 break;
3127 case UNIFORM:
3128 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3129 break;
3130 case HW_REG:
3131 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3132 switch (inst->dst.fixed_hw_reg.nr) {
3133 case BRW_ARF_NULL:
3134 fprintf(file, "null");
3135 break;
3136 case BRW_ARF_ADDRESS:
3137 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3138 break;
3139 case BRW_ARF_ACCUMULATOR:
3140 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3141 break;
3142 case BRW_ARF_FLAG:
3143 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3144 inst->dst.fixed_hw_reg.subnr);
3145 break;
3146 default:
3147 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3148 inst->dst.fixed_hw_reg.subnr);
3149 break;
3150 }
3151 } else {
3152 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3153 }
3154 if (inst->dst.fixed_hw_reg.subnr)
3155 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3156 break;
3157 default:
3158 fprintf(file, "???");
3159 break;
3160 }
3161 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3162
3163 for (int i = 0; i < inst->sources; i++) {
3164 if (inst->src[i].negate)
3165 fprintf(file, "-");
3166 if (inst->src[i].abs)
3167 fprintf(file, "|");
3168 switch (inst->src[i].file) {
3169 case GRF:
3170 fprintf(file, "vgrf%d", inst->src[i].reg);
3171 if (inst->src[i].width != dispatch_width)
3172 fprintf(file, "@%d", inst->src[i].width);
3173 if (virtual_grf_sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3174 inst->src[i].subreg_offset)
3175 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3176 inst->src[i].subreg_offset);
3177 break;
3178 case MRF:
3179 fprintf(file, "***m%d***", inst->src[i].reg);
3180 break;
3181 case UNIFORM:
3182 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3183 if (inst->src[i].reladdr) {
3184 fprintf(file, "+reladdr");
3185 } else if (inst->src[i].subreg_offset) {
3186 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3187 inst->src[i].subreg_offset);
3188 }
3189 break;
3190 case BAD_FILE:
3191 fprintf(file, "(null)");
3192 break;
3193 case IMM:
3194 switch (inst->src[i].type) {
3195 case BRW_REGISTER_TYPE_F:
3196 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3197 break;
3198 case BRW_REGISTER_TYPE_D:
3199 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3200 break;
3201 case BRW_REGISTER_TYPE_UD:
3202 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3203 break;
3204 case BRW_REGISTER_TYPE_VF:
3205 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3206 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3207 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3208 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3209 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3210 break;
3211 default:
3212 fprintf(file, "???");
3213 break;
3214 }
3215 break;
3216 case HW_REG:
3217 if (inst->src[i].fixed_hw_reg.negate)
3218 fprintf(file, "-");
3219 if (inst->src[i].fixed_hw_reg.abs)
3220 fprintf(file, "|");
3221 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3222 switch (inst->src[i].fixed_hw_reg.nr) {
3223 case BRW_ARF_NULL:
3224 fprintf(file, "null");
3225 break;
3226 case BRW_ARF_ADDRESS:
3227 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3228 break;
3229 case BRW_ARF_ACCUMULATOR:
3230 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3231 break;
3232 case BRW_ARF_FLAG:
3233 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3234 inst->src[i].fixed_hw_reg.subnr);
3235 break;
3236 default:
3237 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3238 inst->src[i].fixed_hw_reg.subnr);
3239 break;
3240 }
3241 } else {
3242 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3243 }
3244 if (inst->src[i].fixed_hw_reg.subnr)
3245 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3246 if (inst->src[i].fixed_hw_reg.abs)
3247 fprintf(file, "|");
3248 break;
3249 default:
3250 fprintf(file, "???");
3251 break;
3252 }
3253 if (inst->src[i].abs)
3254 fprintf(file, "|");
3255
3256 if (inst->src[i].file != IMM) {
3257 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3258 }
3259
3260 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3261 fprintf(file, ", ");
3262 }
3263
3264 fprintf(file, " ");
3265
3266 if (dispatch_width == 16 && inst->exec_size == 8) {
3267 if (inst->force_sechalf)
3268 fprintf(file, "2ndhalf ");
3269 else
3270 fprintf(file, "1sthalf ");
3271 }
3272
3273 fprintf(file, "\n");
3274 }
3275
3276 /**
3277 * Possibly returns an instruction that set up @param reg.
3278 *
3279 * Sometimes we want to take the result of some expression/variable
3280 * dereference tree and rewrite the instruction generating the result
3281 * of the tree. When processing the tree, we know that the
3282 * instructions generated are all writing temporaries that are dead
3283 * outside of this tree. So, if we have some instructions that write
3284 * a temporary, we're free to point that temp write somewhere else.
3285 *
3286 * Note that this doesn't guarantee that the returned instruction wrote
3287 * only reg -- it might be the size=4 destination of a texture instruction.
3288 */
3289 fs_inst *
3290 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3291 fs_inst *end,
3292 const fs_reg &reg)
3293 {
3294 if (end == start ||
3295 end->is_partial_write() ||
3296 reg.reladdr ||
3297 !reg.equals(end->dst)) {
3298 return NULL;
3299 } else {
3300 return end;
3301 }
3302 }
3303
3304 void
3305 fs_visitor::setup_payload_gen6()
3306 {
3307 bool uses_depth =
3308 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3309 unsigned barycentric_interp_modes =
3310 (stage == MESA_SHADER_FRAGMENT) ?
3311 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3312
3313 assert(brw->gen >= 6);
3314
3315 /* R0-1: masks, pixel X/Y coordinates. */
3316 payload.num_regs = 2;
3317 /* R2: only for 32-pixel dispatch. */
3318
3319 /* R3-26: barycentric interpolation coordinates. These appear in the
3320 * same order that they appear in the brw_wm_barycentric_interp_mode
3321 * enum. Each set of coordinates occupies 2 registers if dispatch width
3322 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3323 * appear if they were enabled using the "Barycentric Interpolation
3324 * Mode" bits in WM_STATE.
3325 */
3326 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3327 if (barycentric_interp_modes & (1 << i)) {
3328 payload.barycentric_coord_reg[i] = payload.num_regs;
3329 payload.num_regs += 2;
3330 if (dispatch_width == 16) {
3331 payload.num_regs += 2;
3332 }
3333 }
3334 }
3335
3336 /* R27: interpolated depth if uses source depth */
3337 if (uses_depth) {
3338 payload.source_depth_reg = payload.num_regs;
3339 payload.num_regs++;
3340 if (dispatch_width == 16) {
3341 /* R28: interpolated depth if not SIMD8. */
3342 payload.num_regs++;
3343 }
3344 }
3345 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3346 if (uses_depth) {
3347 payload.source_w_reg = payload.num_regs;
3348 payload.num_regs++;
3349 if (dispatch_width == 16) {
3350 /* R30: interpolated W if not SIMD8. */
3351 payload.num_regs++;
3352 }
3353 }
3354
3355 if (stage == MESA_SHADER_FRAGMENT) {
3356 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3357 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3358 prog_data->uses_pos_offset = key->compute_pos_offset;
3359 /* R31: MSAA position offsets. */
3360 if (prog_data->uses_pos_offset) {
3361 payload.sample_pos_reg = payload.num_regs;
3362 payload.num_regs++;
3363 }
3364 }
3365
3366 /* R32: MSAA input coverage mask */
3367 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3368 assert(brw->gen >= 7);
3369 payload.sample_mask_in_reg = payload.num_regs;
3370 payload.num_regs++;
3371 if (dispatch_width == 16) {
3372 /* R33: input coverage mask if not SIMD8. */
3373 payload.num_regs++;
3374 }
3375 }
3376
3377 /* R34-: bary for 32-pixel. */
3378 /* R58-59: interp W for 32-pixel. */
3379
3380 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3381 source_depth_to_render_target = true;
3382 }
3383 }
3384
3385 void
3386 fs_visitor::assign_binding_table_offsets()
3387 {
3388 assert(stage == MESA_SHADER_FRAGMENT);
3389 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3390 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3391 uint32_t next_binding_table_offset = 0;
3392
3393 /* If there are no color regions, we still perform an FB write to a null
3394 * renderbuffer, which we place at surface index 0.
3395 */
3396 prog_data->binding_table.render_target_start = next_binding_table_offset;
3397 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3398
3399 assign_common_binding_table_offsets(next_binding_table_offset);
3400 }
3401
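/**
 * Compute regs_live_at_ip[]: for each instruction IP, the total size in GRFs
 * of all virtual GRFs whose live ranges cover that IP.  Used by
 * dump_instructions() to annotate the listing with register pressure.
 */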
3402 void
3403 fs_visitor::calculate_register_pressure()
3404 {
3405 invalidate_live_intervals();
3406 calculate_live_intervals();
3407
3408 unsigned num_instructions = 0;
3409 foreach_block(block, cfg)
3410 num_instructions += block->instructions.length();
3411
3412 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3413
3414 for (int reg = 0; reg < virtual_grf_count; reg++) {
3415 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3416 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3417 }
3418 }
3419
3420 /**
3421 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3422 *
3423 * The needs_unlit_centroid_workaround ends up producing one of these per
3424 * channel of centroid input, so it's good to clean them up.
3425 *
3426 * An assumption here is that nothing ever modifies the dispatched pixels
3427 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3428 * dictates that anyway.
3429 */
3430 void
3431 fs_visitor::opt_drop_redundant_mov_to_flags()
3432 {
3433 bool flag_mov_found[2] = {false};
3434
3435 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3436 if (inst->is_control_flow()) {
3437 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3438 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3439 if (!flag_mov_found[inst->flag_subreg])
3440 flag_mov_found[inst->flag_subreg] = true;
3441 else
3442 inst->remove(block);
3443 } else if (inst->writes_flag()) {
3444 flag_mov_found[inst->flag_subreg] = false;
3445 }
3446 }
3447 }
3448
3449 void
3450 fs_visitor::optimize()
3451 {
3452 calculate_cfg();
3453
3454 split_virtual_grfs();
3455
3456 move_uniform_array_access_to_pull_constants();
3457 assign_constant_locations();
3458 demote_pull_constants();
3459
3460 opt_drop_redundant_mov_to_flags();
3461
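/* Helper for the optimization loop below: run one pass, and when
 * INTEL_DEBUG=optimizer is set, dump the instruction list to a per-pass
 * file whenever that pass made progress.
 */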
3462 #define OPT(pass, args...) do { \
3463 pass_num++; \
3464 bool this_progress = pass(args); \
3465 \
3466 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3467 char filename[64]; \
3468 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3469 dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3470 \
3471 backend_visitor::dump_instructions(filename); \
3472 } \
3473 \
3474 progress = progress || this_progress; \
3475 } while (false)
3476
3477 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3478 char filename[64];
3479 snprintf(filename, 64, "fs%d-%04d-00-start",
3480 dispatch_width, shader_prog ? shader_prog->Name : 0);
3481
3482 backend_visitor::dump_instructions(filename);
3483 }
3484
3485 bool progress;
3486 int iteration = 0;
3487 do {
3488 progress = false;
3489 iteration++;
3490 int pass_num = 0;
3491
3492 OPT(remove_duplicate_mrf_writes);
3493
3494 OPT(opt_algebraic);
3495 OPT(opt_cse);
3496 OPT(opt_copy_propagate);
3497 OPT(opt_peephole_predicated_break);
3498 OPT(dead_code_eliminate);
3499 OPT(opt_peephole_sel);
3500 OPT(dead_control_flow_eliminate, this);
3501 OPT(opt_register_renaming);
3502 OPT(opt_saturate_propagation);
3503 OPT(register_coalesce);
3504 OPT(compute_to_mrf);
3505
3506 OPT(compact_virtual_grfs);
3507 } while (progress);
3508
3509 if (lower_load_payload()) {
3510 split_virtual_grfs();
3511 register_coalesce();
3512 compute_to_mrf();
3513 dead_code_eliminate();
3514 }
3515
3516 lower_uniform_pull_constant_loads();
3517 }
3518
3519 void
3520 fs_visitor::allocate_registers()
3521 {
3522 bool allocated_without_spills;
3523
3524 static enum instruction_scheduler_mode pre_modes[] = {
3525 SCHEDULE_PRE,
3526 SCHEDULE_PRE_NON_LIFO,
3527 SCHEDULE_PRE_LIFO,
3528 };
3529
3530 /* Try each scheduling heuristic to see if it can successfully register
3531 * allocate without spilling. They should be ordered by decreasing
3532 * performance but increasing likelihood of allocating.
3533 */
3534 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3535 schedule_instructions(pre_modes[i]);
3536
3537 if (0) {
3538 assign_regs_trivial();
3539 allocated_without_spills = true;
3540 } else {
3541 allocated_without_spills = assign_regs(false);
3542 }
3543 if (allocated_without_spills)
3544 break;
3545 }
3546
3547 if (!allocated_without_spills) {
3548 /* We assume that any spilling is worse than just dropping back to
3549 * SIMD8. There's probably actually some intermediate point where
3550 * SIMD16 with a couple of spills is still better.
3551 */
3552 if (dispatch_width == 16) {
3553 fail("Failure to register allocate. Reduce number of "
3554 "live scalar values to avoid this.");
3555 } else {
3556 perf_debug("Fragment shader triggered register spilling. "
3557 "Try reducing the number of live scalar values to "
3558 "improve performance.\n");
3559 }
3560
3561 /* Since we're out of heuristics, just go spill registers until we
3562 * get an allocation.
3563 */
3564 while (!assign_regs(true)) {
3565 if (failed)
3566 break;
3567 }
3568 }
3569
3570 /* This must come after all optimization and register allocation, since
3571 * it inserts dead code that happens to have side effects, and it does
3572 * so based on the actual physical registers in use.
3573 */
3574 insert_gen4_send_dependency_workarounds();
3575
3576 if (failed)
3577 return;
3578
3579 if (!allocated_without_spills)
3580 schedule_instructions(SCHEDULE_POST);
3581
3582 if (last_scratch > 0)
3583 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3584 }
3585
3586 bool
3587 fs_visitor::run()
3588 {
3589 sanity_param_count = prog->Parameters->NumParameters;
3590
3591 assign_binding_table_offsets();
3592
3593 if (brw->gen >= 6)
3594 setup_payload_gen6();
3595 else
3596 setup_payload_gen4();
3597
3598 if (0) {
3599 emit_dummy_fs();
3600 } else if (brw->use_rep_send && dispatch_width == 16) {
3601 emit_repclear_shader();
3602 } else {
3603 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3604 emit_shader_time_begin();
3605
3606 calculate_urb_setup();
3607 if (prog->InputsRead > 0) {
3608 if (brw->gen < 6)
3609 emit_interpolation_setup_gen4();
3610 else
3611 emit_interpolation_setup_gen6();
3612 }
3613
3614 /* We handle discards by keeping track of the still-live pixels in f0.1.
3615 * Initialize it with the dispatched pixels.
3616 */
3617 bool uses_kill =
3618 (stage == MESA_SHADER_FRAGMENT) &&
3619 ((brw_wm_prog_data*) this->prog_data)->uses_kill;
3620 bool alpha_test_func =
3621 (stage == MESA_SHADER_FRAGMENT) &&
3622 ((brw_wm_prog_key*) this->key)->alpha_test_func;
3623 if (uses_kill) {
3624 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3625 discard_init->flag_subreg = 1;
3626 }
3627
3628 /* Generate FS IR for main(). (the visitor only descends into
3629 * functions called "main").
3630 */
3631 if (shader) {
3632 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3633 base_ir = ir;
3634 this->result = reg_undef;
3635 ir->accept(this);
3636 }
3637 } else {
3638 emit_fragment_program_code();
3639 }
3640 base_ir = NULL;
3641 if (failed)
3642 return false;
3643
3644 emit(FS_OPCODE_PLACEHOLDER_HALT);
3645
3646 if (alpha_test_func)
3647 emit_alpha_test();
3648
3649 emit_fb_writes();
3650
3651 optimize();
3652
3653 assign_curb_setup();
3654 assign_urb_setup();
3655
3656 allocate_registers();
3657
3658 if (failed)
3659 return false;
3660 }
3661
3662 if (stage == MESA_SHADER_FRAGMENT) {
3663 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3664 if (dispatch_width == 8)
3665 prog_data->reg_blocks = brw_register_blocks(grf_used);
3666 else
3667 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3668 }
3669
3670 /* If any state parameters were appended, then ParameterValues could have
3671 * been realloced, in which case the driver uniform storage set up by
3672 * _mesa_associate_uniform_storage() would point to freed memory. Make
3673 * sure that didn't happen.
3674 */
3675 assert(sanity_param_count == prog->Parameters->NumParameters);
3676
3677 return !failed;
3678 }
3679
3680 const unsigned *
3681 brw_wm_fs_emit(struct brw_context *brw,
3682 void *mem_ctx,
3683 const struct brw_wm_prog_key *key,
3684 struct brw_wm_prog_data *prog_data,
3685 struct gl_fragment_program *fp,
3686 struct gl_shader_program *prog,
3687 unsigned *final_assembly_size)
3688 {
3689 bool start_busy = false;
3690 double start_time = 0;
3691
3692 if (unlikely(brw->perf_debug)) {
3693 start_busy = (brw->batch.last_bo &&
3694 drm_intel_bo_busy(brw->batch.last_bo));
3695 start_time = get_time();
3696 }
3697
3698 struct brw_shader *shader = NULL;
3699 if (prog)
3700 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3701
3702 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3703 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3704
3705 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3706 */
3707 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3708 if (!v.run()) {
3709 if (prog) {
3710 prog->LinkStatus = false;
3711 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3712 }
3713
3714 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3715 v.fail_msg);
3716
3717 return NULL;
3718 }
3719
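   /* In addition to the SIMD8 program above, try a SIMD16 variant whenever
    * the hardware and debug flags allow it and the SIMD8 visitor didn't
    * flag SIMD16 as unsupported; if the SIMD16 compile fails, we simply
    * keep the SIMD8 program.
    */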
3720 cfg_t *simd16_cfg = NULL;
3721 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3722 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3723 brw->use_rep_send)) {
3724 if (!v.simd16_unsupported) {
3725 /* Try a SIMD16 compile */
3726 v2.import_uniforms(&v);
3727 if (!v2.run()) {
3728 perf_debug("SIMD16 shader failed to compile, falling back to "
3729 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3730 } else {
3731 simd16_cfg = v2.cfg;
3732 }
3733 } else {
3734 perf_debug("SIMD16 shader unsupported, falling back to "
3735 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3736 }
3737 }
3738
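   /* DEBUG_NO8 or the per-context no_simd8 workaround drops the SIMD8
    * program, but only if a SIMD16 program exists to take its place;
    * prog_data->no_8 records that decision for later state setup.
    */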
3739 cfg_t *simd8_cfg;
3740 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3741 if (no_simd8 && simd16_cfg) {
3742 simd8_cfg = NULL;
3743 prog_data->no_8 = true;
3744 } else {
3745 simd8_cfg = v.cfg;
3746 prog_data->no_8 = false;
3747 }
3748
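   /* Both variants are generated into one program store; prog_offset_16
    * records where the SIMD16 code begins so that state setup can point
    * the hardware at the right kernel.
    */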
3749 fs_generator g(brw, mem_ctx, key, prog_data, prog, fp,
3750 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3751 if (simd8_cfg)
3752 g.generate_code(simd8_cfg, 8);
3753 if (simd16_cfg)
3754 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
3755
3756 if (unlikely(brw->perf_debug) && shader) {
3757 if (shader->compiled_once)
3758 brw_wm_debug_recompile(brw, prog, key);
3759 shader->compiled_once = true;
3760
3761 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3762 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3763 (get_time() - start_time) * 1000);
3764 }
3765 }
3766
3767 return g.get_assembly(final_assembly_size);
3768 }
3769
3770 extern "C" bool
3771 brw_fs_precompile(struct gl_context *ctx,
3772 struct gl_shader_program *shader_prog,
3773 struct gl_program *prog)
3774 {
3775 struct brw_context *brw = brw_context(ctx);
3776 struct brw_wm_prog_key key;
3777
3778 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
3779 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3780 bool program_uses_dfdy = fp->UsesDFdy;
3781
3782 memset(&key, 0, sizeof(key));
3783
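   /* Gen4-5 bake depth/stencil ("IZ") state into the WM program key, so the
    * precompile has to guess the most common configuration.
    */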
3784 if (brw->gen < 6) {
3785 if (fp->UsesKill)
3786 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3787
3788 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3789 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3790
3791 /* Just assume depth testing. */
3792 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3793 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3794 }
3795
3796 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3797 BRW_FS_VARYING_INPUT_MASK) > 16)
3798 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3799
3800 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3801 for (unsigned i = 0; i < sampler_count; i++) {
3802 if (fp->Base.ShadowSamplers & (1 << i)) {
3803 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3804 key.tex.swizzles[i] =
3805 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3806 } else {
3807 /* Color sampler: assume no swizzling. */
3808 key.tex.swizzles[i] = SWIZZLE_XYZW;
3809 }
3810 }
3811
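   /* gl_FragCoord and dFdy results depend on the drawable's height and Y
    * orientation, so guess those key fields from the currently bound draw
    * buffer.
    */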
3812 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3813 key.drawable_height = ctx->DrawBuffer->Height;
3814 }
3815
3816 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3817 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3818 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3819
3820 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3821 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3822 key.nr_color_regions > 1;
3823 }
3824
3825 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3826 * quality of the derivatives is likely to be determined by the driconf
3827 * option.
3828 */
3829 key.high_quality_derivatives = brw->disable_derivative_optimization;
3830
3831 key.program_string_id = bfp->id;
3832
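   /* Compile through the normal path, then restore the previous WM program
    * state so that precompiling doesn't disturb whatever is currently bound.
    */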
3833 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3834 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3835
3836 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
3837
3838 brw->wm.base.prog_offset = old_prog_offset;
3839 brw->wm.prog_data = old_prog_data;
3840
3841 return success;
3842 }