i965/fs: Optimize sqrt+inv into rsq.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_cfg.h"
50 #include "brw_dead_control_flow.h"
51 #include "main/uniforms.h"
52 #include "brw_fs_live_variables.h"
53 #include "glsl/glsl_types.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 fs_reg *src, int sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->opcode = opcode;
62 this->dst = dst;
63 this->src = src;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (int i = 0; i < sources; ++i) {
79 if (src[i].file != GRF)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (int i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 assert(this->src[i].width > 0);
101 if (this->src[i].width == 1) {
102 this->src[i].effective_width = this->exec_size;
103 } else {
104 this->src[i].effective_width = this->src[i].width;
105 }
106 break;
107 case IMM:
108 case UNIFORM:
109 this->src[i].effective_width = this->exec_size;
110 break;
111 default:
112 unreachable("Invalid source register file");
113 }
114 }
115 this->dst.effective_width = this->exec_size;
116
117 this->conditional_mod = BRW_CONDITIONAL_NONE;
118
119 /* This will be the case for almost all instructions. */
120 switch (dst.file) {
121 case GRF:
122 case HW_REG:
123 case MRF:
124 this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
125 break;
126 case BAD_FILE:
127 this->regs_written = 0;
128 break;
129 case IMM:
130 case UNIFORM:
131 unreachable("Invalid destination register file");
132 default:
133 unreachable("Invalid register file");
134 }
135
136 this->writes_accumulator = false;
137 }
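/* Editor's note (illustrative, not part of the upstream file): a worked
 * example of the regs_written computation above. For a SIMD16 float
 * destination with stride 1:
 *
 *    regs_written = (16 * 1 * 4 + 31) / 32 = 2   // two full 32-byte GRFs
 *
 * while a SIMD8 float destination covers exactly one 32-byte GRF.
 */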
138
139 fs_inst::fs_inst()
140 {
141 fs_reg *src = ralloc_array(this, fs_reg, 3);
142 init(BRW_OPCODE_NOP, 8, dst, src, 0);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
146 {
147 fs_reg *src = ralloc_array(this, fs_reg, 3);
148 init(opcode, exec_size, reg_undef, src, 0);
149 }
150
151 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
152 {
153 fs_reg *src = ralloc_array(this, fs_reg, 3);
154 init(opcode, 0, dst, src, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 fs_reg *src = ralloc_array(this, fs_reg, 3);
161 src[0] = src0;
162 init(opcode, exec_size, dst, src, 1);
163 }
164
165 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
166 {
167 fs_reg *src = ralloc_array(this, fs_reg, 3);
168 src[0] = src0;
169 init(opcode, 0, dst, src, 1);
170 }
171
172 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
173 const fs_reg &src0, const fs_reg &src1)
174 {
175 fs_reg *src = ralloc_array(this, fs_reg, 3);
176 src[0] = src0;
177 src[1] = src1;
178 init(opcode, exec_size, dst, src, 2);
179 }
180
181 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
182 const fs_reg &src1)
183 {
184 fs_reg *src = ralloc_array(this, fs_reg, 3);
185 src[0] = src0;
186 src[1] = src1;
187 init(opcode, 0, dst, src, 2);
188 }
189
190 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
191 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
192 {
193 fs_reg *src = ralloc_array(this, fs_reg, 3);
194 src[0] = src0;
195 src[1] = src1;
196 src[2] = src2;
197 init(opcode, exec_size, dst, src, 3);
198 }
199
200 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
201 const fs_reg &src1, const fs_reg &src2)
202 {
203 fs_reg *src = ralloc_array(this, fs_reg, 3);
204 src[0] = src0;
205 src[1] = src1;
206 src[2] = src2;
207 init(opcode, 0, dst, src, 3);
208 }
209
210 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
211 {
212 init(opcode, 0, dst, src, sources);
213 }
214
215 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
216 fs_reg src[], int sources)
217 {
218 init(opcode, exec_width, dst, src, sources);
219 }
220
221 fs_inst::fs_inst(const fs_inst &that)
222 {
223 memcpy(this, &that, sizeof(that));
224
225 this->src = ralloc_array(this, fs_reg, that.sources);
226
227 for (int i = 0; i < that.sources; i++)
228 this->src[i] = that.src[i];
229 }
230
231 void
232 fs_inst::resize_sources(uint8_t num_sources)
233 {
234 if (this->sources != num_sources) {
235 this->src = reralloc(this, this->src, fs_reg, num_sources);
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 * gen5 does the comparison on the execution type (resolved source types),
341 * so dst type doesn't matter. gen6 does comparison and then uses the
342 * result as if it was the dst type with no conversion, which happens to
343 * mostly work out for float-interpreted-as-int since our comparisons are
344 * for >0, =0, <0.
345 */
346 if (brw->gen == 4) {
347 dst.type = src0.type;
348 if (dst.file == HW_REG)
349 dst.fixed_hw_reg.type = dst.type;
350 }
351
352 resolve_ud_negate(&src0);
353 resolve_ud_negate(&src1);
354
355 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
356 inst->conditional_mod = condition;
357
358 return inst;
359 }
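/* Editor's note (illustrative sketch, not upstream code): the gen4 dst.type
 * override above matters because gen4 converts to the destination type
 * before comparing. For example, a float CMP into a D-typed null register
 * would truncate 0.4f and 0.6f both to 0 before the compare, so a
 * less-than test between them would incorrectly fail.
 */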
360
361 fs_inst *
362 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
363 {
364 uint8_t exec_size = dst.width;
365 for (int i = 0; i < sources; ++i) {
366 assert(src[i].width % dst.width == 0);
367 if (src[i].width > exec_size)
368 exec_size = src[i].width;
369 }
370
371 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
372 dst, src, sources);
373 inst->regs_written = 0;
374 for (int i = 0; i < sources; ++i) {
375 /* The LOAD_PAYLOAD instruction only really makes sense if we are
376 * dealing with whole registers. If this ever changes, we can deal
377 * with it later.
378 */
379 int size = src[i].effective_width * type_sz(src[i].type);
380 assert(size % 32 == 0);
381 inst->regs_written += (size + 31) / 32;
382 }
383
384 return inst;
385 }
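/* Editor's note (illustrative, not upstream code): each LOAD_PAYLOAD source
 * above contributes whole registers. E.g. for two SIMD8 float sources:
 *
 *    size = 8 * 4 = 32 bytes each  ->  regs_written = 1 + 1 = 2
 */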
386
387 exec_list
388 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
389 const fs_reg &surf_index,
390 const fs_reg &varying_offset,
391 uint32_t const_offset)
392 {
393 exec_list instructions;
394 fs_inst *inst;
395
396 /* We have our constant surface use a pitch of 4 bytes, so our index can
397 * be any component of a vector, and then we load 4 contiguous
398 * components starting from that.
399 *
400 * We break down the const_offset to a portion added to the variable
401 * offset and a portion done using reg_offset, which means that if you
402 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
403 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
404 * CSE can later notice that those loads are all the same and eliminate
405 * the redundant ones.
406 */
407 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
408 instructions.push_tail(ADD(vec4_offset,
409 varying_offset, fs_reg(const_offset & ~3)));
410
411 int scale = 1;
412 if (brw->gen == 4 && dst.width == 8) {
413 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
414 * u, v, r) as parameters, or we can just use the SIMD16 message
415 * consisting of (header, u). We choose the second, at the cost of a
416 * longer return length.
417 */
418 scale = 2;
419 }
420
421 enum opcode op;
422 if (brw->gen >= 7)
423 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
424 else
425 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
426
427 assert(dst.width % 8 == 0);
428 int regs_written = 4 * (dst.width / 8) * scale;
429 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
430 dst.type, dst.width);
431 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
432 inst->regs_written = regs_written;
433 instructions.push_tail(inst);
434
435 if (brw->gen < 7) {
436 inst->base_mrf = 13;
437 inst->header_present = true;
438 if (brw->gen == 4)
439 inst->mlen = 3;
440 else
441 inst->mlen = 1 + dispatch_width / 8;
442 }
443
444 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
445 instructions.push_tail(MOV(dst, result));
446
447 return instructions;
448 }
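/* Editor's note (worked example, not upstream code): for const_offset = 6
 * the split above becomes:
 *
 *    vec4_offset = varying_offset + (6 & ~3)             // varying_offset + 4
 *    result      = offset(vec4_result, (6 & 3) * scale)  // component 2
 *
 * so the whole aligned vec4 is fetched and the final MOV picks out the
 * requested component, letting CSE merge loads that share the same vec4.
 */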
449
450 /**
451 * A helper for MOV generation for fixing up broken hardware SEND dependency
452 * handling.
453 */
454 fs_inst *
455 fs_visitor::DEP_RESOLVE_MOV(int grf)
456 {
457 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
458
459 inst->ir = NULL;
460 inst->annotation = "send dependency resolve";
461
462 /* The caller always wants this uncompressed, to emit the minimal extra
463 * dependencies and to avoid having to deal with aligning its regs to 2.
464 */
465 inst->exec_size = 8;
466
467 return inst;
468 }
469
470 bool
471 fs_inst::equals(fs_inst *inst) const
472 {
473 return (opcode == inst->opcode &&
474 dst.equals(inst->dst) &&
475 src[0].equals(inst->src[0]) &&
476 src[1].equals(inst->src[1]) &&
477 src[2].equals(inst->src[2]) &&
478 saturate == inst->saturate &&
479 predicate == inst->predicate &&
480 conditional_mod == inst->conditional_mod &&
481 mlen == inst->mlen &&
482 base_mrf == inst->base_mrf &&
483 target == inst->target &&
484 eot == inst->eot &&
485 header_present == inst->header_present &&
486 shadow_compare == inst->shadow_compare &&
487 exec_size == inst->exec_size &&
488 offset == inst->offset);
489 }
490
491 bool
492 fs_inst::overwrites_reg(const fs_reg &reg) const
493 {
494 return (reg.file == dst.file &&
495 reg.reg == dst.reg &&
496 reg.reg_offset >= dst.reg_offset &&
497 reg.reg_offset < dst.reg_offset + regs_written);
498 }
499
500 bool
501 fs_inst::is_send_from_grf() const
502 {
503 switch (opcode) {
504 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
505 case SHADER_OPCODE_SHADER_TIME_ADD:
506 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
507 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
508 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
509 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
510 case SHADER_OPCODE_UNTYPED_ATOMIC:
511 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
512 return true;
513 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
514 return src[1].file == GRF;
515 case FS_OPCODE_FB_WRITE:
516 return src[0].file == GRF;
517 default:
518 if (is_tex())
519 return src[0].file == GRF;
520
521 return false;
522 }
523 }
524
525 bool
526 fs_inst::can_do_source_mods(struct brw_context *brw)
527 {
528 if (brw->gen == 6 && is_math())
529 return false;
530
531 if (is_send_from_grf())
532 return false;
533
534 if (!backend_instruction::can_do_source_mods())
535 return false;
536
537 return true;
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Fixed brw_reg. */
585 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
586 {
587 init();
588 this->file = HW_REG;
589 this->fixed_hw_reg = fixed_hw_reg;
590 this->type = fixed_hw_reg.type;
591 this->width = 1 << fixed_hw_reg.width;
592 }
593
594 bool
595 fs_reg::equals(const fs_reg &r) const
596 {
597 return (file == r.file &&
598 reg == r.reg &&
599 reg_offset == r.reg_offset &&
600 subreg_offset == r.subreg_offset &&
601 type == r.type &&
602 negate == r.negate &&
603 abs == r.abs &&
604 !reladdr && !r.reladdr &&
605 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
606 width == r.width &&
607 stride == r.stride);
608 }
609
610 fs_reg &
611 fs_reg::apply_stride(unsigned stride)
612 {
613 assert((this->stride * stride) <= 4 &&
614 (is_power_of_two(stride) || stride == 0) &&
615 file != HW_REG && file != IMM);
616 this->stride *= stride;
617 return *this;
618 }
619
620 fs_reg &
621 fs_reg::set_smear(unsigned subreg)
622 {
623 assert(file != HW_REG && file != IMM);
624 subreg_offset = subreg * type_sz(type);
625 stride = 0;
626 return *this;
627 }
628
629 bool
630 fs_reg::is_contiguous() const
631 {
632 return stride == 1;
633 }
634
635 bool
636 fs_reg::is_valid_3src() const
637 {
638 return file == GRF || file == UNIFORM;
639 }
640
641 int
642 fs_visitor::type_size(const struct glsl_type *type)
643 {
644 unsigned int size, i;
645
646 switch (type->base_type) {
647 case GLSL_TYPE_UINT:
648 case GLSL_TYPE_INT:
649 case GLSL_TYPE_FLOAT:
650 case GLSL_TYPE_BOOL:
651 return type->components();
652 case GLSL_TYPE_ARRAY:
653 return type_size(type->fields.array) * type->length;
654 case GLSL_TYPE_STRUCT:
655 size = 0;
656 for (i = 0; i < type->length; i++) {
657 size += type_size(type->fields.structure[i].type);
658 }
659 return size;
660 case GLSL_TYPE_SAMPLER:
661 /* Samplers take up no register space, since they're baked in at
662 * link time.
663 */
664 return 0;
665 case GLSL_TYPE_ATOMIC_UINT:
666 return 0;
667 case GLSL_TYPE_IMAGE:
668 case GLSL_TYPE_VOID:
669 case GLSL_TYPE_ERROR:
670 case GLSL_TYPE_INTERFACE:
671 unreachable("not reached");
672 }
673
674 return 0;
675 }
676
677 fs_reg
678 fs_visitor::get_timestamp()
679 {
680 assert(brw->gen >= 7);
681
682 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
683 BRW_ARF_TIMESTAMP,
684 0),
685 BRW_REGISTER_TYPE_UD));
686
687 fs_reg dst = fs_reg(this, glsl_type::uint_type);
688
689 fs_inst *mov = emit(MOV(dst, ts));
690 /* We want to read the 3 fields we care about (mostly field 0, but also
691 * field 2) even if they're not enabled in the dispatch.
692 */
693 mov->force_writemask_all = true;
694 mov->exec_size = 8;
695
696 /* The caller wants the low 32 bits of the timestamp. Since it's running
697 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
698 * which is plenty of time for our purposes. It is identical across the
699 * EUs, but since it's tracking GPU core speed it will increment at a
700 * varying rate as render P-states change.
701 *
702 * The caller could also check if render P-states have changed (or anything
703 * else that might disrupt timing) by setting smear to 2 and checking if
704 * that field is != 0.
705 */
706 dst.set_smear(0);
707
708 return dst;
709 }
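/* Editor's note (back-of-the-envelope check, not upstream code): the
 * rollover estimate above follows from the counter width and clock rate:
 *
 *    2^32 cycles / ~1.2e9 cycles per second ~= 3.6 seconds
 */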
710
711 void
712 fs_visitor::emit_shader_time_begin()
713 {
714 current_annotation = "shader time start";
715 shader_start_time = get_timestamp();
716 }
717
718 void
719 fs_visitor::emit_shader_time_end()
720 {
721 current_annotation = "shader time end";
722
723 enum shader_time_shader_type type, written_type, reset_type;
724 if (dispatch_width == 8) {
725 type = ST_FS8;
726 written_type = ST_FS8_WRITTEN;
727 reset_type = ST_FS8_RESET;
728 } else {
729 assert(dispatch_width == 16);
730 type = ST_FS16;
731 written_type = ST_FS16_WRITTEN;
732 reset_type = ST_FS16_RESET;
733 }
734
735 fs_reg shader_end_time = get_timestamp();
736
737 /* Check that there weren't any timestamp reset events (assuming these
738 * were the only two timestamp reads that happened).
739 */
740 fs_reg reset = shader_end_time;
741 reset.set_smear(2);
742 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
743 test->conditional_mod = BRW_CONDITIONAL_Z;
744 emit(IF(BRW_PREDICATE_NORMAL));
745
746 push_force_uncompressed();
747 fs_reg start = shader_start_time;
748 start.negate = true;
749 fs_reg diff = fs_reg(this, glsl_type::uint_type);
750 emit(ADD(diff, start, shader_end_time));
751
752 /* If there were no instructions between the two timestamp gets, the diff
753 * is 2 cycles. Remove that overhead, so I can forget about that when
754 * trying to determine the time taken for single instructions.
755 */
756 emit(ADD(diff, diff, fs_reg(-2u)));
757
758 emit_shader_time_write(type, diff);
759 emit_shader_time_write(written_type, fs_reg(1u));
760 emit(BRW_OPCODE_ELSE);
761 emit_shader_time_write(reset_type, fs_reg(1u));
762 emit(BRW_OPCODE_ENDIF);
763
764 pop_force_uncompressed();
765 }
766
767 void
768 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
769 fs_reg value)
770 {
771 int shader_time_index =
772 brw_get_shader_time_index(brw, shader_prog, prog, type);
773 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
774
775 fs_reg payload;
776 if (dispatch_width == 8)
777 payload = fs_reg(this, glsl_type::uvec2_type);
778 else
779 payload = fs_reg(this, glsl_type::uint_type);
780
781 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
782 fs_reg(), payload, offset, value));
783 }
784
785 void
786 fs_visitor::vfail(const char *format, va_list va)
787 {
788 char *msg;
789
790 if (failed)
791 return;
792
793 failed = true;
794
795 msg = ralloc_vasprintf(mem_ctx, format, va);
796 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
797
798 this->fail_msg = msg;
799
800 if (INTEL_DEBUG & DEBUG_WM) {
801 fprintf(stderr, "%s", msg);
802 }
803 }
804
805 void
806 fs_visitor::fail(const char *format, ...)
807 {
808 va_list va;
809
810 va_start(va, format);
811 vfail(format, va);
812 va_end(va);
813 }
814
815 /**
816 * Mark this program as impossible to compile in SIMD16 mode.
817 *
818 * During the SIMD8 compile (which happens first), we can detect and flag
819 * things that are unsupported in SIMD16 mode, so the compiler can skip
820 * the SIMD16 compile altogether.
821 *
822 * During a SIMD16 compile (if one happens anyway), this just calls fail().
823 */
824 void
825 fs_visitor::no16(const char *format, ...)
826 {
827 va_list va;
828
829 va_start(va, format);
830
831 if (dispatch_width == 16) {
832 vfail(format, va);
833 } else {
834 simd16_unsupported = true;
835
836 if (brw->perf_debug) {
837 if (no16_msg)
838 ralloc_vasprintf_append(&no16_msg, format, va);
839 else
840 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
841 }
842 }
843
844 va_end(va);
845 }
846
847 fs_inst *
848 fs_visitor::emit(enum opcode opcode)
849 {
850 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
851 }
852
853 fs_inst *
854 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
855 {
856 return emit(new(mem_ctx) fs_inst(opcode, dst));
857 }
858
859 fs_inst *
860 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
861 {
862 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
863 }
864
865 fs_inst *
866 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
867 const fs_reg &src1)
868 {
869 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
870 }
871
872 fs_inst *
873 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
874 const fs_reg &src1, const fs_reg &src2)
875 {
876 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
877 }
878
879 fs_inst *
880 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
881 fs_reg src[], int sources)
882 {
883 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
884 }
885
886 void
887 fs_visitor::push_force_uncompressed()
888 {
889 force_uncompressed_stack++;
890 }
891
892 void
893 fs_visitor::pop_force_uncompressed()
894 {
895 force_uncompressed_stack--;
896 assert(force_uncompressed_stack >= 0);
897 }
898
899 /**
900 * Returns true if the instruction has a flag that means it won't
901 * update an entire destination register.
902 *
903 * For example, dead code elimination and live variable analysis want to know
904 * when a write to a variable screens off any preceding values that were in
905 * it.
906 */
907 bool
908 fs_inst::is_partial_write() const
909 {
910 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
911 (this->dst.width * type_sz(this->dst.type)) < 32 ||
912 !this->dst.is_contiguous());
913 }
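/* Editor's note (illustrative, not upstream code): examples of partial
 * writes as defined above: a predicated (non-SEL) write leaves unselected
 * channels untouched; a SIMD8 write of a 16-bit type covers only
 * 8 * 2 = 16 bytes of the 32-byte GRF; and a strided destination skips
 * bytes between channels.
 */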
914
915 int
916 fs_inst::regs_read(fs_visitor *v, int arg) const
917 {
918 if (is_tex() && arg == 0 && src[0].file == GRF) {
919 return mlen;
920 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
921 return mlen;
922 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
923 return mlen;
924 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
925 return mlen;
926 }
927
928 switch (src[arg].file) {
929 case BAD_FILE:
930 case UNIFORM:
931 case IMM:
932 return 1;
933 case GRF:
934 case HW_REG:
935 if (src[arg].stride == 0) {
936 return 1;
937 } else {
938 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
939 return (size + 31) / 32;
940 }
941 case MRF:
942 unreachable("MRF registers are not allowed as sources");
943 default:
944 unreachable("Invalid register file");
945 }
946 }
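/* Editor's note (worked example, not upstream code): for a SIMD16 float
 * GRF source with stride 1, regs_read() above computes
 *
 *    (16 * 1 * 4 + 31) / 32 = 2 registers,
 *
 * while a stride-0 (smeared scalar) source always counts as 1.
 */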
947
948 bool
949 fs_inst::reads_flag() const
950 {
951 return predicate;
952 }
953
954 bool
955 fs_inst::writes_flag() const
956 {
957 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
958 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
959 }
960
961 /**
962 * Returns how many MRFs an FS opcode will write over.
963 *
964 * Note that this is not the 0 or 1 implied writes in an actual gen
965 * instruction -- the FS opcodes often generate MOVs in addition.
966 */
967 int
968 fs_visitor::implied_mrf_writes(fs_inst *inst)
969 {
970 if (inst->mlen == 0)
971 return 0;
972
973 if (inst->base_mrf == -1)
974 return 0;
975
976 switch (inst->opcode) {
977 case SHADER_OPCODE_RCP:
978 case SHADER_OPCODE_RSQ:
979 case SHADER_OPCODE_SQRT:
980 case SHADER_OPCODE_EXP2:
981 case SHADER_OPCODE_LOG2:
982 case SHADER_OPCODE_SIN:
983 case SHADER_OPCODE_COS:
984 return 1 * dispatch_width / 8;
985 case SHADER_OPCODE_POW:
986 case SHADER_OPCODE_INT_QUOTIENT:
987 case SHADER_OPCODE_INT_REMAINDER:
988 return 2 * dispatch_width / 8;
989 case SHADER_OPCODE_TEX:
990 case FS_OPCODE_TXB:
991 case SHADER_OPCODE_TXD:
992 case SHADER_OPCODE_TXF:
993 case SHADER_OPCODE_TXF_CMS:
994 case SHADER_OPCODE_TXF_MCS:
995 case SHADER_OPCODE_TG4:
996 case SHADER_OPCODE_TG4_OFFSET:
997 case SHADER_OPCODE_TXL:
998 case SHADER_OPCODE_TXS:
999 case SHADER_OPCODE_LOD:
1000 return 1;
1001 case FS_OPCODE_FB_WRITE:
1002 return 2;
1003 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1004 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1005 return 1;
1006 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1007 return inst->mlen;
1008 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1009 return 2;
1010 case SHADER_OPCODE_UNTYPED_ATOMIC:
1011 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1012 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1013 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1014 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1015 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1016 return 0;
1017 default:
1018 unreachable("not reached");
1019 }
1020 }
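/* Editor's note (illustrative, not upstream code): e.g. a SIMD16 SIN sends
 * one operand and so implies 1 * 16 / 8 = 2 MRF writes, while a SIMD16 POW
 * sends two operands and implies 2 * 16 / 8 = 4.
 */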
1021
1022 int
1023 fs_visitor::virtual_grf_alloc(int size)
1024 {
1025 if (virtual_grf_array_size <= virtual_grf_count) {
1026 if (virtual_grf_array_size == 0)
1027 virtual_grf_array_size = 16;
1028 else
1029 virtual_grf_array_size *= 2;
1030 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
1031 virtual_grf_array_size);
1032 }
1033 virtual_grf_sizes[virtual_grf_count] = size;
1034 return virtual_grf_count++;
1035 }
1036
1037 /** Fixed HW reg constructor. */
1038 fs_reg::fs_reg(enum register_file file, int reg)
1039 {
1040 init();
1041 this->file = file;
1042 this->reg = reg;
1043 this->type = BRW_REGISTER_TYPE_F;
1044
1045 switch (file) {
1046 case UNIFORM:
1047 this->width = 1;
1048 break;
1049 default:
1050 this->width = 8;
1051 }
1052 }
1053
1054 /** Fixed HW reg constructor. */
1055 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1056 {
1057 init();
1058 this->file = file;
1059 this->reg = reg;
1060 this->type = type;
1061
1062 switch (file) {
1063 case UNIFORM:
1064 this->width = 1;
1065 break;
1066 default:
1067 this->width = 8;
1068 }
1069 }
1070
1071 /** Fixed HW reg constructor. */
1072 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1073 uint8_t width)
1074 {
1075 init();
1076 this->file = file;
1077 this->reg = reg;
1078 this->type = type;
1079 this->width = width;
1080 }
1081
1082 /** Automatic reg constructor. */
1083 fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type)
1084 {
1085 init();
1086 int reg_width = v->dispatch_width / 8;
1087
1088 this->file = GRF;
1089 this->reg = v->virtual_grf_alloc(v->type_size(type) * reg_width);
1090 this->reg_offset = 0;
1091 this->type = brw_type_for_base_type(type);
1092 this->width = v->dispatch_width;
1093 assert(this->width == 8 || this->width == 16);
1094 }
1095
1096 fs_reg *
1097 fs_visitor::variable_storage(ir_variable *var)
1098 {
1099 return (fs_reg *)hash_table_find(this->variable_ht, var);
1100 }
1101
1102 void
1103 import_uniforms_callback(const void *key,
1104 void *data,
1105 void *closure)
1106 {
1107 struct hash_table *dst_ht = (struct hash_table *)closure;
1108 const fs_reg *reg = (const fs_reg *)data;
1109
1110 if (reg->file != UNIFORM)
1111 return;
1112
1113 hash_table_insert(dst_ht, data, key);
1114 }
1115
1116 /* For SIMD16, we need to follow the uniform setup from the SIMD8 dispatch.
1117 * This brings in those uniform definitions.
1118 */
1119 void
1120 fs_visitor::import_uniforms(fs_visitor *v)
1121 {
1122 hash_table_call_foreach(v->variable_ht,
1123 import_uniforms_callback,
1124 variable_ht);
1125 this->push_constant_loc = v->push_constant_loc;
1126 this->pull_constant_loc = v->pull_constant_loc;
1127 this->uniforms = v->uniforms;
1128 this->param_size = v->param_size;
1129 }
1130
1131 /* Our support for uniforms is piggy-backed on the struct
1132 * gl_fragment_program, because that's where the values actually
1133 * get stored, rather than in some global gl_shader_program uniform
1134 * store.
1135 */
1136 void
1137 fs_visitor::setup_uniform_values(ir_variable *ir)
1138 {
1139 int namelen = strlen(ir->name);
1140
1141 /* The data for our (non-builtin) uniforms is stored in a series of
1142 * gl_uniform_driver_storage structs for each subcomponent that
1143 * glGetUniformLocation() could name. We know it's been set up in the same
1144 * order we'd walk the type, so walk the list of storage and find anything
1145 * with our name, or the prefix of a component that starts with our name.
1146 */
1147 unsigned params_before = uniforms;
1148 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1149 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1150
1151 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1152 (storage->name[namelen] != 0 &&
1153 storage->name[namelen] != '.' &&
1154 storage->name[namelen] != '[')) {
1155 continue;
1156 }
1157
1158 unsigned slots = storage->type->component_slots();
1159 if (storage->array_elements)
1160 slots *= storage->array_elements;
1161
1162 for (unsigned i = 0; i < slots; i++) {
1163 stage_prog_data->param[uniforms++] = &storage->storage[i];
1164 }
1165 }
1166
1167 /* Make sure we actually initialized the right amount of stuff here. */
1168 assert(params_before + ir->type->component_slots() == uniforms);
1169 (void)params_before;
1170 }
1171
1172
1173 /* Our support for builtin uniforms is even scarier than non-builtin.
1174 * It sits on top of the PROG_STATE_VAR parameters that are
1175 * automatically updated from GL context state.
1176 */
1177 void
1178 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1179 {
1180 const ir_state_slot *const slots = ir->get_state_slots();
1181 assert(slots != NULL);
1182
1183 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1184 /* This state reference has already been setup by ir_to_mesa, but we'll
1185 * get the same index back here.
1186 */
1187 int index = _mesa_add_state_reference(this->prog->Parameters,
1188 (gl_state_index *)slots[i].tokens);
1189
1190 /* Add each of the unique swizzles of the element as a parameter.
1191 * This'll end up matching the expected layout of the
1192 * array/matrix/structure we're trying to fill in.
1193 */
1194 int last_swiz = -1;
1195 for (unsigned int j = 0; j < 4; j++) {
1196 int swiz = GET_SWZ(slots[i].swizzle, j);
1197 if (swiz == last_swiz)
1198 break;
1199 last_swiz = swiz;
1200
1201 stage_prog_data->param[uniforms++] =
1202 &prog->Parameters->ParameterValues[index][swiz];
1203 }
1204 }
1205 }
1206
1207 fs_reg *
1208 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1209 {
1210 assert(stage == MESA_SHADER_FRAGMENT);
1211 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1212 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1213 fs_reg wpos = *reg;
1214 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1215
1216 /* gl_FragCoord.x */
1217 if (ir->data.pixel_center_integer) {
1218 emit(MOV(wpos, this->pixel_x));
1219 } else {
1220 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1221 }
1222 wpos = offset(wpos, 1);
1223
1224 /* gl_FragCoord.y */
1225 if (!flip && ir->data.pixel_center_integer) {
1226 emit(MOV(wpos, this->pixel_y));
1227 } else {
1228 fs_reg pixel_y = this->pixel_y;
1229 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1230
1231 if (flip) {
1232 pixel_y.negate = true;
1233 offset += key->drawable_height - 1.0;
1234 }
1235
1236 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1237 }
1238 wpos = offset(wpos, 1);
1239
1240 /* gl_FragCoord.z */
1241 if (brw->gen >= 6) {
1242 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1243 } else {
1244 emit(FS_OPCODE_LINTERP, wpos,
1245 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1246 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1247 interp_reg(VARYING_SLOT_POS, 2));
1248 }
1249 wpos = offset(wpos, 1);
1250
1251 /* gl_FragCoord.w: Already set up in emit_interpolation */
1252 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1253
1254 return reg;
1255 }
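/* Editor's note (worked example, not upstream code): when flip is set above,
 * gl_FragCoord.y becomes -pixel_y + (drawable_height - 1 + c), where c is 0
 * for integer pixel centers and 0.5 otherwise. For a 600-pixel-tall window
 * drawable with integer centers, pixel_y = 0 maps to 599 and pixel_y = 599
 * maps to 0.
 */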
1256
1257 fs_inst *
1258 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1259 glsl_interp_qualifier interpolation_mode,
1260 bool is_centroid, bool is_sample)
1261 {
1262 brw_wm_barycentric_interp_mode barycoord_mode;
1263 if (brw->gen >= 6) {
1264 if (is_centroid) {
1265 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1266 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1267 else
1268 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1269 } else if (is_sample) {
1270 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1271 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1272 else
1273 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1274 } else {
1275 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1276 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1277 else
1278 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1279 }
1280 } else {
1281 /* On Ironlake and below, there is only one interpolation mode.
1282 * Centroid interpolation doesn't mean anything on this hardware --
1283 * there is no multisampling.
1284 */
1285 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1286 }
1287 return emit(FS_OPCODE_LINTERP, attr,
1288 this->delta_x[barycoord_mode],
1289 this->delta_y[barycoord_mode], interp);
1290 }
1291
1292 fs_reg *
1293 fs_visitor::emit_general_interpolation(ir_variable *ir)
1294 {
1295 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1296 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1297 fs_reg attr = *reg;
1298
1299 assert(stage == MESA_SHADER_FRAGMENT);
1300 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1301 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1302
1303 unsigned int array_elements;
1304 const glsl_type *type;
1305
1306 if (ir->type->is_array()) {
1307 array_elements = ir->type->length;
1308 if (array_elements == 0) {
1309 fail("dereferenced array '%s' has length 0\n", ir->name);
1310 }
1311 type = ir->type->fields.array;
1312 } else {
1313 array_elements = 1;
1314 type = ir->type;
1315 }
1316
1317 glsl_interp_qualifier interpolation_mode =
1318 ir->determine_interpolation_mode(key->flat_shade);
1319
1320 int location = ir->data.location;
1321 for (unsigned int i = 0; i < array_elements; i++) {
1322 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1323 if (prog_data->urb_setup[location] == -1) {
1324 /* If there's no incoming setup data for this slot, don't
1325 * emit interpolation for it.
1326 */
1327 attr = offset(attr, type->vector_elements);
1328 location++;
1329 continue;
1330 }
1331
1332 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1333 /* Constant interpolation (flat shading) case. The SF has
1334 * handed us defined values in only the constant offset
1335 * field of the setup reg.
1336 */
1337 for (unsigned int k = 0; k < type->vector_elements; k++) {
1338 struct brw_reg interp = interp_reg(location, k);
1339 interp = suboffset(interp, 3);
1340 interp.type = reg->type;
1341 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1342 attr = offset(attr, 1);
1343 }
1344 } else {
1345 /* Smooth/noperspective interpolation case. */
1346 for (unsigned int k = 0; k < type->vector_elements; k++) {
1347 struct brw_reg interp = interp_reg(location, k);
1348 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1349 /* Get the pixel/sample mask into f0 so that we know
1350 * which pixels are lit. Then, for each channel that is
1351 * unlit, replace the centroid data with non-centroid
1352 * data.
1353 */
1354 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1355
1356 fs_inst *inst;
1357 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1358 false, false);
1359 inst->predicate = BRW_PREDICATE_NORMAL;
1360 inst->predicate_inverse = true;
1361 if (brw->has_pln)
1362 inst->no_dd_clear = true;
1363
1364 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1365 ir->data.centroid && !key->persample_shading,
1366 ir->data.sample || key->persample_shading);
1367 inst->predicate = BRW_PREDICATE_NORMAL;
1368 inst->predicate_inverse = false;
1369 if (brw->has_pln)
1370 inst->no_dd_check = true;
1371
1372 } else {
1373 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1374 ir->data.centroid && !key->persample_shading,
1375 ir->data.sample || key->persample_shading);
1376 }
1377 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1378 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1379 }
1380 attr = offset(attr, 1);
1381 }
1382
1383 }
1384 location++;
1385 }
1386 }
1387
1388 return reg;
1389 }
1390
1391 fs_reg *
1392 fs_visitor::emit_frontfacing_interpolation()
1393 {
1394 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::bool_type);
1395
1396 if (brw->gen >= 6) {
1397 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1398 * a boolean result from this (~0/true or 0/false).
1399 *
1400 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1401 * this task in only one instruction:
1402 * - a negation source modifier will flip the bit; and
1403 * - a W -> D type conversion will sign extend the bit into the high
1404 * word of the destination.
1405 *
1406 * An ASR 15 fills the low word of the destination.
1407 */
1408 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1409 g0.negate = true;
1410
1411 emit(ASR(*reg, g0, fs_reg(15)));
1412 } else {
1413 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1414 * a boolean result from this (1/true or 0/false).
1415 *
1416 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1417 * the negation source modifier to flip it. Unfortunately the SHR
1418 * instruction only operates on UD (or D with an abs source modifier)
1419 * sources without negation.
1420 *
1421 * Instead, use ASR (which will give ~0/true or 0/false) followed by an
1422 * AND 1.
1423 */
1424 fs_reg asr = fs_reg(this, glsl_type::bool_type);
1425 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1426 g1_6.negate = true;
1427
1428 emit(ASR(asr, g1_6, fs_reg(31)));
1429 emit(AND(*reg, asr, fs_reg(1)));
1430 }
1431
1432 return reg;
1433 }
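/* Editor's note (trace of the gen6+ path above, not upstream code): for a
 * front-facing polygon bit 15 of g0.0:W is 0; the negation modifier flips
 * it to 1, the W -> D conversion sign-extends that bit into the high word,
 * and ASR 15 smears it across the low word, yielding ~0 (true). For a
 * back-facing polygon the flipped bit is 0 and the result is 0 (false).
 */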
1434
1435 void
1436 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1437 {
1438 assert(stage == MESA_SHADER_FRAGMENT);
1439 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1440 assert(dst.type == BRW_REGISTER_TYPE_F);
1441
1442 if (key->compute_pos_offset) {
1443 /* Convert int_sample_pos to floating point */
1444 emit(MOV(dst, int_sample_pos));
1445 /* Scale to the range [0, 1] */
1446 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1447 }
1448 else {
1449 /* From ARB_sample_shading specification:
1450 * "When rendering to a non-multisample buffer, or if multisample
1451 * rasterization is disabled, gl_SamplePosition will always be
1452 * (0.5, 0.5)."
1453 */
1454 emit(MOV(dst, fs_reg(0.5f)));
1455 }
1456 }
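/* Editor's note (illustrative, not upstream code): the payload sample
 * positions are in 1/16-pixel units, so the MUL by 1/16 above maps e.g. a
 * payload byte of 8 to the gl_SamplePosition value 8 / 16 = 0.5.
 */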
1457
1458 fs_reg *
1459 fs_visitor::emit_samplepos_setup()
1460 {
1461 assert(brw->gen >= 6);
1462
1463 this->current_annotation = "compute sample position";
1464 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec2_type);
1465 fs_reg pos = *reg;
1466 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1467 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1468
1469 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1470 * mode will be enabled.
1471 *
1472 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1473 * R31.1:0 Position Offset X/Y for Slot[3:0]
1474 * R31.3:2 Position Offset X/Y for Slot[7:4]
1475 * .....
1476 *
1477 * The X, Y sample positions come in as bytes in thread payload. So, read
1478 * the positions using vstride=16, width=8, hstride=2.
1479 */
1480 struct brw_reg sample_pos_reg =
1481 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1482 BRW_REGISTER_TYPE_B), 16, 8, 2);
1483
1484 if (dispatch_width == 8) {
1485 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1486 } else {
1487 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1488 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1489 ->force_sechalf = true;
1490 }
1491 /* Compute gl_SamplePosition.x */
1492 compute_sample_position(pos, int_sample_x);
1493 pos = offset(pos, 1);
1494 if (dispatch_width == 8) {
1495 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1496 } else {
1497 emit(MOV(half(int_sample_y, 0),
1498 fs_reg(suboffset(sample_pos_reg, 1))));
1499 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1500 ->force_sechalf = true;
1501 }
1502 /* Compute gl_SamplePosition.y */
1503 compute_sample_position(pos, int_sample_y);
1504 return reg;
1505 }
1506
1507 fs_reg *
1508 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1509 {
1510 assert(stage == MESA_SHADER_FRAGMENT);
1511 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1512 assert(brw->gen >= 6);
1513
1514 this->current_annotation = "compute sample id";
1515 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1516
1517 if (key->compute_sample_id) {
1518 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1519 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1520 t2.type = BRW_REGISTER_TYPE_UW;
1521
1522 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1523 * 8x multisampling, subspan 0 will represent sample N (where N
1524 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1525 * 7. We can find the value of N by looking at R0.0 bits 7:6
1526 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1527 * (since samples are always delivered in pairs). That is, we
1528 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1529 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1530 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1531 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1532 * populating a temporary variable with the sequence (0, 1, 2, 3),
1533 * and then reading from it using vstride=1, width=4, hstride=0.
1534 * These computations hold good for 4x multisampling as well.
1535 *
1536 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1537 * the first four slots are sample 0 of subspan 0; the next four
1538 * are sample 1 of subspan 0; the third group is sample 0 of
1539 * subspan 1, and finally sample 1 of subspan 1.
1540 */
1541 fs_inst *inst;
1542 inst = emit(BRW_OPCODE_AND, t1,
1543 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1544 fs_reg(0xc0));
1545 inst->force_writemask_all = true;
1546 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1547 inst->force_writemask_all = true;
1548 /* This works for both SIMD8 and SIMD16 */
1549 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1550 inst->force_writemask_all = true;
1551 /* This special instruction takes care of setting vstride=1,
1552 * width=4, hstride=0 of t2 during an ADD instruction.
1553 */
1554 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1555 } else {
1556 /* As per GL_ARB_sample_shading specification:
1557 * "When rendering to a non-multisample buffer, or if multisample
1558 * rasterization is disabled, gl_SampleID will always be zero."
1559 */
1560 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1561 }
1562
1563 return reg;
1564 }
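/* Editor's note (worked example, not upstream code): suppose R0.0 bits 7:6
 * (SSPI) read 0b10, i.e. the dispatch starts at sample pair 2. Then
 *
 *    t1 = (R0.0 & 0xc0) >> 5 = 0x80 >> 5 = 4 = 2 * SSPI
 *
 * and adding the SIMD8 sequence (0, 0, 0, 0, 1, 1, 1, 1) from t2 yields
 * sample IDs 4,4,4,4,5,5,5,5 for the two subspans.
 */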
1565
1566 fs_reg
1567 fs_visitor::fix_math_operand(fs_reg src)
1568 {
1569 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1570 * might be able to do better by doing execsize = 1 math and then
1571 * expanding that result out, but we would need to be careful with
1572 * masking.
1573 *
1574 * The hardware ignores source modifiers (negate and abs) on math
1575 * instructions, so we also move to a temp to set those up.
1576 */
1577 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1578 !src.abs && !src.negate)
1579 return src;
1580
1581 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1582 * operands to math
1583 */
1584 if (brw->gen >= 7 && src.file != IMM)
1585 return src;
1586
1587 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1588 expanded.type = src.type;
1589 emit(BRW_OPCODE_MOV, expanded, src);
1590 return expanded;
1591 }
1592
1593 fs_inst *
1594 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1595 {
1596 switch (opcode) {
1597 case SHADER_OPCODE_RCP:
1598 case SHADER_OPCODE_RSQ:
1599 case SHADER_OPCODE_SQRT:
1600 case SHADER_OPCODE_EXP2:
1601 case SHADER_OPCODE_LOG2:
1602 case SHADER_OPCODE_SIN:
1603 case SHADER_OPCODE_COS:
1604 break;
1605 default:
1606 unreachable("not reached: bad math opcode");
1607 }
1608
1609 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1610 * might be able to do better by doing execsize = 1 math and then
1611 * expanding that result out, but we would need to be careful with
1612 * masking.
1613 *
1614 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1615 * instructions, so we also move to a temp to set those up.
1616 */
1617 if (brw->gen == 6 || brw->gen == 7)
1618 src = fix_math_operand(src);
1619
1620 fs_inst *inst = emit(opcode, dst, src);
1621
1622 if (brw->gen < 6) {
1623 inst->base_mrf = 2;
1624 inst->mlen = dispatch_width / 8;
1625 }
1626
1627 return inst;
1628 }
1629
1630 fs_inst *
1631 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1632 {
1633 int base_mrf = 2;
1634 fs_inst *inst;
1635
1636 if (brw->gen >= 8) {
1637 inst = emit(opcode, dst, src0, src1);
1638 } else if (brw->gen >= 6) {
1639 src0 = fix_math_operand(src0);
1640 src1 = fix_math_operand(src1);
1641
1642 inst = emit(opcode, dst, src0, src1);
1643 } else {
1644 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1645 * "Message Payload":
1646 *
1647 * "Operand0[7]. For the INT DIV functions, this operand is the
1648 * denominator."
1649 * ...
1650 * "Operand1[7]. For the INT DIV functions, this operand is the
1651 * numerator."
1652 */
1653 bool is_int_div = opcode != SHADER_OPCODE_POW;
1654 fs_reg &op0 = is_int_div ? src1 : src0;
1655 fs_reg &op1 = is_int_div ? src0 : src1;
1656
1657 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1658 inst = emit(opcode, dst, op0, reg_null_f);
1659
1660 inst->base_mrf = base_mrf;
1661 inst->mlen = 2 * dispatch_width / 8;
1662 }
1663 return inst;
1664 }
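/* Editor's note (illustrative, not upstream code): on the gen4/5 path above
 * the operands of an integer division are swapped so that the denominator
 * (src1) becomes Operand0 and the numerator (src0) is written to the message
 * register as Operand1, matching the PRM quote; POW keeps the natural
 * src0/src1 order.
 */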
1665
1666 void
1667 fs_visitor::assign_curb_setup()
1668 {
1669 if (dispatch_width == 8) {
1670 prog_data->dispatch_grf_start_reg = payload.num_regs;
1671 } else {
1672 assert(stage == MESA_SHADER_FRAGMENT);
1673 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1674 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1675 }
1676
1677 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1678
1679 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1680 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1681 for (unsigned int i = 0; i < inst->sources; i++) {
1682 if (inst->src[i].file == UNIFORM) {
1683 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1684 int constant_nr;
1685 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1686 constant_nr = push_constant_loc[uniform_nr];
1687 } else {
1688 /* Section 5.11 of the OpenGL 4.1 spec says:
1689 * "Out-of-bounds reads return undefined values, which include
1690 * values from other variables of the active program or zero."
1691 * Just return the first push constant.
1692 */
1693 constant_nr = 0;
1694 }
1695
1696 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1697 constant_nr / 8,
1698 constant_nr % 8);
1699
1700 inst->src[i].file = HW_REG;
1701 inst->src[i].fixed_hw_reg = byte_offset(
1702 retype(brw_reg, inst->src[i].type),
1703 inst->src[i].subreg_offset);
1704 }
1705 }
1706 }
1707 }
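/* Editor's note (worked example, not upstream code): the push-constant
 * mapping above packs eight floats per GRF, so e.g. constant_nr = 11 lands
 * at
 *
 *    brw_vec1_grf(payload.num_regs + 11 / 8, 11 % 8)
 *
 * i.e. channel 3 of the second CURB register after the payload.
 */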
1708
1709 void
1710 fs_visitor::calculate_urb_setup()
1711 {
1712 assert(stage == MESA_SHADER_FRAGMENT);
1713 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1714 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1715
1716 memset(prog_data->urb_setup, -1,
1717 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1718
1719 int urb_next = 0;
1720 /* Figure out where each of the incoming setup attributes lands. */
1721 if (brw->gen >= 6) {
1722 if (_mesa_bitcount_64(prog->InputsRead &
1723 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1724 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1725 * first 16 varying inputs, so we can put them wherever we want.
1726 * Just put them in order.
1727 *
1728 * This is useful because it means that (a) inputs not used by the
1729 * fragment shader won't take up valuable register space, and (b) we
1730 * won't have to recompile the fragment shader if it gets paired with
1731 * a different vertex (or geometry) shader.
1732 */
1733 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1734 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1735 BITFIELD64_BIT(i)) {
1736 prog_data->urb_setup[i] = urb_next++;
1737 }
1738 }
1739 } else {
1740 /* We have enough input varyings that the SF/SBE pipeline stage can't
1741 * arbitrarily rearrange them to suit our whim; we have to put them
1742 * in an order that matches the output of the previous pipeline stage
1743 * (geometry or vertex shader).
1744 */
1745 struct brw_vue_map prev_stage_vue_map;
1746 brw_compute_vue_map(brw, &prev_stage_vue_map,
1747 key->input_slots_valid);
1748 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1749 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1750 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1751 slot++) {
1752 int varying = prev_stage_vue_map.slot_to_varying[slot];
1753 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1754 * unused.
1755 */
1756 if (varying != BRW_VARYING_SLOT_COUNT &&
1757 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1758 BITFIELD64_BIT(varying))) {
1759 prog_data->urb_setup[varying] = slot - first_slot;
1760 }
1761 }
1762 urb_next = prev_stage_vue_map.num_slots - first_slot;
1763 }
1764 } else {
1765 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1766 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1767 /* Point size is packed into the header, not as a general attribute */
1768 if (i == VARYING_SLOT_PSIZ)
1769 continue;
1770
1771 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1772 /* The back color slot is skipped when the front color is
1773 * also written to. In addition, some slots can be
1774 * written in the vertex shader and not read in the
1775 * fragment shader. So the register number must always be
1776 * incremented, mapped or not.
1777 */
1778 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1779 prog_data->urb_setup[i] = urb_next;
1780 urb_next++;
1781 }
1782 }
1783
1784 /*
1785 * It's an FS-only attribute, and we did the interpolation for this attribute
1786 * in the SF thread. So count it here, too.
1787 *
1788 * See compile_sf_prog() for more info.
1789 */
1790 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1791 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1792 }
1793
1794 prog_data->num_varying_inputs = urb_next;
1795 }
1796
1797 void
1798 fs_visitor::assign_urb_setup()
1799 {
1800 assert(stage == MESA_SHADER_FRAGMENT);
1801 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1802
1803 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1804
1805 /* Offset all the urb_setup[] index by the actual position of the
1806 * setup regs, now that the location of the constants has been chosen.
1807 */
1808 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1809 if (inst->opcode == FS_OPCODE_LINTERP) {
1810 assert(inst->src[2].file == HW_REG);
1811 inst->src[2].fixed_hw_reg.nr += urb_start;
1812 }
1813
1814 if (inst->opcode == FS_OPCODE_CINTERP) {
1815 assert(inst->src[0].file == HW_REG);
1816 inst->src[0].fixed_hw_reg.nr += urb_start;
1817 }
1818 }
1819
1820 /* Each attribute is 4 setup channels, each of which is half a reg. */
1821 this->first_non_payload_grf =
1822 urb_start + prog_data->num_varying_inputs * 2;
1823 }
1824
1825 /**
1826 * Split large virtual GRFs into separate components if we can.
1827 *
1828 * This is mostly duplicated with what brw_fs_vector_splitting does,
1829 * but that's really conservative because it's afraid of doing
1830 * splitting that doesn't result in real progress after the rest of
1831 * the optimization phases, which would cause infinite looping in
1832 * optimization. We can do it once here, safely. This also has the
1833 * opportunity to split interpolated values, or maybe even uniforms,
1834 * which we don't have at the IR level.
1835 *
1836 * We want to split, because virtual GRFs are what we register
1837 * allocate and spill (due to contiguousness requirements for some
1838 * instructions), and they're what we naturally generate in the
1839 * codegen process, but most virtual GRFs don't actually need to be
1840 * contiguous sets of GRFs. If we split, we'll end up with reduced
1841 * live intervals and better dead code elimination and coalescing.
1842 */
1843 void
1844 fs_visitor::split_virtual_grfs()
1845 {
1846 int num_vars = this->virtual_grf_count;
1847
1848 /* Count the total number of registers */
1849 int reg_count = 0;
1850 int vgrf_to_reg[num_vars];
1851 for (int i = 0; i < num_vars; i++) {
1852 vgrf_to_reg[i] = reg_count;
1853 reg_count += virtual_grf_sizes[i];
1854 }
1855
1856 /* An array of "split points". For each register slot, this indicates
1857 * if this slot can be separated from the previous slot. Every time an
1858 * instruction uses multiple elements of a register (as a source or
1859 * destination), we mark the used slots as inseparable. Then we go
1860 * through and split the registers into the smallest pieces we can.
1861 */
1862 bool split_points[reg_count];
1863 memset(split_points, 0, sizeof(split_points));
1864
1865 /* Mark all used registers as fully splittable */
1866 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1867 if (inst->dst.file == GRF) {
1868 int reg = vgrf_to_reg[inst->dst.reg];
1869 for (int j = 1; j < this->virtual_grf_sizes[inst->dst.reg]; j++)
1870 split_points[reg + j] = true;
1871 }
1872
1873 for (int i = 0; i < inst->sources; i++) {
1874 if (inst->src[i].file == GRF) {
1875 int reg = vgrf_to_reg[inst->src[i].reg];
1876 for (int j = 1; j < this->virtual_grf_sizes[inst->src[i].reg]; j++)
1877 split_points[reg + j] = true;
1878 }
1879 }
1880 }
1881
1882 if (brw->has_pln &&
1883 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1884 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1885 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1886 * Gen6, that was the only supported interpolation mode, and since Gen6,
1887 * delta_x and delta_y are in fixed hardware registers.
1888 */
1889 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1890 split_points[vgrf_to_reg[vgrf] + 1] = false;
1891 }
1892
1893 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1894 if (inst->dst.file == GRF) {
1895 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1896 for (int j = 1; j < inst->regs_written; j++)
1897 split_points[reg + j] = false;
1898 }
1899 for (int i = 0; i < inst->sources; i++) {
1900 if (inst->src[i].file == GRF) {
1901 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1902 for (int j = 1; j < inst->regs_read(this, i); j++)
1903 split_points[reg + j] = false;
1904 }
1905 }
1906 }
1907
1908 int new_virtual_grf[reg_count];
1909 int new_reg_offset[reg_count];
1910
1911 int reg = 0;
1912 for (int i = 0; i < num_vars; i++) {
1913 /* The first slot is never a split point; assert that as a quick sanity check. */
1914 assert(split_points[reg] == false);
1915
1916 /* j = 0 case */
1917 new_reg_offset[reg] = 0;
1918 reg++;
1919 int offset = 1;
1920
1921 /* j > 0 case */
1922 for (int j = 1; j < virtual_grf_sizes[i]; j++) {
1923 /* If this is a split point, reset the offset to 0 and allocate a
1924 * new virtual GRF for the preceding 'offset' registers
1925 */
1926 if (split_points[reg]) {
1927 int grf = virtual_grf_alloc(offset);
1928 for (int k = reg - offset; k < reg; k++)
1929 new_virtual_grf[k] = grf;
1930 offset = 0;
1931 }
1932 new_reg_offset[reg] = offset;
1933 offset++;
1934 reg++;
1935 }
1936
1937 /* The last one gets the original register number */
1938 virtual_grf_sizes[i] = offset;
1939 for (int k = reg - offset; k < reg; k++)
1940 new_virtual_grf[k] = i;
1941 }
1942 assert(reg == reg_count);
1943
1944 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1945 if (inst->dst.file == GRF) {
1946 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1947 inst->dst.reg = new_virtual_grf[reg];
1948 inst->dst.reg_offset = new_reg_offset[reg];
1949 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
1950 }
1951 for (int i = 0; i < inst->sources; i++) {
1952 if (inst->src[i].file == GRF) {
1953 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1954 inst->src[i].reg = new_virtual_grf[reg];
1955 inst->src[i].reg_offset = new_reg_offset[reg];
1956 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
1957 }
1958 }
1959 }
1960 invalidate_live_intervals();
1961 }
1962
1963 /**
1964 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1965 *
1966 * During code generation, we create tons of temporary variables, many of
1967 * which get immediately killed and are never used again. Yet, in later
1968 * optimization and analysis passes, such as compute_live_intervals, we need
1969 * to loop over all the virtual GRFs. Compacting them can save a lot of
1970 * overhead.
1971 */
1972 bool
1973 fs_visitor::compact_virtual_grfs()
1974 {
1975 bool progress = false;
1976 int remap_table[this->virtual_grf_count];
1977 memset(remap_table, -1, sizeof(remap_table));
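/* remap_table[v] stays -1 for vgrfs that no instruction reads or writes;
 * for the live ones it becomes the new, densely packed index.  E.g. if
 * only vgrfs 0, 2 and 5 are used, they are renumbered to 0, 1 and 2 and
 * virtual_grf_count drops to 3.
 */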
1978
1979 /* Mark which virtual GRFs are used. */
1980 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1981 if (inst->dst.file == GRF)
1982 remap_table[inst->dst.reg] = 0;
1983
1984 for (int i = 0; i < inst->sources; i++) {
1985 if (inst->src[i].file == GRF)
1986 remap_table[inst->src[i].reg] = 0;
1987 }
1988 }
1989
1990 /* Compact the GRF arrays. */
1991 int new_index = 0;
1992 for (int i = 0; i < this->virtual_grf_count; i++) {
1993 if (remap_table[i] == -1) {
1994 /* We just found an unused register. This means that we are
1995 * actually going to compact something.
1996 */
1997 progress = true;
1998 } else {
1999 remap_table[i] = new_index;
2000 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
2001 invalidate_live_intervals();
2002 ++new_index;
2003 }
2004 }
2005
2006 this->virtual_grf_count = new_index;
2007
2008 /* Patch all the instructions to use the newly renumbered registers */
2009 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2010 if (inst->dst.file == GRF)
2011 inst->dst.reg = remap_table[inst->dst.reg];
2012
2013 for (int i = 0; i < inst->sources; i++) {
2014 if (inst->src[i].file == GRF)
2015 inst->src[i].reg = remap_table[inst->src[i].reg];
2016 }
2017 }
2018
2019 /* Patch all the references to delta_x/delta_y, since they're used in
2020 * register allocation. If they're unused, switch them to BAD_FILE so
2021 * we don't think some random VGRF is delta_x/delta_y.
2022 */
2023 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2024 if (delta_x[i].file == GRF) {
2025 if (remap_table[delta_x[i].reg] != -1) {
2026 delta_x[i].reg = remap_table[delta_x[i].reg];
2027 } else {
2028 delta_x[i].file = BAD_FILE;
2029 }
2030 }
2031 }
2032 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2033 if (delta_y[i].file == GRF) {
2034 if (remap_table[delta_y[i].reg] != -1) {
2035 delta_y[i].reg = remap_table[delta_y[i].reg];
2036 } else {
2037 delta_y[i].file = BAD_FILE;
2038 }
2039 }
2040 }
2041
2042 return progress;
2043 }
2044
2045 /*
2046 * Implements array access of uniforms by inserting a
2047 * PULL_CONSTANT_LOAD instruction.
2048 *
2049 * Unlike temporary GRF array access (which we don't support, due to
2050 * the difficulty of doing relative addressing on instruction
2051 * destinations), we could potentially do array access of uniforms
2052 * that were loaded in GRF space as push constants. In real-world
2053 * usage we've seen, though, the arrays being used are always larger
2054 * than we could load as push constants, so just always move all
2055 * uniform array access out to a pull constant buffer.
2056 */
2057 void
2058 fs_visitor::move_uniform_array_access_to_pull_constants()
2059 {
2060 if (dispatch_width != 8)
2061 return;
2062
2063 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2064 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
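/* pull_constant_loc[u] == -1 means uniform slot u has not been demoted;
 * once an indirectly addressed array is found below, every element of
 * that array records the index of its copy in pull_param[].
 */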
2065
2066 /* Walk through and find array access of uniforms. Put a copy of that
2067 * uniform in the pull constant buffer.
2068 *
2069 * Note that we don't move constant-indexed accesses to arrays. No
2070 * testing has been done of the performance impact of this choice.
2071 */
2072 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2073 for (int i = 0 ; i < inst->sources; i++) {
2074 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2075 continue;
2076
2077 int uniform = inst->src[i].reg;
2078
2079 /* If this array isn't already present in the pull constant buffer,
2080 * add it.
2081 */
2082 if (pull_constant_loc[uniform] == -1) {
2083 const gl_constant_value **values = &stage_prog_data->param[uniform];
2084
2085 assert(param_size[uniform]);
2086
2087 for (int j = 0; j < param_size[uniform]; j++) {
2088 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2089
2090 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2091 values[j];
2092 }
2093 }
2094 }
2095 }
2096 }
2097
2098 /**
2099 * Assign UNIFORM file registers to either push constants or pull constants.
2100 *
2101 * We allow a fragment shader to have more than the spec's minimum maximum
2102 * number of fragment shader uniform components (64).  If there are too
2103 * many of these, they'd fill up all of the register space.
2104 * So, this will push some of them out to the pull constant buffer and
2105 * update the program to load them.
2106 */
2107 void
2108 fs_visitor::assign_constant_locations()
2109 {
2110 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2111 if (dispatch_width != 8)
2112 return;
2113
2114 /* Find which UNIFORM registers are still in use. */
2115 bool is_live[uniforms];
2116 for (unsigned int i = 0; i < uniforms; i++) {
2117 is_live[i] = false;
2118 }
2119
2120 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2121 for (int i = 0; i < inst->sources; i++) {
2122 if (inst->src[i].file != UNIFORM)
2123 continue;
2124
2125 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2126 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2127 is_live[constant_nr] = true;
2128 }
2129 }
2130
2131 /* Only allow 16 registers (128 uniform components) as push constants.
2132 *
2133 * Just demote the end of the list. We could probably do better
2134 * here, demoting things that are rarely used in the program first.
2135 *
2136 * If changing this value, note the limitation about total_regs in
2137 * brw_curbe.c.
2138 */
2139 unsigned int max_push_components = 16 * 8;
2140 unsigned int num_push_constants = 0;
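/* E.g. a shader with 200 live uniform components keeps the first 128 it
 * walks over below as push constants and demotes the remaining 72 to the
 * pull constant buffer.
 */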
2141
2142 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2143
2144 for (unsigned int i = 0; i < uniforms; i++) {
2145 if (!is_live[i] || pull_constant_loc[i] != -1) {
2146 /* This UNIFORM register is either dead, or has already been demoted
2147 * to a pull const. Mark it as no longer living in the param[] array.
2148 */
2149 push_constant_loc[i] = -1;
2150 continue;
2151 }
2152
2153 if (num_push_constants < max_push_components) {
2154 /* Retain as a push constant.  Record the location in the param[]
2155 * array.
2156 */
2157 push_constant_loc[i] = num_push_constants++;
2158 } else {
2159 /* Demote to a pull constant. */
2160 push_constant_loc[i] = -1;
2161
2162 int pull_index = stage_prog_data->nr_pull_params++;
2163 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2164 pull_constant_loc[i] = pull_index;
2165 }
2166 }
2167
2168 stage_prog_data->nr_params = num_push_constants;
2169
2170 /* Up until now, the param[] array has been indexed by reg + reg_offset
2171 * of UNIFORM registers. Condense it to only contain the uniforms we
2172 * chose to upload as push constants.
2173 */
2174 for (unsigned int i = 0; i < uniforms; i++) {
2175 int remapped = push_constant_loc[i];
2176
2177 if (remapped == -1)
2178 continue;
2179
2180 assert(remapped <= (int)i);
2181 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2182 }
2183 }
2184
2185 /**
2186 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2187 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2188 */
2189 void
2190 fs_visitor::demote_pull_constants()
2191 {
2192 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2193 for (int i = 0; i < inst->sources; i++) {
2194 if (inst->src[i].file != UNIFORM)
2195 continue;
2196
2197 int pull_index = pull_constant_loc[inst->src[i].reg +
2198 inst->src[i].reg_offset];
2199 if (pull_index == -1)
2200 continue;
2201
2202 /* Set up the annotation tracking for newly generated instructions. */
2203 base_ir = inst->ir;
2204 current_annotation = inst->annotation;
2205
2206 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2207 fs_reg dst = fs_reg(this, glsl_type::float_type);
2208
2209 /* Generate a pull load into dst. */
2210 if (inst->src[i].reladdr) {
2211 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2212 surf_index,
2213 *inst->src[i].reladdr,
2214 pull_index);
2215 inst->insert_before(block, &list);
2216 inst->src[i].reladdr = NULL;
2217 } else {
2218 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
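/* The pull constant buffer is read in vec4-aligned chunks.  E.g. for
 * pull_index 5 this computes byte offset 16 (the aligned vec4 holding
 * components 4..7), and the set_smear(5 & 3) below then selects
 * component 1 of that vec4, i.e. component 5 overall.
 */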
2219 fs_inst *pull =
2220 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2221 dst, surf_index, offset);
2222 inst->insert_before(block, pull);
2223 inst->src[i].set_smear(pull_index & 3);
2224 }
2225
2226 /* Rewrite the instruction to use the temporary VGRF. */
2227 inst->src[i].file = GRF;
2228 inst->src[i].reg = dst.reg;
2229 inst->src[i].reg_offset = 0;
2230 inst->src[i].width = dispatch_width;
2231 }
2232 }
2233 invalidate_live_intervals();
2234 }
2235
2236 bool
2237 fs_visitor::opt_algebraic()
2238 {
2239 bool progress = false;
2240
2241 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2242 switch (inst->opcode) {
2243 case BRW_OPCODE_MUL:
2244 if (inst->src[1].file != IMM)
2245 continue;
2246
2247 /* a * 1.0 = a */
2248 if (inst->src[1].is_one()) {
2249 inst->opcode = BRW_OPCODE_MOV;
2250 inst->src[1] = reg_undef;
2251 progress = true;
2252 break;
2253 }
2254
2255 /* a * 0.0 = 0.0 */
2256 if (inst->src[1].is_zero()) {
2257 inst->opcode = BRW_OPCODE_MOV;
2258 inst->src[0] = inst->src[1];
2259 inst->src[1] = reg_undef;
2260 progress = true;
2261 break;
2262 }
2263
2264 break;
2265 case BRW_OPCODE_ADD:
2266 if (inst->src[1].file != IMM)
2267 continue;
2268
2269 /* a + 0.0 = a */
2270 if (inst->src[1].is_zero()) {
2271 inst->opcode = BRW_OPCODE_MOV;
2272 inst->src[1] = reg_undef;
2273 progress = true;
2274 break;
2275 }
2276 break;
2277 case BRW_OPCODE_OR:
2278 if (inst->src[0].equals(inst->src[1])) {
2279 inst->opcode = BRW_OPCODE_MOV;
2280 inst->src[1] = reg_undef;
2281 progress = true;
2282 break;
2283 }
2284 break;
2285 case BRW_OPCODE_LRP:
2286 if (inst->src[1].equals(inst->src[2])) {
2287 inst->opcode = BRW_OPCODE_MOV;
2288 inst->src[0] = inst->src[1];
2289 inst->src[1] = reg_undef;
2290 inst->src[2] = reg_undef;
2291 progress = true;
2292 break;
2293 }
2294 break;
2295 case BRW_OPCODE_SEL:
2296 if (inst->src[0].equals(inst->src[1])) {
2297 inst->opcode = BRW_OPCODE_MOV;
2298 inst->src[1] = reg_undef;
2299 inst->predicate = BRW_PREDICATE_NONE;
2300 inst->predicate_inverse = false;
2301 progress = true;
2302 } else if (inst->saturate && inst->src[1].file == IMM) {
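/* With saturate set, a SEL against a constant can be a no-op: sel.l/le
 * against an immediate >= 1.0 (or sel.ge/g against one <= 0.0) clamps to
 * the same [0, 1] result as a plain saturating MOV of src0, which is
 * what the rewrites below produce.
 */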
2303 switch (inst->conditional_mod) {
2304 case BRW_CONDITIONAL_LE:
2305 case BRW_CONDITIONAL_L:
2306 switch (inst->src[1].type) {
2307 case BRW_REGISTER_TYPE_F:
2308 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2309 inst->opcode = BRW_OPCODE_MOV;
2310 inst->src[1] = reg_undef;
2311 progress = true;
2312 }
2313 break;
2314 default:
2315 break;
2316 }
2317 break;
2318 case BRW_CONDITIONAL_GE:
2319 case BRW_CONDITIONAL_G:
2320 switch (inst->src[1].type) {
2321 case BRW_REGISTER_TYPE_F:
2322 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2323 inst->opcode = BRW_OPCODE_MOV;
2324 inst->src[1] = reg_undef;
2325 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2326 progress = true;
2327 }
2328 break;
2329 default:
2330 break;
2331 }
2332 default:
2333 break;
2334 }
2335 }
2336 break;
2337 case SHADER_OPCODE_RCP: {
2338 fs_inst *prev = (fs_inst *)inst->prev;
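/* A SQRT immediately followed by an RCP of its result is 1/sqrt(x), so
 * turn the RCP into a single RSQ reading the SQRT's source.  Only the
 * directly preceding instruction is considered, and the SQRT itself is
 * left for dead code elimination if nothing else uses it.
 */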
2339 if (prev->opcode == SHADER_OPCODE_SQRT) {
2340 if (inst->src[0].equals(prev->dst)) {
2341 inst->opcode = SHADER_OPCODE_RSQ;
2342 inst->src[0] = prev->src[0];
2343 progress = true;
2344 }
2345 }
2346 break;
2347 }
2348 default:
2349 break;
2350 }
2351 }
2352
2353 return progress;
2354 }
2355
2356 bool
2357 fs_visitor::opt_register_renaming()
2358 {
2359 bool progress = false;
2360 int depth = 0;
2361
2362 int remap[virtual_grf_count];
2363 memset(remap, -1, sizeof(int) * virtual_grf_count);
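/* remap[v] == -1 means no qualifying whole-register def of vgrf v has
 * been seen yet; remap[v] == v means exactly one has; any other value is
 * the fresh vgrf that the most recent def was renamed to and that later
 * reads and writes of v should use instead.
 */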
2364
2365 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2366 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2367 depth++;
2368 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2369 inst->opcode == BRW_OPCODE_WHILE) {
2370 depth--;
2371 }
2372
2373 /* Rewrite instruction sources. */
2374 for (int i = 0; i < inst->sources; i++) {
2375 if (inst->src[i].file == GRF &&
2376 remap[inst->src[i].reg] != -1 &&
2377 remap[inst->src[i].reg] != inst->src[i].reg) {
2378 inst->src[i].reg = remap[inst->src[i].reg];
2379 progress = true;
2380 }
2381 }
2382
2383 const int dst = inst->dst.reg;
2384
2385 if (depth == 0 &&
2386 inst->dst.file == GRF &&
2387 virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
2388 !inst->is_partial_write()) {
2389 if (remap[dst] == -1) {
2390 remap[dst] = dst;
2391 } else {
2392 remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
2393 inst->dst.reg = remap[dst];
2394 progress = true;
2395 }
2396 } else if (inst->dst.file == GRF &&
2397 remap[dst] != -1 &&
2398 remap[dst] != dst) {
2399 inst->dst.reg = remap[dst];
2400 progress = true;
2401 }
2402 }
2403
2404 if (progress) {
2405 invalidate_live_intervals();
2406
2407 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2408 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2409 delta_x[i].reg = remap[delta_x[i].reg];
2410 }
2411 }
2412 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2413 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2414 delta_y[i].reg = remap[delta_y[i].reg];
2415 }
2416 }
2417 }
2418
2419 return progress;
2420 }
2421
2422 bool
2423 fs_visitor::compute_to_mrf()
2424 {
2425 bool progress = false;
2426 int next_ip = 0;
2427
2428 calculate_live_intervals();
2429
2430 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2431 int ip = next_ip;
2432 next_ip++;
2433
2434 if (inst->opcode != BRW_OPCODE_MOV ||
2435 inst->is_partial_write() ||
2436 inst->dst.file != MRF || inst->src[0].file != GRF ||
2437 inst->dst.type != inst->src[0].type ||
2438 inst->src[0].abs || inst->src[0].negate ||
2439 !inst->src[0].is_contiguous() ||
2440 inst->src[0].subreg_offset)
2441 continue;
2442
2443 /* Work out which hardware MRF registers are written by this
2444 * instruction.
2445 */
2446 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2447 int mrf_high;
2448 if (inst->dst.reg & BRW_MRF_COMPR4) {
2449 mrf_high = mrf_low + 4;
2450 } else if (inst->exec_size == 16) {
2451 mrf_high = mrf_low + 1;
2452 } else {
2453 mrf_high = mrf_low;
2454 }
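/* The two endpoints track the MRFs this instruction touches: a SIMD16
 * write to m(n) also hits m(n+1), while a COMPR4 write hits m(n) and
 * m(n+4).
 */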
2455
2456 /* Can't compute-to-MRF this GRF if someone else was going to
2457 * read it later.
2458 */
2459 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2460 continue;
2461
2462 /* Found a move of a GRF to an MRF.  Let's see if we can rewrite
2463 * the instruction that generated this GRF to write into the MRF instead.
2464 */
2465 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2466 if (scan_inst->dst.file == GRF &&
2467 scan_inst->dst.reg == inst->src[0].reg) {
2468 /* Found the last thing to write our reg we want to turn
2469 * into a compute-to-MRF.
2470 */
2471
2472 /* If this one instruction didn't populate all the
2473 * channels, bail. We might be able to rewrite everything
2474 * that writes that reg, but it would require smarter
2475 * tracking to delay the rewriting until complete success.
2476 */
2477 if (scan_inst->is_partial_write())
2478 break;
2479
2480 /* Things returning more than one register would need us to
2481 * understand coalescing out more than one MOV at a time.
2482 */
2483 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2484 break;
2485
2486 /* SEND instructions can't have MRF as a destination. */
2487 if (scan_inst->mlen)
2488 break;
2489
2490 if (brw->gen == 6) {
2491 /* gen6 math instructions must have the destination be
2492 * GRF, so no compute-to-MRF for them.
2493 */
2494 if (scan_inst->is_math()) {
2495 break;
2496 }
2497 }
2498
2499 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2500 /* Found the creator of our MRF's source value. */
2501 scan_inst->dst.file = MRF;
2502 scan_inst->dst.reg = inst->dst.reg;
2503 scan_inst->saturate |= inst->saturate;
2504 inst->remove(block);
2505 progress = true;
2506 }
2507 break;
2508 }
2509
2510 /* We don't handle control flow here. Most computation of
2511 * values that end up in MRFs happens shortly before the MRF
2512 * write anyway.
2513 */
2514 if (block->start() == scan_inst)
2515 break;
2516
2517 /* You can't read from an MRF, so if someone else reads our
2518 * MRF's source GRF that we wanted to rewrite, that stops us.
2519 */
2520 bool interfered = false;
2521 for (int i = 0; i < scan_inst->sources; i++) {
2522 if (scan_inst->src[i].file == GRF &&
2523 scan_inst->src[i].reg == inst->src[0].reg &&
2524 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2525 interfered = true;
2526 }
2527 }
2528 if (interfered)
2529 break;
2530
2531 if (scan_inst->dst.file == MRF) {
2532 /* If somebody else writes our MRF here, we can't
2533 * compute-to-MRF before that.
2534 */
2535 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2536 int scan_mrf_high;
2537
2538 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2539 scan_mrf_high = scan_mrf_low + 4;
2540 } else if (scan_inst->exec_size == 16) {
2541 scan_mrf_high = scan_mrf_low + 1;
2542 } else {
2543 scan_mrf_high = scan_mrf_low;
2544 }
2545
2546 if (mrf_low == scan_mrf_low ||
2547 mrf_low == scan_mrf_high ||
2548 mrf_high == scan_mrf_low ||
2549 mrf_high == scan_mrf_high) {
2550 break;
2551 }
2552 }
2553
2554 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2555 /* Found a SEND instruction, which means that there are
2556 * live values in MRFs from base_mrf to base_mrf +
2557 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2558 * above it.
2559 */
2560 if (mrf_low >= scan_inst->base_mrf &&
2561 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2562 break;
2563 }
2564 if (mrf_high >= scan_inst->base_mrf &&
2565 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2566 break;
2567 }
2568 }
2569 }
2570 }
2571
2572 if (progress)
2573 invalidate_live_intervals();
2574
2575 return progress;
2576 }
2577
2578 /**
2579 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2580 * instructions to FS_OPCODE_REP_FB_WRITE.
2581 */
2582 void
2583 fs_visitor::emit_repclear_shader()
2584 {
2585 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2586 int base_mrf = 1;
2587 int color_mrf = base_mrf + 2;
2588
2589 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2590 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2591 mov->force_writemask_all = true;
2592
2593 fs_inst *write;
2594 if (key->nr_color_regions == 1) {
2595 write = emit(FS_OPCODE_REP_FB_WRITE);
2596 write->saturate = key->clamp_fragment_color;
2597 write->base_mrf = color_mrf;
2598 write->target = 0;
2599 write->header_present = false;
2600 write->mlen = 1;
2601 } else {
2602 for (int i = 0; i < key->nr_color_regions; ++i) {
2603 write = emit(FS_OPCODE_REP_FB_WRITE);
2604 write->saturate = key->clamp_fragment_color;
2605 write->base_mrf = base_mrf;
2606 write->target = i;
2607 write->header_present = true;
2608 write->mlen = 3;
2609 }
2610 }
2611 write->eot = true;
2612
2613 calculate_cfg();
2614
2615 assign_constant_locations();
2616 assign_curb_setup();
2617
2618 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2619 assert(mov->src[0].file == HW_REG);
2620 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2621 }
2622
2623 /**
2624 * Walks through basic blocks, looking for repeated MRF writes and
2625 * removing the later ones.
2626 */
2627 bool
2628 fs_visitor::remove_duplicate_mrf_writes()
2629 {
2630 fs_inst *last_mrf_move[16];
2631 bool progress = false;
2632
2633 /* We'd need to update the MRF tracking for compressed instructions. */
2634 if (dispatch_width == 16)
2635 return false;
2636
2637 memset(last_mrf_move, 0, sizeof(last_mrf_move));
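/* last_mrf_move[m] points at the most recent complete GRF-to-MRF MOV
 * targeting m.  It is cleared on control flow, on any other write to m,
 * on implied MRF writes from SEND messages, and when the MOV's source
 * GRF is overwritten.
 */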
2638
2639 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2640 if (inst->is_control_flow()) {
2641 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2642 }
2643
2644 if (inst->opcode == BRW_OPCODE_MOV &&
2645 inst->dst.file == MRF) {
2646 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2647 if (prev_inst && inst->equals(prev_inst)) {
2648 inst->remove(block);
2649 progress = true;
2650 continue;
2651 }
2652 }
2653
2654 /* Clear out the last-write records for MRFs that were overwritten. */
2655 if (inst->dst.file == MRF) {
2656 last_mrf_move[inst->dst.reg] = NULL;
2657 }
2658
2659 if (inst->mlen > 0 && inst->base_mrf != -1) {
2660 /* Found a SEND instruction, which will include two or fewer
2661 * implied MRF writes. We could do better here.
2662 */
2663 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2664 last_mrf_move[inst->base_mrf + i] = NULL;
2665 }
2666 }
2667
2668 /* Clear out any MRF move records whose sources got overwritten. */
2669 if (inst->dst.file == GRF) {
2670 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2671 if (last_mrf_move[i] &&
2672 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2673 last_mrf_move[i] = NULL;
2674 }
2675 }
2676 }
2677
2678 if (inst->opcode == BRW_OPCODE_MOV &&
2679 inst->dst.file == MRF &&
2680 inst->src[0].file == GRF &&
2681 !inst->is_partial_write()) {
2682 last_mrf_move[inst->dst.reg] = inst;
2683 }
2684 }
2685
2686 if (progress)
2687 invalidate_live_intervals();
2688
2689 return progress;
2690 }
2691
2692 static void
2693 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2694 int first_grf, int grf_len)
2695 {
2696 /* Clear the flag for registers that actually got read (as expected). */
2697 for (int i = 0; i < inst->sources; i++) {
2698 int grf;
2699 if (inst->src[i].file == GRF) {
2700 grf = inst->src[i].reg;
2701 } else if (inst->src[i].file == HW_REG &&
2702 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2703 grf = inst->src[i].fixed_hw_reg.nr;
2704 } else {
2705 continue;
2706 }
2707
2708 if (grf >= first_grf &&
2709 grf < first_grf + grf_len) {
2710 deps[grf - first_grf] = false;
2711 if (inst->exec_size == 16)
2712 deps[grf - first_grf + 1] = false;
2713 }
2714 }
2715 }
2716
2717 /**
2718 * Implements this workaround for the original 965:
2719 *
2720 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2721 * check for post destination dependencies on this instruction, software
2722 * must ensure that there is no destination hazard for the case of ‘write
2723 * followed by a posted write’ shown in the following example.
2724 *
2725 * 1. mov r3 0
2726 * 2. send r3.xy <rest of send instruction>
2727 * 3. mov r2 r3
2728 *
2729 * Due to no post-destination dependency check on the ‘send’, the above
2730 * code sequence could have two instructions (1 and 2) in flight at the
2731 * same time that both consider ‘r3’ as the target of their final writes.
2732 */
2733 void
2734 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2735 fs_inst *inst)
2736 {
2737 int write_len = inst->regs_written;
2738 int first_write_grf = inst->dst.reg;
2739 bool needs_dep[BRW_MAX_MRF];
2740 assert(write_len < (int)sizeof(needs_dep) - 1);
2741
2742 memset(needs_dep, false, sizeof(needs_dep));
2743 memset(needs_dep, true, write_len);
2744
2745 clear_deps_for_inst_src(inst, dispatch_width,
2746 needs_dep, first_write_grf, write_len);
2747
2748 /* Walk backwards looking for writes to registers we're writing which
2749 * aren't read since being written. If we hit the start of the program,
2750 * we assume that there are no outstanding dependencies on entry to the
2751 * program.
2752 */
2753 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2754 /* If we hit control flow, assume that there *are* outstanding
2755 * dependencies, and force their cleanup before our instruction.
2756 */
2757 if (block->start() == scan_inst) {
2758 for (int i = 0; i < write_len; i++) {
2759 if (needs_dep[i]) {
2760 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2761 }
2762 }
2763 return;
2764 }
2765
2766 /* We insert our reads as late as possible on the assumption that any
2767 * instruction but a MOV that might have left us an outstanding
2768 * dependency has more latency than a MOV.
2769 */
2770 if (scan_inst->dst.file == GRF) {
2771 for (int i = 0; i < scan_inst->regs_written; i++) {
2772 int reg = scan_inst->dst.reg + i;
2773
2774 if (reg >= first_write_grf &&
2775 reg < first_write_grf + write_len &&
2776 needs_dep[reg - first_write_grf]) {
2777 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2778 needs_dep[reg - first_write_grf] = false;
2779 if (scan_inst->exec_size == 16)
2780 needs_dep[reg - first_write_grf + 1] = false;
2781 }
2782 }
2783 }
2784
2785 /* Clear the flag for registers that actually got read (as expected). */
2786 clear_deps_for_inst_src(scan_inst, dispatch_width,
2787 needs_dep, first_write_grf, write_len);
2788
2789 /* Continue the loop only if we haven't resolved all the dependencies */
2790 int i;
2791 for (i = 0; i < write_len; i++) {
2792 if (needs_dep[i])
2793 break;
2794 }
2795 if (i == write_len)
2796 return;
2797 }
2798 }
2799
2800 /**
2801 * Implements this workaround for the original 965:
2802 *
2803 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2804 * used as a destination register until after it has been sourced by an
2805 * instruction with a different destination register.
2806 */
2807 void
2808 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2809 {
2810 int write_len = inst->regs_written;
2811 int first_write_grf = inst->dst.reg;
2812 bool needs_dep[BRW_MAX_MRF];
2813 assert(write_len < (int)sizeof(needs_dep) - 1);
2814
2815 memset(needs_dep, false, sizeof(needs_dep));
2816 memset(needs_dep, true, write_len);
2817 /* Walk forwards looking for writes to registers we're writing which aren't
2818 * read before being written.
2819 */
2820 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2821 /* If we hit control flow, force resolve all remaining dependencies. */
2822 if (block->end() == scan_inst) {
2823 for (int i = 0; i < write_len; i++) {
2824 if (needs_dep[i])
2825 scan_inst->insert_before(block,
2826 DEP_RESOLVE_MOV(first_write_grf + i));
2827 }
2828 return;
2829 }
2830
2831 /* Clear the flag for registers that actually got read (as expected). */
2832 clear_deps_for_inst_src(scan_inst, dispatch_width,
2833 needs_dep, first_write_grf, write_len);
2834
2835 /* We insert our reads as late as possible since they're reading the
2836 * result of a SEND, which has massive latency.
2837 */
2838 if (scan_inst->dst.file == GRF &&
2839 scan_inst->dst.reg >= first_write_grf &&
2840 scan_inst->dst.reg < first_write_grf + write_len &&
2841 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2842 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2843 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2844 }
2845
2846 /* Continue the loop only if we haven't resolved all the dependencies */
2847 int i;
2848 for (i = 0; i < write_len; i++) {
2849 if (needs_dep[i])
2850 break;
2851 }
2852 if (i == write_len)
2853 return;
2854 }
2855
2856 /* If we hit the end of the program, resolve all remaining dependencies out
2857 * of paranoia.
2858 */
2859 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2860 assert(last_inst->eot);
2861 for (int i = 0; i < write_len; i++) {
2862 if (needs_dep[i])
2863 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2864 }
2865 }
2866
2867 void
2868 fs_visitor::insert_gen4_send_dependency_workarounds()
2869 {
2870 if (brw->gen != 4 || brw->is_g4x)
2871 return;
2872
2873 bool progress = false;
2874
2875 /* Note that we're done with register allocation, so GRF fs_regs always
2876 * have a .reg_offset of 0.
2877 */
2878
2879 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2880 if (inst->mlen != 0 && inst->dst.file == GRF) {
2881 insert_gen4_pre_send_dependency_workarounds(block, inst);
2882 insert_gen4_post_send_dependency_workarounds(block, inst);
2883 progress = true;
2884 }
2885 }
2886
2887 if (progress)
2888 invalidate_live_intervals();
2889 }
2890
2891 /**
2892 * Turns the generic expression-style uniform pull constant load instruction
2893 * into a hardware-specific series of instructions for loading a pull
2894 * constant.
2895 *
2896 * The expression style allows the CSE pass before this to optimize out
2897 * repeated loads from the same offset, and gives the pre-register-allocation
2898 * scheduling full flexibility, while the conversion to native instructions
2899 * allows the post-register-allocation scheduler the best information
2900 * possible.
2901 *
2902 * Note that execution masking for setting up pull constant loads is special:
2903 * the channels that need to be written are unrelated to the current execution
2904 * mask, since a later instruction will use one of the result channels as a
2905 * source operand for all 8 or 16 of its channels.
2906 */
2907 void
2908 fs_visitor::lower_uniform_pull_constant_loads()
2909 {
2910 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2911 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2912 continue;
2913
2914 if (brw->gen >= 7) {
2915 /* The offset arg before was a vec4-aligned byte offset. We need to
2916 * turn it into a dword offset.
2917 */
2918 fs_reg const_offset_reg = inst->src[1];
2919 assert(const_offset_reg.file == IMM &&
2920 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2921 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
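/* E.g. a vec4-aligned byte offset of 32 becomes a dword offset of 8. */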
2922 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2923
2924 /* This is actually going to be a MOV, but since only the first dword
2925 * is accessed, we have a special opcode to do just that one. Note
2926 * that this needs to be an operation that will be considered a def
2927 * by live variable analysis, or register allocation will explode.
2928 */
2929 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2930 8, payload, const_offset_reg);
2931 setup->force_writemask_all = true;
2932
2933 setup->ir = inst->ir;
2934 setup->annotation = inst->annotation;
2935 inst->insert_before(block, setup);
2936
2937 /* Similarly, this will only populate the first 4 channels of the
2938 * result register (since we only use smear values from 0-3), but we
2939 * don't tell the optimizer.
2940 */
2941 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2942 inst->src[1] = payload;
2943
2944 invalidate_live_intervals();
2945 } else {
2946 /* Before register allocation, we didn't tell the scheduler about the
2947 * MRF we use. We know it's safe to use this MRF because nothing
2948 * else does except for register spill/unspill, which generates and
2949 * uses its MRF within a single IR instruction.
2950 */
2951 inst->base_mrf = 14;
2952 inst->mlen = 1;
2953 }
2954 }
2955 }
2956
2957 bool
2958 fs_visitor::lower_load_payload()
2959 {
2960 bool progress = false;
2961
2962 int vgrf_to_reg[virtual_grf_count];
2963 int reg_count = 16; /* Leave room for MRF */
2964 for (int i = 0; i < virtual_grf_count; ++i) {
2965 vgrf_to_reg[i] = reg_count;
2966 reg_count += virtual_grf_sizes[i];
2967 }
2968
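/* Flat slot numbering: MRF destinations use their own register number
 * (slots 0..15 are reserved for them above), while vgrf v's slot o maps
 * to vgrf_to_reg[v] + o, so both destination files can share the
 * metadata[] array below.
 */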
2969 struct {
2970 bool written:1; /* Whether this register has ever been written */
2971 bool force_writemask_all:1;
2972 bool force_sechalf:1;
2973 } metadata[reg_count];
2974 memset(metadata, 0, sizeof(metadata));
2975
2976 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2977 int dst_reg;
2978 if (inst->dst.file == MRF) {
2979 dst_reg = inst->dst.reg;
2980 } else if (inst->dst.file == GRF) {
2981 dst_reg = vgrf_to_reg[inst->dst.reg];
2982 }
2983
2984 if (inst->dst.file == MRF || inst->dst.file == GRF) {
2985 bool force_sechalf = inst->force_sechalf;
2986 bool toggle_sechalf = inst->dst.width == 16 &&
2987 type_sz(inst->dst.type) == 4;
2988 for (int i = 0; i < inst->regs_written; ++i) {
2989 metadata[dst_reg + i].written = true;
2990 metadata[dst_reg + i].force_sechalf = force_sechalf;
2991 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
2992 force_sechalf = (toggle_sechalf != force_sechalf);
2993 }
2994 }
2995
2996 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2997 assert(inst->dst.file == MRF || inst->dst.file == GRF);
2998 fs_reg dst = inst->dst;
2999
3000 for (int i = 0; i < inst->sources; i++) {
3001 dst.width = inst->src[i].effective_width;
3002 dst.type = inst->src[i].type;
3003
3004 if (inst->src[i].file == BAD_FILE) {
3005 /* Do nothing but otherwise increment as normal */
3006 } else if (dst.file == MRF &&
3007 dst.width == 8 &&
3008 brw->has_compr4 &&
3009 i + 4 < inst->sources &&
3010 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3011 fs_reg compr4_dst = dst;
3012 compr4_dst.reg += BRW_MRF_COMPR4;
3013 compr4_dst.width = 16;
3014 fs_reg compr4_src = inst->src[i];
3015 compr4_src.width = 16;
3016 fs_inst *mov = MOV(compr4_dst, compr4_src);
3017 mov->force_writemask_all = true;
3018 inst->insert_before(block, mov);
3019 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3020 inst->src[i + 4].file = BAD_FILE;
3021 } else {
3022 fs_inst *mov = MOV(dst, inst->src[i]);
3023 if (inst->src[i].file == GRF) {
3024 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3025 inst->src[i].reg_offset;
3026 mov->force_sechalf = metadata[src_reg].force_sechalf;
3027 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3028 metadata[dst_reg] = metadata[src_reg];
3029 if (dst.width * type_sz(dst.type) > 32) {
3030 assert((!metadata[src_reg].written ||
3031 !metadata[src_reg].force_sechalf) &&
3032 (!metadata[src_reg + 1].written ||
3033 metadata[src_reg + 1].force_sechalf));
3034 metadata[dst_reg + 1] = metadata[src_reg + 1];
3035 }
3036 } else {
3037 metadata[dst_reg].force_writemask_all = false;
3038 metadata[dst_reg].force_sechalf = false;
3039 if (dst.width == 16) {
3040 metadata[dst_reg + 1].force_writemask_all = false;
3041 metadata[dst_reg + 1].force_sechalf = true;
3042 }
3043 }
3044 inst->insert_before(block, mov);
3045 }
3046
3047 dst = offset(dst, 1);
3048 }
3049
3050 inst->remove(block);
3051 progress = true;
3052 }
3053 }
3054
3055 if (progress)
3056 invalidate_live_intervals();
3057
3058 return progress;
3059 }
3060
3061 void
3062 fs_visitor::dump_instructions()
3063 {
3064 dump_instructions(NULL);
3065 }
3066
3067 void
3068 fs_visitor::dump_instructions(const char *name)
3069 {
3070 calculate_register_pressure();
3071 FILE *file = stderr;
3072 if (name && geteuid() != 0) {
3073 file = fopen(name, "w");
3074 if (!file)
3075 file = stderr;
3076 }
3077
3078 int ip = 0, max_pressure = 0;
3079 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3080 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3081 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3082 dump_instruction(inst, file);
3083 ++ip;
3084 }
3085 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3086
3087 if (file != stderr) {
3088 fclose(file);
3089 }
3090 }
3091
3092 void
3093 fs_visitor::dump_instruction(backend_instruction *be_inst)
3094 {
3095 dump_instruction(be_inst, stderr);
3096 }
3097
3098 void
3099 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3100 {
3101 fs_inst *inst = (fs_inst *)be_inst;
3102
3103 if (inst->predicate) {
3104 fprintf(file, "(%cf0.%d) ",
3105 inst->predicate_inverse ? '-' : '+',
3106 inst->flag_subreg);
3107 }
3108
3109 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3110 if (inst->saturate)
3111 fprintf(file, ".sat");
3112 if (inst->conditional_mod) {
3113 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3114 if (!inst->predicate &&
3115 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3116 inst->opcode != BRW_OPCODE_IF &&
3117 inst->opcode != BRW_OPCODE_WHILE))) {
3118 fprintf(file, ".f0.%d", inst->flag_subreg);
3119 }
3120 }
3121 fprintf(file, "(%d) ", inst->exec_size);
3122
3123
3124 switch (inst->dst.file) {
3125 case GRF:
3126 fprintf(file, "vgrf%d", inst->dst.reg);
3127 if (inst->dst.width != dispatch_width)
3128 fprintf(file, "@%d", inst->dst.width);
3129 if (virtual_grf_sizes[inst->dst.reg] != inst->dst.width / 8 ||
3130 inst->dst.subreg_offset)
3131 fprintf(file, "+%d.%d",
3132 inst->dst.reg_offset, inst->dst.subreg_offset);
3133 break;
3134 case MRF:
3135 fprintf(file, "m%d", inst->dst.reg);
3136 break;
3137 case BAD_FILE:
3138 fprintf(file, "(null)");
3139 break;
3140 case UNIFORM:
3141 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3142 break;
3143 case HW_REG:
3144 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3145 switch (inst->dst.fixed_hw_reg.nr) {
3146 case BRW_ARF_NULL:
3147 fprintf(file, "null");
3148 break;
3149 case BRW_ARF_ADDRESS:
3150 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3151 break;
3152 case BRW_ARF_ACCUMULATOR:
3153 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3154 break;
3155 case BRW_ARF_FLAG:
3156 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3157 inst->dst.fixed_hw_reg.subnr);
3158 break;
3159 default:
3160 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3161 inst->dst.fixed_hw_reg.subnr);
3162 break;
3163 }
3164 } else {
3165 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3166 }
3167 if (inst->dst.fixed_hw_reg.subnr)
3168 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3169 break;
3170 default:
3171 fprintf(file, "???");
3172 break;
3173 }
3174 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3175
3176 for (int i = 0; i < inst->sources; i++) {
3177 if (inst->src[i].negate)
3178 fprintf(file, "-");
3179 if (inst->src[i].abs)
3180 fprintf(file, "|");
3181 switch (inst->src[i].file) {
3182 case GRF:
3183 fprintf(file, "vgrf%d", inst->src[i].reg);
3184 if (inst->src[i].width != dispatch_width)
3185 fprintf(file, "@%d", inst->src[i].width);
3186 if (virtual_grf_sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3187 inst->src[i].subreg_offset)
3188 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3189 inst->src[i].subreg_offset);
3190 break;
3191 case MRF:
3192 fprintf(file, "***m%d***", inst->src[i].reg);
3193 break;
3194 case UNIFORM:
3195 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3196 if (inst->src[i].reladdr) {
3197 fprintf(file, "+reladdr");
3198 } else if (inst->src[i].subreg_offset) {
3199 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3200 inst->src[i].subreg_offset);
3201 }
3202 break;
3203 case BAD_FILE:
3204 fprintf(file, "(null)");
3205 break;
3206 case IMM:
3207 switch (inst->src[i].type) {
3208 case BRW_REGISTER_TYPE_F:
3209 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3210 break;
3211 case BRW_REGISTER_TYPE_D:
3212 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3213 break;
3214 case BRW_REGISTER_TYPE_UD:
3215 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3216 break;
3217 default:
3218 fprintf(file, "???");
3219 break;
3220 }
3221 break;
3222 case HW_REG:
3223 if (inst->src[i].fixed_hw_reg.negate)
3224 fprintf(file, "-");
3225 if (inst->src[i].fixed_hw_reg.abs)
3226 fprintf(file, "|");
3227 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3228 switch (inst->src[i].fixed_hw_reg.nr) {
3229 case BRW_ARF_NULL:
3230 fprintf(file, "null");
3231 break;
3232 case BRW_ARF_ADDRESS:
3233 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3234 break;
3235 case BRW_ARF_ACCUMULATOR:
3236 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3237 break;
3238 case BRW_ARF_FLAG:
3239 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3240 inst->src[i].fixed_hw_reg.subnr);
3241 break;
3242 default:
3243 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3244 inst->src[i].fixed_hw_reg.subnr);
3245 break;
3246 }
3247 } else {
3248 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3249 }
3250 if (inst->src[i].fixed_hw_reg.subnr)
3251 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3252 if (inst->src[i].fixed_hw_reg.abs)
3253 fprintf(file, "|");
3254 break;
3255 default:
3256 fprintf(file, "???");
3257 break;
3258 }
3259 if (inst->src[i].abs)
3260 fprintf(file, "|");
3261
3262 if (inst->src[i].file != IMM) {
3263 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3264 }
3265
3266 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3267 fprintf(file, ", ");
3268 }
3269
3270 fprintf(file, " ");
3271
3272 if (dispatch_width == 16 && inst->exec_size == 8) {
3273 if (inst->force_sechalf)
3274 fprintf(file, "2ndhalf ");
3275 else
3276 fprintf(file, "1sthalf ");
3277 }
3278
3279 fprintf(file, "\n");
3280 }
3281
3282 /**
3283 * Possibly returns an instruction that set up @param reg.
3284 *
3285 * Sometimes we want to take the result of some expression/variable
3286 * dereference tree and rewrite the instruction generating the result
3287 * of the tree. When processing the tree, we know that the
3288 * instructions generated are all writing temporaries that are dead
3289 * outside of this tree. So, if we have some instructions that write
3290 * a temporary, we're free to point that temp write somewhere else.
3291 *
3292 * Note that this doesn't guarantee that the instruction generated writes
3293 * only reg -- it might be the size=4 destination of a texture instruction.
3294 */
3295 fs_inst *
3296 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3297 fs_inst *end,
3298 const fs_reg &reg)
3299 {
3300 if (end == start ||
3301 end->is_partial_write() ||
3302 reg.reladdr ||
3303 !reg.equals(end->dst)) {
3304 return NULL;
3305 } else {
3306 return end;
3307 }
3308 }
3309
3310 void
3311 fs_visitor::setup_payload_gen6()
3312 {
3313 bool uses_depth =
3314 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3315 unsigned barycentric_interp_modes =
3316 (stage == MESA_SHADER_FRAGMENT) ?
3317 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3318
3319 assert(brw->gen >= 6);
3320
3321 /* R0-1: masks, pixel X/Y coordinates. */
3322 payload.num_regs = 2;
3323 /* R2: only for 32-pixel dispatch. */
3324
3325 /* R3-26: barycentric interpolation coordinates. These appear in the
3326 * same order that they appear in the brw_wm_barycentric_interp_mode
3327 * enum. Each set of coordinates occupies 2 registers if dispatch width
3328 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3329 * appear if they were enabled using the "Barycentric Interpolation
3330 * Mode" bits in WM_STATE.
3331 */
3332 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3333 if (barycentric_interp_modes & (1 << i)) {
3334 payload.barycentric_coord_reg[i] = payload.num_regs;
3335 payload.num_regs += 2;
3336 if (dispatch_width == 16) {
3337 payload.num_regs += 2;
3338 }
3339 }
3340 }
3341
3342 /* R27: interpolated depth if uses source depth */
3343 if (uses_depth) {
3344 payload.source_depth_reg = payload.num_regs;
3345 payload.num_regs++;
3346 if (dispatch_width == 16) {
3347 /* R28: interpolated depth if not SIMD8. */
3348 payload.num_regs++;
3349 }
3350 }
3351 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3352 if (uses_depth) {
3353 payload.source_w_reg = payload.num_regs;
3354 payload.num_regs++;
3355 if (dispatch_width == 16) {
3356 /* R30: interpolated W if not SIMD8. */
3357 payload.num_regs++;
3358 }
3359 }
3360
3361 if (stage == MESA_SHADER_FRAGMENT) {
3362 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3363 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3364 prog_data->uses_pos_offset = key->compute_pos_offset;
3365 /* R31: MSAA position offsets. */
3366 if (prog_data->uses_pos_offset) {
3367 payload.sample_pos_reg = payload.num_regs;
3368 payload.num_regs++;
3369 }
3370 }
3371
3372 /* R32: MSAA input coverage mask */
3373 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3374 assert(brw->gen >= 7);
3375 payload.sample_mask_in_reg = payload.num_regs;
3376 payload.num_regs++;
3377 if (dispatch_width == 16) {
3378 /* R33: input coverage mask if not SIMD8. */
3379 payload.num_regs++;
3380 }
3381 }
3382
3383 /* R34-: bary for 32-pixel. */
3384 /* R58-59: interp W for 32-pixel. */
3385
3386 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3387 source_depth_to_render_target = true;
3388 }
3389 }
3390
3391 void
3392 fs_visitor::assign_binding_table_offsets()
3393 {
3394 assert(stage == MESA_SHADER_FRAGMENT);
3395 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3396 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3397 uint32_t next_binding_table_offset = 0;
3398
3399 /* If there are no color regions, we still perform an FB write to a null
3400 * renderbuffer, which we place at surface index 0.
3401 */
3402 prog_data->binding_table.render_target_start = next_binding_table_offset;
3403 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3404
3405 assign_common_binding_table_offsets(next_binding_table_offset);
3406 }
3407
3408 void
3409 fs_visitor::calculate_register_pressure()
3410 {
3411 invalidate_live_intervals();
3412 calculate_live_intervals();
3413
3414 unsigned num_instructions = 0;
3415 foreach_block(block, cfg)
3416 num_instructions += block->instructions.length();
3417
3418 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3419
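/* Sum the size of every vgrf whose live range covers each instruction:
 * after this, regs_live_at_ip[ip] is a per-instruction estimate of how
 * many GRFs are live at ip.
 */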
3420 for (int reg = 0; reg < virtual_grf_count; reg++) {
3421 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3422 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3423 }
3424 }
3425
3426 /**
3427 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3428 *
3429 * The needs_unlit_centroid_workaround ends up producing one of these per
3430 * channel of centroid input, so it's good to clean them up.
3431 *
3432 * An assumption here is that nothing ever modifies the dispatched pixels
3433 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3434 * dictates that anyway.
3435 */
3436 void
3437 fs_visitor::opt_drop_redundant_mov_to_flags()
3438 {
3439 bool flag_mov_found[2] = {false};
3440
3441 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3442 if (inst->is_control_flow()) {
3443 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3444 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3445 if (!flag_mov_found[inst->flag_subreg])
3446 flag_mov_found[inst->flag_subreg] = true;
3447 else
3448 inst->remove(block);
3449 } else if (inst->writes_flag()) {
3450 flag_mov_found[inst->flag_subreg] = false;
3451 }
3452 }
3453 }
3454
3455 bool
3456 fs_visitor::run()
3457 {
3458 sanity_param_count = prog->Parameters->NumParameters;
3459 bool allocated_without_spills;
3460
3461 assign_binding_table_offsets();
3462
3463 if (brw->gen >= 6)
3464 setup_payload_gen6();
3465 else
3466 setup_payload_gen4();
3467
3468 if (0) {
3469 emit_dummy_fs();
3470 } else if (brw->use_rep_send && dispatch_width == 16) {
3471 emit_repclear_shader();
3472 allocated_without_spills = true;
3473 } else {
3474 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3475 emit_shader_time_begin();
3476
3477 calculate_urb_setup();
3478 if (prog->InputsRead > 0) {
3479 if (brw->gen < 6)
3480 emit_interpolation_setup_gen4();
3481 else
3482 emit_interpolation_setup_gen6();
3483 }
3484
3485 /* We handle discards by keeping track of the still-live pixels in f0.1.
3486 * Initialize it with the dispatched pixels.
3487 */
3488 bool uses_kill =
3489 (stage == MESA_SHADER_FRAGMENT) &&
3490 ((brw_wm_prog_data*) this->prog_data)->uses_kill;
3491 bool alpha_test_func =
3492 (stage == MESA_SHADER_FRAGMENT) &&
3493 ((brw_wm_prog_key*) this->key)->alpha_test_func;
3494 if (uses_kill || alpha_test_func) {
3495 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3496 discard_init->flag_subreg = 1;
3497 }
3498
3499 /* Generate FS IR for main().  (The visitor only descends into
3500 * functions called "main".)
3501 */
3502 if (shader) {
3503 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3504 base_ir = ir;
3505 this->result = reg_undef;
3506 ir->accept(this);
3507 }
3508 } else {
3509 emit_fragment_program_code();
3510 }
3511 base_ir = NULL;
3512 if (failed)
3513 return false;
3514
3515 emit(FS_OPCODE_PLACEHOLDER_HALT);
3516
3517 if (alpha_test_func)
3518 emit_alpha_test();
3519
3520 emit_fb_writes();
3521
3522 calculate_cfg();
3523
3524 split_virtual_grfs();
3525
3526 move_uniform_array_access_to_pull_constants();
3527 assign_constant_locations();
3528 demote_pull_constants();
3529
3530 opt_drop_redundant_mov_to_flags();
3531
3532 #define OPT(pass, args...) do { \
3533 pass_num++; \
3534 bool this_progress = pass(args); \
3535 \
3536 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3537 char filename[64]; \
3538 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3539 dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3540 \
3541 backend_visitor::dump_instructions(filename); \
3542 } \
3543 \
3544 progress = progress || this_progress; \
3545 } while (false)
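/* Each OPT() invocation runs one pass, accumulates whether it made
 * progress, and, when DEBUG_OPTIMIZER is set in INTEL_DEBUG, dumps the
 * IR to a file named after the dispatch width, shader program, iteration
 * and pass number whenever that pass changed something.
 */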
3546
3547 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3548 char filename[64];
3549 snprintf(filename, 64, "fs%d-%04d-00-start",
3550 dispatch_width, shader_prog ? shader_prog->Name : 0);
3551
3552 backend_visitor::dump_instructions(filename);
3553 }
3554
3555 bool progress;
3556 int iteration = 0;
3557 do {
3558 progress = false;
3559 iteration++;
3560 int pass_num = 0;
3561
3562 OPT(remove_duplicate_mrf_writes);
3563
3564 OPT(opt_algebraic);
3565 OPT(opt_cse);
3566 OPT(opt_copy_propagate);
3567 OPT(opt_peephole_predicated_break);
3568 OPT(dead_code_eliminate);
3569 OPT(opt_peephole_sel);
3570 OPT(dead_control_flow_eliminate, this);
3571 OPT(opt_register_renaming);
3572 OPT(opt_saturate_propagation);
3573 OPT(register_coalesce);
3574 OPT(compute_to_mrf);
3575
3576 OPT(compact_virtual_grfs);
3577 } while (progress);
3578
3579 if (lower_load_payload()) {
3580 split_virtual_grfs();
3581 register_coalesce();
3582 compute_to_mrf();
3583 dead_code_eliminate();
3584 }
3585
3586 lower_uniform_pull_constant_loads();
3587
3588 assign_curb_setup();
3589 assign_urb_setup();
3590
3591 static enum instruction_scheduler_mode pre_modes[] = {
3592 SCHEDULE_PRE,
3593 SCHEDULE_PRE_NON_LIFO,
3594 SCHEDULE_PRE_LIFO,
3595 };
3596
3597 /* Try each scheduling heuristic to see if it can successfully register
3598 * allocate without spilling. They should be ordered by decreasing
3599 * performance but increasing likelihood of allocating.
3600 */
3601 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3602 schedule_instructions(pre_modes[i]);
3603
3604 if (0) {
3605 assign_regs_trivial();
3606 allocated_without_spills = true;
3607 } else {
3608 allocated_without_spills = assign_regs(false);
3609 }
3610 if (allocated_without_spills)
3611 break;
3612 }
3613
3614 if (!allocated_without_spills) {
3615 /* We assume that any spilling is worse than just dropping back to
3616 * SIMD8. There's probably actually some intermediate point where
3617 * SIMD16 with a couple of spills is still better.
3618 */
3619 if (dispatch_width == 16) {
3620 fail("Failure to register allocate. Reduce number of "
3621 "live scalar values to avoid this.");
3622 } else {
3623 perf_debug("Fragment shader triggered register spilling. "
3624 "Try reducing the number of live scalar values to "
3625 "improve performance.\n");
3626 }
3627
3628 /* Since we're out of heuristics, just go spill registers until we
3629 * get an allocation.
3630 */
3631 while (!assign_regs(true)) {
3632 if (failed)
3633 break;
3634 }
3635 }
3636 }
3637 assert(force_uncompressed_stack == 0);
3638
3639 /* This must come after all optimization and register allocation, since
3640 * it inserts dead code that happens to have side effects, and it does
3641 * so based on the actual physical registers in use.
3642 */
3643 insert_gen4_send_dependency_workarounds();
3644
3645 if (failed)
3646 return false;
3647
3648 if (!allocated_without_spills)
3649 schedule_instructions(SCHEDULE_POST);
3650
3651 if (last_scratch > 0) {
3652 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3653 }
3654
3655 if (stage == MESA_SHADER_FRAGMENT) {
3656 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3657 if (dispatch_width == 8)
3658 prog_data->reg_blocks = brw_register_blocks(grf_used);
3659 else
3660 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3661 }
3662
3663 /* If any state parameters were appended, then ParameterValues could have
3664 * been realloced, in which case the driver uniform storage set up by
3665 * _mesa_associate_uniform_storage() would point to freed memory. Make
3666 * sure that didn't happen.
3667 */
3668 assert(sanity_param_count == prog->Parameters->NumParameters);
3669
3670 return !failed;
3671 }
3672
3673 const unsigned *
3674 brw_wm_fs_emit(struct brw_context *brw,
3675 void *mem_ctx,
3676 const struct brw_wm_prog_key *key,
3677 struct brw_wm_prog_data *prog_data,
3678 struct gl_fragment_program *fp,
3679 struct gl_shader_program *prog,
3680 unsigned *final_assembly_size)
3681 {
3682 bool start_busy = false;
3683 double start_time = 0;
3684
3685 if (unlikely(brw->perf_debug)) {
3686 start_busy = (brw->batch.last_bo &&
3687 drm_intel_bo_busy(brw->batch.last_bo));
3688 start_time = get_time();
3689 }
3690
3691 struct brw_shader *shader = NULL;
3692 if (prog)
3693 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3694
3695 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3696 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3697
3698 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3699 */
3700 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3701 if (!v.run()) {
3702 if (prog) {
3703 prog->LinkStatus = false;
3704 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3705 }
3706
3707 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3708 v.fail_msg);
3709
3710 return NULL;
3711 }
3712
3713 cfg_t *simd16_cfg = NULL;
3714 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3715 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3716 brw->use_rep_send)) {
3717 if (!v.simd16_unsupported) {
3718 /* Try a SIMD16 compile */
3719 v2.import_uniforms(&v);
3720 if (!v2.run()) {
3721 perf_debug("SIMD16 shader failed to compile, falling back to "
3722 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3723 } else {
3724 simd16_cfg = v2.cfg;
3725 }
3726 } else {
3727 perf_debug("SIMD16 shader unsupported, falling back to "
3728 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3729 }
3730 }
3731
3732 cfg_t *simd8_cfg;
3733 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3734 if (no_simd8 && simd16_cfg) {
3735 simd8_cfg = NULL;
3736 prog_data->no_8 = true;
3737 } else {
3738 simd8_cfg = v.cfg;
3739 prog_data->no_8 = false;
3740 }
3741
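/* generate_assembly() is assumed to emit the SIMD8 program first (when
 * simd8_cfg is non-NULL) and append the SIMD16 program after it, recording
 * the latter's start in prog_data->prog_offset_16 so state setup can enable
 * both dispatch modes from a single kernel upload.
 */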
3742 const unsigned *assembly = NULL;
3743 fs_generator g(brw, mem_ctx, key, prog_data, prog, fp,
3744 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3745 assembly = g.generate_assembly(simd8_cfg, simd16_cfg,
3746 final_assembly_size);
3747
3748 if (unlikely(brw->perf_debug) && shader) {
3749 if (shader->compiled_once)
3750 brw_wm_debug_recompile(brw, prog, key);
3751 shader->compiled_once = true;
3752
3753 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3754 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3755 (get_time() - start_time) * 1000);
3756 }
3757 }
3758
3759 return assembly;
3760 }
3761
3762 bool
3763 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3764 {
3765 struct brw_context *brw = brw_context(ctx);
3766 struct brw_wm_prog_key key;
3767
3768 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3769 return true;
3770
3771 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3772 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3773 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3774 bool program_uses_dfdy = fp->UsesDFdy;
3775
3776 memset(&key, 0, sizeof(key));
3777
3778 if (brw->gen < 6) {
3779 if (fp->UsesKill)
3780 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3781
3782 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3783 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3784
3785 /* Just assume depth testing. */
3786 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3787 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3788 }
3789
3790 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3791 BRW_FS_VARYING_INPUT_MASK) > 16)
3792 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3793
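/* _mesa_fls() returns one more than the index of the highest set bit, so
 * sampler_count covers every sampler index up to the highest one the
 * program uses.
 */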
3794 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3795 for (unsigned i = 0; i < sampler_count; i++) {
3796 if (fp->Base.ShadowSamplers & (1 << i)) {
3797 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3798 key.tex.swizzles[i] =
3799 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3800 } else {
3801 /* Color sampler: assume no swizzling. */
3802 key.tex.swizzles[i] = SWIZZLE_XYZW;
3803 }
3804 }
3805
3806 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3807 key.drawable_height = ctx->DrawBuffer->Height;
3808 }
3809
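/* Count only the color outputs actually written; depth and sample-mask
 * writes do not occupy a color region.
 */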
3810 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3811 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3812 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3813
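/* render_to_fbo affects the Y orientation the compiled shader assumes for
 * gl_FragCoord and dFdy(): window-system buffers are rendered y-flipped
 * while user FBOs are not. At precompile time we can only guess, so use the
 * currently bound draw buffer, and treat MRT as implying an FBO since only
 * FBOs support multiple color attachments.
 */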
3814 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3815 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3816 key.nr_color_regions > 1;
3817 }
3818
3819 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE, so in
3820 * practice the quality of the derivatives is determined by the driconf
3821 * option.
3822 */
3823 key.high_quality_derivatives = brw->disable_derivative_optimization;
3824
3825 key.program_string_id = bfp->id;
3826
3827 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3828 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3829
3830 bool success = do_wm_prog(brw, prog, bfp, &key);
3831
3832 brw->wm.base.prog_offset = old_prog_offset;
3833 brw->wm.prog_data = old_prog_data;
3834
3835 return success;
3836 }
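/* For reference, this precompile path is assumed to be reached from the
 * driver's link-time shader hook, roughly:
 *
 *    bool
 *    brw_shader_precompile(struct gl_context *ctx,
 *                          struct gl_shader_program *prog)
 *    {
 *       ...
 *       if (!brw_fs_precompile(ctx, prog))
 *          return false;
 *       ...
 *    }
 *
 * which is why the key is built from guesses about likely draw-time state
 * rather than from actual state.
 */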