i965: Make precompile functions accessible from C.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_cfg.h"
50 #include "brw_dead_control_flow.h"
51 #include "main/uniforms.h"
52 #include "brw_fs_live_variables.h"
53 #include "glsl/glsl_types.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 fs_reg *src, int sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->opcode = opcode;
62 this->dst = dst;
63 this->src = src;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (int i = 0; i < sources; ++i) {
79 if (src[i].file != GRF)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (int i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 assert(this->src[i].width > 0);
101 if (this->src[i].width == 1) {
102 this->src[i].effective_width = this->exec_size;
103 } else {
104 this->src[i].effective_width = this->src[i].width;
105 }
106 break;
107 case IMM:
108 case UNIFORM:
109 this->src[i].effective_width = this->exec_size;
110 break;
111 default:
112 unreachable("Invalid source register file");
113 }
114 }
115 this->dst.effective_width = this->exec_size;
116
117 this->conditional_mod = BRW_CONDITIONAL_NONE;
118
119 /* This will be the case for almost all instructions. */
120 switch (dst.file) {
121 case GRF:
122 case HW_REG:
123 case MRF:
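/* width * stride * type size is the byte footprint of the write; a GRF is
 * 32 bytes, so round up to whole registers.
 */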
124 this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
125 break;
126 case BAD_FILE:
127 this->regs_written = 0;
128 break;
129 case IMM:
130 case UNIFORM:
131 unreachable("Invalid destination register file");
132 default:
133 unreachable("Invalid register file");
134 }
135
136 this->writes_accumulator = false;
137 }
138
139 fs_inst::fs_inst()
140 {
141 fs_reg *src = ralloc_array(this, fs_reg, 3);
142 init(BRW_OPCODE_NOP, 8, dst, src, 0);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
146 {
147 fs_reg *src = ralloc_array(this, fs_reg, 3);
148 init(opcode, exec_size, reg_undef, src, 0);
149 }
150
151 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
152 {
153 fs_reg *src = ralloc_array(this, fs_reg, 3);
154 init(opcode, 0, dst, src, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 fs_reg *src = ralloc_array(this, fs_reg, 3);
161 src[0] = src0;
162 init(opcode, exec_size, dst, src, 1);
163 }
164
165 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
166 {
167 fs_reg *src = ralloc_array(this, fs_reg, 3);
168 src[0] = src0;
169 init(opcode, 0, dst, src, 1);
170 }
171
172 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
173 const fs_reg &src0, const fs_reg &src1)
174 {
175 fs_reg *src = ralloc_array(this, fs_reg, 3);
176 src[0] = src0;
177 src[1] = src1;
178 init(opcode, exec_size, dst, src, 2);
179 }
180
181 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
182 const fs_reg &src1)
183 {
184 fs_reg *src = ralloc_array(this, fs_reg, 3);
185 src[0] = src0;
186 src[1] = src1;
187 init(opcode, 0, dst, src, 2);
188 }
189
190 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
191 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
192 {
193 fs_reg *src = ralloc_array(this, fs_reg, 3);
194 src[0] = src0;
195 src[1] = src1;
196 src[2] = src2;
197 init(opcode, exec_size, dst, src, 3);
198 }
199
200 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
201 const fs_reg &src1, const fs_reg &src2)
202 {
203 fs_reg *src = ralloc_array(this, fs_reg, 3);
204 src[0] = src0;
205 src[1] = src1;
206 src[2] = src2;
207 init(opcode, 0, dst, src, 3);
208 }
209
210 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
211 {
212 init(opcode, 0, dst, src, sources);
213 }
214
215 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
216 fs_reg src[], int sources)
217 {
218 init(opcode, exec_width, dst, src, sources);
219 }
220
221 fs_inst::fs_inst(const fs_inst &that)
222 {
223 memcpy(this, &that, sizeof(that));
224
225 this->src = ralloc_array(this, fs_reg, that.sources);
226
227 for (int i = 0; i < that.sources; i++)
228 this->src[i] = that.src[i];
229 }
230
231 void
232 fs_inst::resize_sources(uint8_t num_sources)
233 {
234 if (this->sources != num_sources) {
235 this->src = reralloc(this, this->src, fs_reg, num_sources);
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 * gen5 does the comparison on the execution type (resolved source types),
341 * so dst type doesn't matter. gen6 does comparison and then uses the
342 * result as if it was the dst type with no conversion, which happens to
343 * mostly work out for float-interpreted-as-int since our comparisons are
344 * for >0, =0, <0.
345 */
346 if (brw->gen == 4) {
347 dst.type = src0.type;
348 if (dst.file == HW_REG)
349 dst.fixed_hw_reg.type = dst.type;
350 }
351
352 resolve_ud_negate(&src0);
353 resolve_ud_negate(&src1);
354
355 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
356 inst->conditional_mod = condition;
357
358 return inst;
359 }
360
361 fs_inst *
362 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
363 {
364 uint8_t exec_size = dst.width;
365 for (int i = 0; i < sources; ++i) {
366 assert(src[i].width % dst.width == 0);
367 if (src[i].width > exec_size)
368 exec_size = src[i].width;
369 }
370
371 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
372 dst, src, sources);
373 inst->regs_written = 0;
374 for (int i = 0; i < sources; ++i) {
375 /* The LOAD_PAYLOAD instruction only really makes sense if we are
376 * dealing with whole registers. If this ever changes, we can deal
377 * with it later.
378 */
379 int size = src[i].effective_width * type_sz(src[i].type);
380 assert(size % 32 == 0);
381 inst->regs_written += (size + 31) / 32;
382 }
383
384 return inst;
385 }
386
387 exec_list
388 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
389 const fs_reg &surf_index,
390 const fs_reg &varying_offset,
391 uint32_t const_offset)
392 {
393 exec_list instructions;
394 fs_inst *inst;
395
396 /* We have our constant surface use a pitch of 4 bytes, so our index can
397 * be any component of a vector, and then we load 4 contiguous
398 * components starting from that.
399 *
400 * We break down the const_offset to a portion added to the variable
401 * offset and a portion done using reg_offset, which means that if you
402 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
403 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
404 * CSE can later notice that those loads are all the same and eliminate
405 * the redundant ones.
406 */
407 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
408 instructions.push_tail(ADD(vec4_offset,
409 varying_offset, fs_reg(const_offset & ~3)));
410
411 int scale = 1;
412 if (brw->gen == 4 && dst.width == 8) {
413 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
414 * u, v, r) as parameters, or we can just use the SIMD16 message
415 * consisting of (header, u). We choose the second, at the cost of a
416 * longer return length.
417 */
418 scale = 2;
419 }
420
421 enum opcode op;
422 if (brw->gen >= 7)
423 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
424 else
425 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
426
427 assert(dst.width % 8 == 0);
428 int regs_written = 4 * (dst.width / 8) * scale;
429 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
430 dst.type, dst.width);
431 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
432 inst->regs_written = regs_written;
433 instructions.push_tail(inst);
434
435 if (brw->gen < 7) {
436 inst->base_mrf = 13;
437 inst->header_present = true;
438 if (brw->gen == 4)
439 inst->mlen = 3;
440 else
441 inst->mlen = 1 + dispatch_width / 8;
442 }
443
444 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
445 instructions.push_tail(MOV(dst, result));
446
447 return instructions;
448 }
449
450 /**
451 * A helper for MOV generation for fixing up broken hardware SEND dependency
452 * handling.
453 */
454 fs_inst *
455 fs_visitor::DEP_RESOLVE_MOV(int grf)
456 {
457 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
458
459 inst->ir = NULL;
460 inst->annotation = "send dependency resolve";
461
462 /* The caller always wants an uncompressed instruction, to emit the minimal
463 * extra dependencies and to avoid having to align its regs to 2.
464 */
465 inst->exec_size = 8;
466
467 return inst;
468 }
469
470 bool
471 fs_inst::equals(fs_inst *inst) const
472 {
473 return (opcode == inst->opcode &&
474 dst.equals(inst->dst) &&
475 src[0].equals(inst->src[0]) &&
476 src[1].equals(inst->src[1]) &&
477 src[2].equals(inst->src[2]) &&
478 saturate == inst->saturate &&
479 predicate == inst->predicate &&
480 conditional_mod == inst->conditional_mod &&
481 mlen == inst->mlen &&
482 base_mrf == inst->base_mrf &&
483 target == inst->target &&
484 eot == inst->eot &&
485 header_present == inst->header_present &&
486 shadow_compare == inst->shadow_compare &&
487 exec_size == inst->exec_size &&
488 offset == inst->offset);
489 }
490
491 bool
492 fs_inst::overwrites_reg(const fs_reg &reg) const
493 {
494 return (reg.file == dst.file &&
495 reg.reg == dst.reg &&
496 reg.reg_offset >= dst.reg_offset &&
497 reg.reg_offset < dst.reg_offset + regs_written);
498 }
499
500 bool
501 fs_inst::is_send_from_grf() const
502 {
503 switch (opcode) {
504 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
505 case SHADER_OPCODE_SHADER_TIME_ADD:
506 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
507 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
508 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
509 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
510 case SHADER_OPCODE_UNTYPED_ATOMIC:
511 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
512 return true;
513 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
514 return src[1].file == GRF;
515 case FS_OPCODE_FB_WRITE:
516 return src[0].file == GRF;
517 default:
518 if (is_tex())
519 return src[0].file == GRF;
520
521 return false;
522 }
523 }
524
525 bool
526 fs_inst::can_do_source_mods(struct brw_context *brw)
527 {
528 if (brw->gen == 6 && is_math())
529 return false;
530
531 if (is_send_from_grf())
532 return false;
533
534 if (!backend_instruction::can_do_source_mods())
535 return false;
536
537 return true;
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Fixed brw_reg. */
585 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
586 {
587 init();
588 this->file = HW_REG;
589 this->fixed_hw_reg = fixed_hw_reg;
590 this->type = fixed_hw_reg.type;
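/* brw_reg stores its width as a log2 encoding (BRW_WIDTH_*), so decode it
 * back to a channel count.
 */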
591 this->width = 1 << fixed_hw_reg.width;
592 }
593
594 bool
595 fs_reg::equals(const fs_reg &r) const
596 {
597 return (file == r.file &&
598 reg == r.reg &&
599 reg_offset == r.reg_offset &&
600 subreg_offset == r.subreg_offset &&
601 type == r.type &&
602 negate == r.negate &&
603 abs == r.abs &&
604 !reladdr && !r.reladdr &&
605 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
606 width == r.width &&
607 stride == r.stride);
608 }
609
610 fs_reg &
611 fs_reg::set_smear(unsigned subreg)
612 {
613 assert(file != HW_REG && file != IMM);
614 subreg_offset = subreg * type_sz(type);
615 stride = 0;
616 return *this;
617 }
618
619 bool
620 fs_reg::is_contiguous() const
621 {
622 return stride == 1;
623 }
624
625 int
626 fs_visitor::type_size(const struct glsl_type *type)
627 {
628 unsigned int size, i;
629
630 switch (type->base_type) {
631 case GLSL_TYPE_UINT:
632 case GLSL_TYPE_INT:
633 case GLSL_TYPE_FLOAT:
634 case GLSL_TYPE_BOOL:
635 return type->components();
636 case GLSL_TYPE_ARRAY:
637 return type_size(type->fields.array) * type->length;
638 case GLSL_TYPE_STRUCT:
639 size = 0;
640 for (i = 0; i < type->length; i++) {
641 size += type_size(type->fields.structure[i].type);
642 }
643 return size;
644 case GLSL_TYPE_SAMPLER:
645 /* Samplers take up no register space, since they're baked in at
646 * link time.
647 */
648 return 0;
649 case GLSL_TYPE_ATOMIC_UINT:
650 return 0;
651 case GLSL_TYPE_IMAGE:
652 case GLSL_TYPE_VOID:
653 case GLSL_TYPE_ERROR:
654 case GLSL_TYPE_INTERFACE:
655 unreachable("not reached");
656 }
657
658 return 0;
659 }
660
661 fs_reg
662 fs_visitor::get_timestamp()
663 {
664 assert(brw->gen >= 7);
665
666 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
667 BRW_ARF_TIMESTAMP,
668 0),
669 BRW_REGISTER_TYPE_UD));
670
671 fs_reg dst = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 4);
672
673 fs_inst *mov = emit(MOV(dst, ts));
674 /* We want to read the 3 fields we care about even if the channel isn't
675 * enabled in the dispatch.
676 */
677 mov->force_writemask_all = true;
678
679 /* The caller wants the low 32 bits of the timestamp. Since it's running
680 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
681 * which is plenty of time for our purposes. It is identical across the
682 * EUs, but since it's tracking GPU core speed it will increment at a
683 * varying rate as render P-states change.
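 * (At ~1.2 GHz, 2^32 cycles is roughly 3.6 seconds, hence the ~3 second figure.)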
684 *
685 * The caller could also check if render P-states have changed (or anything
686 * else that might disrupt timing) by setting smear to 2 and checking if
687 * that field is != 0.
688 */
689 dst.set_smear(0);
690
691 return dst;
692 }
693
694 void
695 fs_visitor::emit_shader_time_begin()
696 {
697 current_annotation = "shader time start";
698 shader_start_time = get_timestamp();
699 }
700
701 void
702 fs_visitor::emit_shader_time_end()
703 {
704 current_annotation = "shader time end";
705
706 enum shader_time_shader_type type, written_type, reset_type;
707 if (dispatch_width == 8) {
708 type = ST_FS8;
709 written_type = ST_FS8_WRITTEN;
710 reset_type = ST_FS8_RESET;
711 } else {
712 assert(dispatch_width == 16);
713 type = ST_FS16;
714 written_type = ST_FS16_WRITTEN;
715 reset_type = ST_FS16_RESET;
716 }
717
718 fs_reg shader_end_time = get_timestamp();
719
720 /* Check that there weren't any timestamp reset events (assuming these
721 * were the only two timestamp reads that happened).
722 */
723 fs_reg reset = shader_end_time;
724 reset.set_smear(2);
725 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
726 test->conditional_mod = BRW_CONDITIONAL_Z;
727 emit(IF(BRW_PREDICATE_NORMAL));
728
729 fs_reg start = shader_start_time;
730 start.negate = true;
731 fs_reg diff = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 1);
732 emit(ADD(diff, start, shader_end_time));
733
734 /* If there were no instructions between the two timestamp gets, the diff
735 * is 2 cycles. Remove that overhead so that we can ignore it when trying
736 * to determine the time taken by individual instructions.
737 */
738 emit(ADD(diff, diff, fs_reg(-2u)));
739
740 emit_shader_time_write(type, diff);
741 emit_shader_time_write(written_type, fs_reg(1u));
742 emit(BRW_OPCODE_ELSE);
743 emit_shader_time_write(reset_type, fs_reg(1u));
744 emit(BRW_OPCODE_ENDIF);
745 }
746
747 void
748 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
749 fs_reg value)
750 {
751 int shader_time_index =
752 brw_get_shader_time_index(brw, shader_prog, prog, type);
753 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
754
755 fs_reg payload;
756 if (dispatch_width == 8)
757 payload = fs_reg(this, glsl_type::uvec2_type);
758 else
759 payload = fs_reg(this, glsl_type::uint_type);
760
761 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
762 fs_reg(), payload, offset, value));
763 }
764
765 void
766 fs_visitor::vfail(const char *format, va_list va)
767 {
768 char *msg;
769
770 if (failed)
771 return;
772
773 failed = true;
774
775 msg = ralloc_vasprintf(mem_ctx, format, va);
776 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
777
778 this->fail_msg = msg;
779
780 if (INTEL_DEBUG & DEBUG_WM) {
781 fprintf(stderr, "%s", msg);
782 }
783 }
784
785 void
786 fs_visitor::fail(const char *format, ...)
787 {
788 va_list va;
789
790 va_start(va, format);
791 vfail(format, va);
792 va_end(va);
793 }
794
795 /**
796 * Mark this program as impossible to compile in SIMD16 mode.
797 *
798 * During the SIMD8 compile (which happens first), we can detect and flag
799 * things that are unsupported in SIMD16 mode, so the compiler can skip
800 * the SIMD16 compile altogether.
801 *
802 * During a SIMD16 compile (if one happens anyway), this just calls fail().
803 */
804 void
805 fs_visitor::no16(const char *format, ...)
806 {
807 va_list va;
808
809 va_start(va, format);
810
811 if (dispatch_width == 16) {
812 vfail(format, va);
813 } else {
814 simd16_unsupported = true;
815
816 if (brw->perf_debug) {
817 if (no16_msg)
818 ralloc_vasprintf_append(&no16_msg, format, va);
819 else
820 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
821 }
822 }
823
824 va_end(va);
825 }
826
827 fs_inst *
828 fs_visitor::emit(enum opcode opcode)
829 {
830 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
831 }
832
833 fs_inst *
834 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
835 {
836 return emit(new(mem_ctx) fs_inst(opcode, dst));
837 }
838
839 fs_inst *
840 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
841 {
842 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
843 }
844
845 fs_inst *
846 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
847 const fs_reg &src1)
848 {
849 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
850 }
851
852 fs_inst *
853 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
854 const fs_reg &src1, const fs_reg &src2)
855 {
856 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
857 }
858
859 fs_inst *
860 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
861 fs_reg src[], int sources)
862 {
863 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
864 }
865
866 /**
867 * Returns true if the instruction has a flag that means it won't
868 * update an entire destination register.
869 *
870 * For example, dead code elimination and live variable analysis want to know
871 * when a write to a variable screens off any preceding values that were in
872 * it.
873 */
874 bool
875 fs_inst::is_partial_write() const
876 {
877 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
878 (this->dst.width * type_sz(this->dst.type)) < 32 ||
879 !this->dst.is_contiguous());
880 }
881
882 int
883 fs_inst::regs_read(fs_visitor *v, int arg) const
884 {
885 if (is_tex() && arg == 0 && src[0].file == GRF) {
886 return mlen;
887 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
888 return mlen;
889 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
890 return mlen;
891 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
892 return mlen;
893 }
894
895 switch (src[arg].file) {
896 case BAD_FILE:
897 case UNIFORM:
898 case IMM:
899 return 1;
900 case GRF:
901 case HW_REG:
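/* A stride of 0 is a scalar replicated across the channels, so only a
 * single register is read.
 */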
902 if (src[arg].stride == 0) {
903 return 1;
904 } else {
905 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
906 return (size + 31) / 32;
907 }
908 case MRF:
909 unreachable("MRF registers are not allowed as sources");
910 default:
911 unreachable("Invalid register file");
912 }
913 }
914
915 bool
916 fs_inst::reads_flag() const
917 {
918 return predicate;
919 }
920
921 bool
922 fs_inst::writes_flag() const
923 {
924 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
925 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
926 }
927
928 /**
929 * Returns how many MRFs an FS opcode will write over.
930 *
931 * Note that this is not the 0 or 1 implied writes in an actual gen
932 * instruction -- the FS opcodes often generate MOVs in addition.
933 */
934 int
935 fs_visitor::implied_mrf_writes(fs_inst *inst)
936 {
937 if (inst->mlen == 0)
938 return 0;
939
940 if (inst->base_mrf == -1)
941 return 0;
942
943 switch (inst->opcode) {
944 case SHADER_OPCODE_RCP:
945 case SHADER_OPCODE_RSQ:
946 case SHADER_OPCODE_SQRT:
947 case SHADER_OPCODE_EXP2:
948 case SHADER_OPCODE_LOG2:
949 case SHADER_OPCODE_SIN:
950 case SHADER_OPCODE_COS:
951 return 1 * dispatch_width / 8;
952 case SHADER_OPCODE_POW:
953 case SHADER_OPCODE_INT_QUOTIENT:
954 case SHADER_OPCODE_INT_REMAINDER:
955 return 2 * dispatch_width / 8;
956 case SHADER_OPCODE_TEX:
957 case FS_OPCODE_TXB:
958 case SHADER_OPCODE_TXD:
959 case SHADER_OPCODE_TXF:
960 case SHADER_OPCODE_TXF_CMS:
961 case SHADER_OPCODE_TXF_MCS:
962 case SHADER_OPCODE_TG4:
963 case SHADER_OPCODE_TG4_OFFSET:
964 case SHADER_OPCODE_TXL:
965 case SHADER_OPCODE_TXS:
966 case SHADER_OPCODE_LOD:
967 return 1;
968 case FS_OPCODE_FB_WRITE:
969 return 2;
970 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
971 case SHADER_OPCODE_GEN4_SCRATCH_READ:
972 return 1;
973 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
974 return inst->mlen;
975 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
976 return 2;
977 case SHADER_OPCODE_UNTYPED_ATOMIC:
978 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
979 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
980 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
981 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
982 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
983 return 0;
984 default:
985 unreachable("not reached");
986 }
987 }
988
989 int
990 fs_visitor::virtual_grf_alloc(int size)
991 {
992 if (virtual_grf_array_size <= virtual_grf_count) {
993 if (virtual_grf_array_size == 0)
994 virtual_grf_array_size = 16;
995 else
996 virtual_grf_array_size *= 2;
997 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
998 virtual_grf_array_size);
999 }
1000 virtual_grf_sizes[virtual_grf_count] = size;
1001 return virtual_grf_count++;
1002 }
1003
1004 /** Register constructor specifying the file and register number. */
1005 fs_reg::fs_reg(enum register_file file, int reg)
1006 {
1007 init();
1008 this->file = file;
1009 this->reg = reg;
1010 this->type = BRW_REGISTER_TYPE_F;
1011
1012 switch (file) {
1013 case UNIFORM:
1014 this->width = 1;
1015 break;
1016 default:
1017 this->width = 8;
1018 }
1019 }
1020
1021 /** Register constructor specifying the file, register number and type. */
1022 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1023 {
1024 init();
1025 this->file = file;
1026 this->reg = reg;
1027 this->type = type;
1028
1029 switch (file) {
1030 case UNIFORM:
1031 this->width = 1;
1032 break;
1033 default:
1034 this->width = 8;
1035 }
1036 }
1037
1038 /** Register constructor specifying the file, register number, type and width. */
1039 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1040 uint8_t width)
1041 {
1042 init();
1043 this->file = file;
1044 this->reg = reg;
1045 this->type = type;
1046 this->width = width;
1047 }
1048
1049 /** Automatic reg constructor. */
1050 fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type)
1051 {
1052 init();
1053 int reg_width = v->dispatch_width / 8;
1054
1055 this->file = GRF;
1056 this->reg = v->virtual_grf_alloc(v->type_size(type) * reg_width);
1057 this->reg_offset = 0;
1058 this->type = brw_type_for_base_type(type);
1059 this->width = v->dispatch_width;
1060 assert(this->width == 8 || this->width == 16);
1061 }
1062
1063 fs_reg *
1064 fs_visitor::variable_storage(ir_variable *var)
1065 {
1066 return (fs_reg *)hash_table_find(this->variable_ht, var);
1067 }
1068
1069 void
1070 import_uniforms_callback(const void *key,
1071 void *data,
1072 void *closure)
1073 {
1074 struct hash_table *dst_ht = (struct hash_table *)closure;
1075 const fs_reg *reg = (const fs_reg *)data;
1076
1077 if (reg->file != UNIFORM)
1078 return;
1079
1080 hash_table_insert(dst_ht, data, key);
1081 }
1082
1083 /* For SIMD16, we need to follow the uniform setup done for the SIMD8
1084 * dispatch. This brings in those uniform definitions.
1085 */
1086 void
1087 fs_visitor::import_uniforms(fs_visitor *v)
1088 {
1089 hash_table_call_foreach(v->variable_ht,
1090 import_uniforms_callback,
1091 variable_ht);
1092 this->push_constant_loc = v->push_constant_loc;
1093 this->pull_constant_loc = v->pull_constant_loc;
1094 this->uniforms = v->uniforms;
1095 this->param_size = v->param_size;
1096 }
1097
1098 /* Our support for uniforms is piggy-backed on the struct
1099 * gl_fragment_program, because that's where the values actually
1100 * get stored, rather than in some global gl_shader_program uniform
1101 * store.
1102 */
1103 void
1104 fs_visitor::setup_uniform_values(ir_variable *ir)
1105 {
1106 int namelen = strlen(ir->name);
1107
1108 /* The data for our (non-builtin) uniforms is stored in a series of
1109 * gl_uniform_driver_storage structs for each subcomponent that
1110 * glGetUniformLocation() could name. We know it's been set up in the same
1111 * order we'd walk the type, so walk the list of storage and find anything
1112 * with our name, or the prefix of a component that starts with our name.
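 * For example, "light" matches storage named "light", "light.color" or
 * "light[2]", but not "lightmap".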
1113 */
1114 unsigned params_before = uniforms;
1115 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1116 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1117
1118 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1119 (storage->name[namelen] != 0 &&
1120 storage->name[namelen] != '.' &&
1121 storage->name[namelen] != '[')) {
1122 continue;
1123 }
1124
1125 unsigned slots = storage->type->component_slots();
1126 if (storage->array_elements)
1127 slots *= storage->array_elements;
1128
1129 for (unsigned i = 0; i < slots; i++) {
1130 stage_prog_data->param[uniforms++] = &storage->storage[i];
1131 }
1132 }
1133
1134 /* Make sure we actually initialized the right amount of stuff here. */
1135 assert(params_before + ir->type->component_slots() == uniforms);
1136 (void)params_before;
1137 }
1138
1139
1140 /* Our support for builtin uniforms is even scarier than non-builtin.
1141 * It sits on top of the PROG_STATE_VAR parameters that are
1142 * automatically updated from GL context state.
1143 */
1144 void
1145 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1146 {
1147 const ir_state_slot *const slots = ir->get_state_slots();
1148 assert(slots != NULL);
1149
1150 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1151 /* This state reference has already been setup by ir_to_mesa, but we'll
1152 * get the same index back here.
1153 */
1154 int index = _mesa_add_state_reference(this->prog->Parameters,
1155 (gl_state_index *)slots[i].tokens);
1156
1157 /* Add each of the unique swizzles of the element as a parameter.
1158 * This'll end up matching the expected layout of the
1159 * array/matrix/structure we're trying to fill in.
1160 */
1161 int last_swiz = -1;
1162 for (unsigned int j = 0; j < 4; j++) {
1163 int swiz = GET_SWZ(slots[i].swizzle, j);
1164 if (swiz == last_swiz)
1165 break;
1166 last_swiz = swiz;
1167
1168 stage_prog_data->param[uniforms++] =
1169 &prog->Parameters->ParameterValues[index][swiz];
1170 }
1171 }
1172 }
1173
1174 fs_reg *
1175 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1176 {
1177 assert(stage == MESA_SHADER_FRAGMENT);
1178 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1179 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1180 fs_reg wpos = *reg;
1181 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1182
1183 /* gl_FragCoord.x */
1184 if (ir->data.pixel_center_integer) {
1185 emit(MOV(wpos, this->pixel_x));
1186 } else {
1187 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1188 }
1189 wpos = offset(wpos, 1);
1190
1191 /* gl_FragCoord.y */
1192 if (!flip && ir->data.pixel_center_integer) {
1193 emit(MOV(wpos, this->pixel_y));
1194 } else {
1195 fs_reg pixel_y = this->pixel_y;
1196 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1197
1198 if (flip) {
1199 pixel_y.negate = true;
1200 offset += key->drawable_height - 1.0;
1201 }
1202
1203 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1204 }
1205 wpos = offset(wpos, 1);
1206
1207 /* gl_FragCoord.z */
1208 if (brw->gen >= 6) {
1209 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1210 } else {
1211 emit(FS_OPCODE_LINTERP, wpos,
1212 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1213 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1214 interp_reg(VARYING_SLOT_POS, 2));
1215 }
1216 wpos = offset(wpos, 1);
1217
1218 /* gl_FragCoord.w: Already set up in emit_interpolation */
1219 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1220
1221 return reg;
1222 }
1223
1224 fs_inst *
1225 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1226 glsl_interp_qualifier interpolation_mode,
1227 bool is_centroid, bool is_sample)
1228 {
1229 brw_wm_barycentric_interp_mode barycoord_mode;
1230 if (brw->gen >= 6) {
1231 if (is_centroid) {
1232 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1233 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1234 else
1235 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1236 } else if (is_sample) {
1237 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1238 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1239 else
1240 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1241 } else {
1242 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1243 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1244 else
1245 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1246 }
1247 } else {
1248 /* On Ironlake and below, there is only one interpolation mode.
1249 * Centroid interpolation doesn't mean anything on this hardware --
1250 * there is no multisampling.
1251 */
1252 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1253 }
1254 return emit(FS_OPCODE_LINTERP, attr,
1255 this->delta_x[barycoord_mode],
1256 this->delta_y[barycoord_mode], interp);
1257 }
1258
1259 fs_reg *
1260 fs_visitor::emit_general_interpolation(ir_variable *ir)
1261 {
1262 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1263 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1264 fs_reg attr = *reg;
1265
1266 assert(stage == MESA_SHADER_FRAGMENT);
1267 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1268 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1269
1270 unsigned int array_elements;
1271 const glsl_type *type;
1272
1273 if (ir->type->is_array()) {
1274 array_elements = ir->type->length;
1275 if (array_elements == 0) {
1276 fail("dereferenced array '%s' has length 0\n", ir->name);
1277 }
1278 type = ir->type->fields.array;
1279 } else {
1280 array_elements = 1;
1281 type = ir->type;
1282 }
1283
1284 glsl_interp_qualifier interpolation_mode =
1285 ir->determine_interpolation_mode(key->flat_shade);
1286
1287 int location = ir->data.location;
1288 for (unsigned int i = 0; i < array_elements; i++) {
1289 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1290 if (prog_data->urb_setup[location] == -1) {
1291 /* If there's no incoming setup data for this slot, don't
1292 * emit interpolation for it.
1293 */
1294 attr = offset(attr, type->vector_elements);
1295 location++;
1296 continue;
1297 }
1298
1299 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1300 /* Constant interpolation (flat shading) case. The SF has
1301 * handed us defined values in only the constant offset
1302 * field of the setup reg.
1303 */
1304 for (unsigned int k = 0; k < type->vector_elements; k++) {
1305 struct brw_reg interp = interp_reg(location, k);
1306 interp = suboffset(interp, 3);
1307 interp.type = reg->type;
1308 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1309 attr = offset(attr, 1);
1310 }
1311 } else {
1312 /* Smooth/noperspective interpolation case. */
1313 for (unsigned int k = 0; k < type->vector_elements; k++) {
1314 struct brw_reg interp = interp_reg(location, k);
1315 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1316 /* Get the pixel/sample mask into f0 so that we know
1317 * which pixels are lit. Then, for each channel that is
1318 * unlit, replace the centroid data with non-centroid
1319 * data.
1320 */
1321 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1322
1323 fs_inst *inst;
1324 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1325 false, false);
1326 inst->predicate = BRW_PREDICATE_NORMAL;
1327 inst->predicate_inverse = true;
1328 if (brw->has_pln)
1329 inst->no_dd_clear = true;
1330
1331 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1332 ir->data.centroid && !key->persample_shading,
1333 ir->data.sample || key->persample_shading);
1334 inst->predicate = BRW_PREDICATE_NORMAL;
1335 inst->predicate_inverse = false;
1336 if (brw->has_pln)
1337 inst->no_dd_check = true;
1338
1339 } else {
1340 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1341 ir->data.centroid && !key->persample_shading,
1342 ir->data.sample || key->persample_shading);
1343 }
1344 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1345 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1346 }
1347 attr = offset(attr, 1);
1348 }
1349
1350 }
1351 location++;
1352 }
1353 }
1354
1355 return reg;
1356 }
1357
1358 fs_reg *
1359 fs_visitor::emit_frontfacing_interpolation()
1360 {
1361 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::bool_type);
1362
1363 if (brw->gen >= 6) {
1364 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1365 * a boolean result from this (~0/true or 0/false).
1366 *
1367 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1368 * this task in only one instruction:
1369 * - a negation source modifier will flip the bit; and
1370 * - a W -> D type conversion will sign extend the bit into the high
1371 * word of the destination.
1372 *
1373 * An ASR 15 fills the low word of the destination.
1374 */
1375 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1376 g0.negate = true;
1377
1378 emit(ASR(*reg, g0, fs_reg(15)));
1379 } else {
1380 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1381 * a boolean result from this (1/true or 0/false).
1382 *
1383 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1384 * the negation source modifier to flip it. Unfortunately the SHR
1385 * instruction only operates on UD (or D with an abs source modifier)
1386 * sources without negation.
1387 *
1388 * Instead, use ASR (which will give ~0/true or 0/false) followed by an
1389 * AND 1.
1390 */
1391 fs_reg asr = fs_reg(this, glsl_type::bool_type);
1392 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1393 g1_6.negate = true;
1394
1395 emit(ASR(asr, g1_6, fs_reg(31)));
1396 emit(AND(*reg, asr, fs_reg(1)));
1397 }
1398
1399 return reg;
1400 }
1401
1402 void
1403 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1404 {
1405 assert(stage == MESA_SHADER_FRAGMENT);
1406 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1407 assert(dst.type == BRW_REGISTER_TYPE_F);
1408
1409 if (key->compute_pos_offset) {
1410 /* Convert int_sample_pos to floating point */
1411 emit(MOV(dst, int_sample_pos));
1412 /* Scale to the range [0, 1] */
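/* (The payload encodes each offset in 1/16ths of a pixel, hence the 1/16 factor.) */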
1413 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1414 }
1415 else {
1416 /* From ARB_sample_shading specification:
1417 * "When rendering to a non-multisample buffer, or if multisample
1418 * rasterization is disabled, gl_SamplePosition will always be
1419 * (0.5, 0.5)."
1420 */
1421 emit(MOV(dst, fs_reg(0.5f)));
1422 }
1423 }
1424
1425 fs_reg *
1426 fs_visitor::emit_samplepos_setup()
1427 {
1428 assert(brw->gen >= 6);
1429
1430 this->current_annotation = "compute sample position";
1431 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec2_type);
1432 fs_reg pos = *reg;
1433 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1434 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1435
1436 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1437 * mode will be enabled.
1438 *
1439 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1440 * R31.1:0 Position Offset X/Y for Slot[3:0]
1441 * R31.3:2 Position Offset X/Y for Slot[7:4]
1442 * .....
1443 *
1444 * The X, Y sample positions come in as bytes in thread payload. So, read
1445 * the positions using vstride=16, width=8, hstride=2.
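 * That region walks every other byte: the X offsets land at bytes 0, 2, ..., 14
 * and, via suboffset 1, the Y offsets at bytes 1, 3, ..., 15.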
1446 */
1447 struct brw_reg sample_pos_reg =
1448 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1449 BRW_REGISTER_TYPE_B), 16, 8, 2);
1450
1451 if (dispatch_width == 8) {
1452 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1453 } else {
1454 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1455 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1456 ->force_sechalf = true;
1457 }
1458 /* Compute gl_SamplePosition.x */
1459 compute_sample_position(pos, int_sample_x);
1460 pos = offset(pos, 1);
1461 if (dispatch_width == 8) {
1462 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1463 } else {
1464 emit(MOV(half(int_sample_y, 0),
1465 fs_reg(suboffset(sample_pos_reg, 1))));
1466 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1467 ->force_sechalf = true;
1468 }
1469 /* Compute gl_SamplePosition.y */
1470 compute_sample_position(pos, int_sample_y);
1471 return reg;
1472 }
1473
1474 fs_reg *
1475 fs_visitor::emit_sampleid_setup()
1476 {
1477 assert(stage == MESA_SHADER_FRAGMENT);
1478 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1479 assert(brw->gen >= 6);
1480
1481 this->current_annotation = "compute sample id";
1482 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::int_type);
1483
1484 if (key->compute_sample_id) {
1485 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1486 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1487 t2.type = BRW_REGISTER_TYPE_UW;
1488
1489 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1490 * 8x multisampling, subspan 0 will represent sample N (where N
1491 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1492 * 7. We can find the value of N by looking at R0.0 bits 7:6
1493 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1494 * (since samples are always delivered in pairs). That is, we
1495 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1496 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1497 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1498 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1499 * populating a temporary variable with the sequence (0, 1, 2, 3),
1500 * and then reading from it using vstride=1, width=4, hstride=0.
1501 * These computations hold good for 4x multisampling as well.
1502 *
1503 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1504 * the first four slots are sample 0 of subspan 0; the next four
1505 * are sample 1 of subspan 0; the third group is sample 0 of
1506 * subspan 1, and finally sample 1 of subspan 1.
1507 */
1508 fs_inst *inst;
1509 inst = emit(BRW_OPCODE_AND, t1,
1510 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1511 fs_reg(0xc0));
1512 inst->force_writemask_all = true;
1513 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1514 inst->force_writemask_all = true;
1515 /* This works for both SIMD8 and SIMD16 */
1516 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1517 inst->force_writemask_all = true;
1518 /* This special instruction takes care of setting vstride=1,
1519 * width=4, hstride=0 of t2 during an ADD instruction.
1520 */
1521 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1522 } else {
1523 /* As per GL_ARB_sample_shading specification:
1524 * "When rendering to a non-multisample buffer, or if multisample
1525 * rasterization is disabled, gl_SampleID will always be zero."
1526 */
1527 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1528 }
1529
1530 return reg;
1531 }
1532
1533 fs_reg
1534 fs_visitor::fix_math_operand(fs_reg src)
1535 {
1536 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1537 * might be able to do better by doing execsize = 1 math and then
1538 * expanding that result out, but we would need to be careful with
1539 * masking.
1540 *
1541 * The hardware ignores source modifiers (negate and abs) on math
1542 * instructions, so we also move to a temp to set those up.
1543 */
1544 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1545 !src.abs && !src.negate)
1546 return src;
1547
1548 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1549 * operands to math
1550 */
1551 if (brw->gen >= 7 && src.file != IMM)
1552 return src;
1553
1554 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1555 expanded.type = src.type;
1556 emit(BRW_OPCODE_MOV, expanded, src);
1557 return expanded;
1558 }
1559
1560 fs_inst *
1561 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1562 {
1563 switch (opcode) {
1564 case SHADER_OPCODE_RCP:
1565 case SHADER_OPCODE_RSQ:
1566 case SHADER_OPCODE_SQRT:
1567 case SHADER_OPCODE_EXP2:
1568 case SHADER_OPCODE_LOG2:
1569 case SHADER_OPCODE_SIN:
1570 case SHADER_OPCODE_COS:
1571 break;
1572 default:
1573 unreachable("not reached: bad math opcode");
1574 }
1575
1576 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1577 * might be able to do better by doing execsize = 1 math and then
1578 * expanding that result out, but we would need to be careful with
1579 * masking.
1580 *
1581 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1582 * instructions, so we also move to a temp to set those up.
1583 */
1584 if (brw->gen == 6 || brw->gen == 7)
1585 src = fix_math_operand(src);
1586
1587 fs_inst *inst = emit(opcode, dst, src);
1588
1589 if (brw->gen < 6) {
1590 inst->base_mrf = 2;
1591 inst->mlen = dispatch_width / 8;
1592 }
1593
1594 return inst;
1595 }
1596
1597 fs_inst *
1598 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1599 {
1600 int base_mrf = 2;
1601 fs_inst *inst;
1602
1603 if (brw->gen >= 8) {
1604 inst = emit(opcode, dst, src0, src1);
1605 } else if (brw->gen >= 6) {
1606 src0 = fix_math_operand(src0);
1607 src1 = fix_math_operand(src1);
1608
1609 inst = emit(opcode, dst, src0, src1);
1610 } else {
1611 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1612 * "Message Payload":
1613 *
1614 * "Operand0[7]. For the INT DIV functions, this operand is the
1615 * denominator."
1616 * ...
1617 * "Operand1[7]. For the INT DIV functions, this operand is the
1618 * numerator."
1619 */
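/* Our src0 is the numerator and src1 the denominator, so swap them for the
 * INT DIV opcodes: the denominator has to land in Operand0 of the message
 * and the numerator in Operand1.
 */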
1620 bool is_int_div = opcode != SHADER_OPCODE_POW;
1621 fs_reg &op0 = is_int_div ? src1 : src0;
1622 fs_reg &op1 = is_int_div ? src0 : src1;
1623
1624 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1625 inst = emit(opcode, dst, op0, reg_null_f);
1626
1627 inst->base_mrf = base_mrf;
1628 inst->mlen = 2 * dispatch_width / 8;
1629 }
1630 return inst;
1631 }
1632
1633 void
1634 fs_visitor::assign_curb_setup()
1635 {
1636 if (dispatch_width == 8) {
1637 prog_data->dispatch_grf_start_reg = payload.num_regs;
1638 } else {
1639 assert(stage == MESA_SHADER_FRAGMENT);
1640 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1641 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1642 }
1643
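/* Push constants are uploaded in whole GRFs of 8 floats each, so round the
 * parameter count up and express the read length in registers.
 */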
1644 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1645
1646 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1647 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1648 for (unsigned int i = 0; i < inst->sources; i++) {
1649 if (inst->src[i].file == UNIFORM) {
1650 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1651 int constant_nr;
1652 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1653 constant_nr = push_constant_loc[uniform_nr];
1654 } else {
1655 /* Section 5.11 of the OpenGL 4.1 spec says:
1656 * "Out-of-bounds reads return undefined values, which include
1657 * values from other variables of the active program or zero."
1658 * Just return the first push constant.
1659 */
1660 constant_nr = 0;
1661 }
1662
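/* Eight 32-bit constants fit in one GRF, so constant_nr / 8 picks the
 * register and constant_nr % 8 the component within it.
 */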
1663 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1664 constant_nr / 8,
1665 constant_nr % 8);
1666
1667 inst->src[i].file = HW_REG;
1668 inst->src[i].fixed_hw_reg = byte_offset(
1669 retype(brw_reg, inst->src[i].type),
1670 inst->src[i].subreg_offset);
1671 }
1672 }
1673 }
1674 }
1675
1676 void
1677 fs_visitor::calculate_urb_setup()
1678 {
1679 assert(stage == MESA_SHADER_FRAGMENT);
1680 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1681 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1682
1683 memset(prog_data->urb_setup, -1,
1684 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1685
1686 int urb_next = 0;
1687 /* Figure out where each of the incoming setup attributes lands. */
1688 if (brw->gen >= 6) {
1689 if (_mesa_bitcount_64(prog->InputsRead &
1690 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1691 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1692 * first 16 varying inputs, so we can put them wherever we want.
1693 * Just put them in order.
1694 *
1695 * This is useful because it means that (a) inputs not used by the
1696 * fragment shader won't take up valuable register space, and (b) we
1697 * won't have to recompile the fragment shader if it gets paired with
1698 * a different vertex (or geometry) shader.
1699 */
1700 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1701 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1702 BITFIELD64_BIT(i)) {
1703 prog_data->urb_setup[i] = urb_next++;
1704 }
1705 }
1706 } else {
1707 /* We have enough input varyings that the SF/SBE pipeline stage can't
1708 * arbitrarily rearrange them to suit our whim; we have to put them
1709 * in an order that matches the output of the previous pipeline stage
1710 * (geometry or vertex shader).
1711 */
1712 struct brw_vue_map prev_stage_vue_map;
1713 brw_compute_vue_map(brw, &prev_stage_vue_map,
1714 key->input_slots_valid);
1715 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1716 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1717 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1718 slot++) {
1719 int varying = prev_stage_vue_map.slot_to_varying[slot];
1720 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1721 * unused.
1722 */
1723 if (varying != BRW_VARYING_SLOT_COUNT &&
1724 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1725 BITFIELD64_BIT(varying))) {
1726 prog_data->urb_setup[varying] = slot - first_slot;
1727 }
1728 }
1729 urb_next = prev_stage_vue_map.num_slots - first_slot;
1730 }
1731 } else {
1732 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1733 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1734 /* Point size is packed into the header, not as a general attribute */
1735 if (i == VARYING_SLOT_PSIZ)
1736 continue;
1737
1738 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1739 /* The back color slot is skipped when the front color is
1740 * also written to. In addition, some slots can be
1741 * written in the vertex shader and not read in the
1742 * fragment shader. So the register number must always be
1743 * incremented, mapped or not.
1744 */
1745 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1746 prog_data->urb_setup[i] = urb_next;
1747 urb_next++;
1748 }
1749 }
1750
1751 /*
1752 * It's an FS-only attribute, and we did the interpolation for this
1753 * attribute in the SF thread. So, count it here, too.
1754 *
1755 * See compile_sf_prog() for more info.
1756 */
1757 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1758 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1759 }
1760
1761 prog_data->num_varying_inputs = urb_next;
1762 }
1763
1764 void
1765 fs_visitor::assign_urb_setup()
1766 {
1767 assert(stage == MESA_SHADER_FRAGMENT);
1768 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1769
1770 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1771
1772 /* Offset all the urb_setup[] index by the actual position of the
1773 * setup regs, now that the location of the constants has been chosen.
1774 */
1775 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1776 if (inst->opcode == FS_OPCODE_LINTERP) {
1777 assert(inst->src[2].file == HW_REG);
1778 inst->src[2].fixed_hw_reg.nr += urb_start;
1779 }
1780
1781 if (inst->opcode == FS_OPCODE_CINTERP) {
1782 assert(inst->src[0].file == HW_REG);
1783 inst->src[0].fixed_hw_reg.nr += urb_start;
1784 }
1785 }
1786
1787 /* Each attribute is 4 setup channels, each of which is half a reg. */
1788 this->first_non_payload_grf =
1789 urb_start + prog_data->num_varying_inputs * 2;
1790 }
1791
1792 /**
1793 * Split large virtual GRFs into separate components if we can.
1794 *
1795 * This is mostly duplicated with what brw_fs_vector_splitting does,
1796 * but that's really conservative because it's afraid of doing
1797 * splitting that doesn't result in real progress after the rest of
1798 * the optimization phases, which would cause infinite looping in
1799 * optimization. We can do it once here, safely. This also has the
1800 * opportunity to split interpolated values, or maybe even uniforms,
1801 * which we don't have at the IR level.
1802 *
1803 * We want to split, because virtual GRFs are what we register
1804 * allocate and spill (due to contiguousness requirements for some
1805 * instructions), and they're what we naturally generate in the
1806 * codegen process, but most virtual GRFs don't actually need to be
1807 * contiguous sets of GRFs. If we split, we'll end up with reduced
1808 * live intervals and better dead code elimination and coalescing.
1809 */
1810 void
1811 fs_visitor::split_virtual_grfs()
1812 {
1813 int num_vars = this->virtual_grf_count;
1814
1815 /* Count the total number of registers */
1816 int reg_count = 0;
1817 int vgrf_to_reg[num_vars];
1818 for (int i = 0; i < num_vars; i++) {
1819 vgrf_to_reg[i] = reg_count;
1820 reg_count += virtual_grf_sizes[i];
1821 }
1822
1823 /* An array of "split points". For each register slot, this indicates
1824 * if this slot can be separated from the previous slot. Every time an
1825 * instruction uses multiple elements of a register (as a source or
1826 * destination), we mark the used slots as inseparable. Then we go
1827 * through and split the registers into the smallest pieces we can.
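 * For example, a virtual GRF that is only ever written and read one register
 * at a time splits into single-register pieces, while one written by a
 * multi-register send stays intact.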
1828 */
1829 bool split_points[reg_count];
1830 memset(split_points, 0, sizeof(split_points));
1831
1832 /* Mark all used registers as fully splittable */
1833 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1834 if (inst->dst.file == GRF) {
1835 int reg = vgrf_to_reg[inst->dst.reg];
1836 for (int j = 1; j < this->virtual_grf_sizes[inst->dst.reg]; j++)
1837 split_points[reg + j] = true;
1838 }
1839
1840 for (int i = 0; i < inst->sources; i++) {
1841 if (inst->src[i].file == GRF) {
1842 int reg = vgrf_to_reg[inst->src[i].reg];
1843 for (int j = 1; j < this->virtual_grf_sizes[inst->src[i].reg]; j++)
1844 split_points[reg + j] = true;
1845 }
1846 }
1847 }
1848
1849 if (brw->has_pln &&
1850 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1851 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1852 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1853 * Gen6, that was the only supported interpolation mode, and since Gen6,
1854 * delta_x and delta_y are in fixed hardware registers.
1855 */
1856 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1857 split_points[vgrf_to_reg[vgrf] + 1] = false;
1858 }
1859
1860 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1861 if (inst->dst.file == GRF) {
1862 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1863 for (int j = 1; j < inst->regs_written; j++)
1864 split_points[reg + j] = false;
1865 }
1866 for (int i = 0; i < inst->sources; i++) {
1867 if (inst->src[i].file == GRF) {
1868 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1869 for (int j = 1; j < inst->regs_read(this, i); j++)
1870 split_points[reg + j] = false;
1871 }
1872 }
1873 }
1874
1875 int new_virtual_grf[reg_count];
1876 int new_reg_offset[reg_count];
1877
1878 int reg = 0;
1879 for (int i = 0; i < num_vars; i++) {
1880 /* The first one should always be 0 as a quick sanity check. */
1881 assert(split_points[reg] == false);
1882
1883 /* j = 0 case */
1884 new_reg_offset[reg] = 0;
1885 reg++;
1886 int offset = 1;
1887
1888 /* j > 0 case */
1889 for (int j = 1; j < virtual_grf_sizes[i]; j++) {
1890 /* If this is a split point, reset the offset to 0 and allocate a
1891 * new virtual GRF covering the previous `offset` registers.
1892 */
1893 if (split_points[reg]) {
1894 assert(offset <= MAX_VGRF_SIZE);
1895 int grf = virtual_grf_alloc(offset);
1896 for (int k = reg - offset; k < reg; k++)
1897 new_virtual_grf[k] = grf;
1898 offset = 0;
1899 }
1900 new_reg_offset[reg] = offset;
1901 offset++;
1902 reg++;
1903 }
1904
1905 /* The last one gets the original register number */
1906 assert(offset <= MAX_VGRF_SIZE);
1907 virtual_grf_sizes[i] = offset;
1908 for (int k = reg - offset; k < reg; k++)
1909 new_virtual_grf[k] = i;
1910 }
1911 assert(reg == reg_count);
1912
1913 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1914 if (inst->dst.file == GRF) {
1915 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1916 inst->dst.reg = new_virtual_grf[reg];
1917 inst->dst.reg_offset = new_reg_offset[reg];
1918 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
1919 }
1920 for (int i = 0; i < inst->sources; i++) {
1921 if (inst->src[i].file == GRF) {
1922 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1923 inst->src[i].reg = new_virtual_grf[reg];
1924 inst->src[i].reg_offset = new_reg_offset[reg];
1925 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
1926 }
1927 }
1928 }
1929 invalidate_live_intervals();
1930 }
1931
1932 /**
1933 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1934 *
1935 * During code generation, we create tons of temporary variables, many of
1936 * which get immediately killed and are never used again. Yet, in later
1937 * optimization and analysis passes, such as compute_live_intervals, we need
1938 * to loop over all the virtual GRFs. Compacting them can save a lot of
1939 * overhead.
1940 */
1941 bool
1942 fs_visitor::compact_virtual_grfs()
1943 {
1944 bool progress = false;
1945 int remap_table[this->virtual_grf_count];
1946 memset(remap_table, -1, sizeof(remap_table));
1947
1948 /* Mark which virtual GRFs are used. */
1949 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1950 if (inst->dst.file == GRF)
1951 remap_table[inst->dst.reg] = 0;
1952
1953 for (int i = 0; i < inst->sources; i++) {
1954 if (inst->src[i].file == GRF)
1955 remap_table[inst->src[i].reg] = 0;
1956 }
1957 }
1958
1959 /* Compact the GRF arrays. */
1960 int new_index = 0;
1961 for (int i = 0; i < this->virtual_grf_count; i++) {
1962 if (remap_table[i] == -1) {
1963 /* We just found an unused register. This means that we are
1964 * actually going to compact something.
1965 */
1966 progress = true;
1967 } else {
1968 remap_table[i] = new_index;
1969 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1970 invalidate_live_intervals();
1971 ++new_index;
1972 }
1973 }
1974
1975 this->virtual_grf_count = new_index;
1976
1977 /* Patch all the instructions to use the newly renumbered registers */
1978 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1979 if (inst->dst.file == GRF)
1980 inst->dst.reg = remap_table[inst->dst.reg];
1981
1982 for (int i = 0; i < inst->sources; i++) {
1983 if (inst->src[i].file == GRF)
1984 inst->src[i].reg = remap_table[inst->src[i].reg];
1985 }
1986 }
1987
1988 /* Patch all the references to delta_x/delta_y, since they're used in
1989 * register allocation. If they're unused, switch them to BAD_FILE so
1990 * we don't think some random VGRF is delta_x/delta_y.
1991 */
1992 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
1993 if (delta_x[i].file == GRF) {
1994 if (remap_table[delta_x[i].reg] != -1) {
1995 delta_x[i].reg = remap_table[delta_x[i].reg];
1996 } else {
1997 delta_x[i].file = BAD_FILE;
1998 }
1999 }
2000 }
2001 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2002 if (delta_y[i].file == GRF) {
2003 if (remap_table[delta_y[i].reg] != -1) {
2004 delta_y[i].reg = remap_table[delta_y[i].reg];
2005 } else {
2006 delta_y[i].file = BAD_FILE;
2007 }
2008 }
2009 }
2010
2011 return progress;
2012 }
2013
2014 /*
2015 * Implements array access of uniforms by inserting a
2016 * PULL_CONSTANT_LOAD instruction.
2017 *
2018 * Unlike temporary GRF array access (which we don't support, due to
2019 * the difficulty of doing relative addressing on instruction
2020 * destinations), we could potentially do array access of uniforms
2021 * that were loaded into GRF space as push constants. In the
2022 * real-world usage we've seen, though, the arrays involved are always
2023 * larger than we could load as push constants, so just always move all
2024 * uniform array access out to a pull constant buffer.
2025 */
2026 void
2027 fs_visitor::move_uniform_array_access_to_pull_constants()
2028 {
2029 if (dispatch_width != 8)
2030 return;
2031
2032 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2033 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2034
2035 /* Walk through and find array access of uniforms. Put a copy of that
2036 * uniform in the pull constant buffer.
2037 *
2038 * Note that we don't move constant-indexed accesses to arrays. No
2039 * testing has been done of the performance impact of this choice.
2040 */
2041 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2042 for (int i = 0 ; i < inst->sources; i++) {
2043 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2044 continue;
2045
2046 int uniform = inst->src[i].reg;
2047
2048 /* If this array isn't already present in the pull constant buffer,
2049 * add it.
2050 */
2051 if (pull_constant_loc[uniform] == -1) {
2052 const gl_constant_value **values = &stage_prog_data->param[uniform];
2053
2054 assert(param_size[uniform]);
2055
2056 for (int j = 0; j < param_size[uniform]; j++) {
2057 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2058
2059 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2060 values[j];
2061 }
2062 }
2063 }
2064 }
2065 }
2066
2067 /**
2068 * Assign UNIFORM file registers to either push constants or pull constants.
2069 *
2070 * We allow a fragment shader to use more than the GL-specified minimum
2071 * maximum number of fragment shader uniform components (64). If there
2072 * are too many of them, they would fill up the entire register file, so
2073 * this pass pushes the excess out to the pull constant buffer and
2074 * updates the program to load them from there.
2075 */
2076 void
2077 fs_visitor::assign_constant_locations()
2078 {
2079 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2080 if (dispatch_width != 8)
2081 return;
2082
2083 /* Find which UNIFORM registers are still in use. */
2084 bool is_live[uniforms];
2085 for (unsigned int i = 0; i < uniforms; i++) {
2086 is_live[i] = false;
2087 }
2088
2089 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2090 for (int i = 0; i < inst->sources; i++) {
2091 if (inst->src[i].file != UNIFORM)
2092 continue;
2093
2094 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2095 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2096 is_live[constant_nr] = true;
2097 }
2098 }
2099
2100 /* Only allow 16 registers (128 uniform components) as push constants.
2101 *
2102 * Just demote the end of the list. We could probably do better
2103 * here, demoting things that are rarely used in the program first.
2104 *
2105 * If changing this value, note the limitation about total_regs in
2106 * brw_curbe.c.
2107 */
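/* 16 GRFs, at 8 32-bit components per register, is 128 scalar uniform
* components.
*/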
2108 unsigned int max_push_components = 16 * 8;
2109 unsigned int num_push_constants = 0;
2110
2111 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2112
2113 for (unsigned int i = 0; i < uniforms; i++) {
2114 if (!is_live[i] || pull_constant_loc[i] != -1) {
2115 /* This UNIFORM register is either dead, or has already been demoted
2116 * to a pull const. Mark it as no longer living in the param[] array.
2117 */
2118 push_constant_loc[i] = -1;
2119 continue;
2120 }
2121
2122 if (num_push_constants < max_push_components) {
2123 /* Retain as a push constant. Record the location in the params[]
2124 * array.
2125 */
2126 push_constant_loc[i] = num_push_constants++;
2127 } else {
2128 /* Demote to a pull constant. */
2129 push_constant_loc[i] = -1;
2130
2131 int pull_index = stage_prog_data->nr_pull_params++;
2132 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2133 pull_constant_loc[i] = pull_index;
2134 }
2135 }
2136
2137 stage_prog_data->nr_params = num_push_constants;
2138
2139 /* Up until now, the param[] array has been indexed by reg + reg_offset
2140 * of UNIFORM registers. Condense it to only contain the uniforms we
2141 * chose to upload as push constants.
2142 */
2143 for (unsigned int i = 0; i < uniforms; i++) {
2144 int remapped = push_constant_loc[i];
2145
2146 if (remapped == -1)
2147 continue;
2148
2149 assert(remapped <= (int)i);
2150 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2151 }
2152 }
2153
2154 /**
2155 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2156 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2157 */
2158 void
2159 fs_visitor::demote_pull_constants()
2160 {
2161 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2162 for (int i = 0; i < inst->sources; i++) {
2163 if (inst->src[i].file != UNIFORM)
2164 continue;
2165
2166 int pull_index = pull_constant_loc[inst->src[i].reg +
2167 inst->src[i].reg_offset];
2168 if (pull_index == -1)
2169 continue;
2170
2171 /* Set up the annotation tracking for newly generated instructions. */
2172 base_ir = inst->ir;
2173 current_annotation = inst->annotation;
2174
2175 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2176 fs_reg dst = fs_reg(this, glsl_type::float_type);
2177
2178 /* Generate a pull load into dst. */
2179 if (inst->src[i].reladdr) {
2180 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2181 surf_index,
2182 *inst->src[i].reladdr,
2183 pull_index);
2184 inst->insert_before(block, &list);
2185 inst->src[i].reladdr = NULL;
2186 } else {
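/* The uniform pull constant load fetches a whole aligned vec4, so round
* the scalar's byte offset down to a 16-byte boundary and use a smear to
* select the dword we actually want within it.
*/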
2187 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2188 fs_inst *pull =
2189 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2190 dst, surf_index, offset);
2191 inst->insert_before(block, pull);
2192 inst->src[i].set_smear(pull_index & 3);
2193 }
2194
2195 /* Rewrite the instruction to use the temporary VGRF. */
2196 inst->src[i].file = GRF;
2197 inst->src[i].reg = dst.reg;
2198 inst->src[i].reg_offset = 0;
2199 inst->src[i].width = dispatch_width;
2200 }
2201 }
2202 invalidate_live_intervals();
2203 }
2204
2205 bool
2206 fs_visitor::opt_algebraic()
2207 {
2208 bool progress = false;
2209
2210 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2211 switch (inst->opcode) {
2212 case BRW_OPCODE_MUL:
2213 if (inst->src[1].file != IMM)
2214 continue;
2215
2216 /* a * 1.0 = a */
2217 if (inst->src[1].is_one()) {
2218 inst->opcode = BRW_OPCODE_MOV;
2219 inst->src[1] = reg_undef;
2220 progress = true;
2221 break;
2222 }
2223
2224 /* a * 0.0 = 0.0 */
2225 if (inst->src[1].is_zero()) {
2226 inst->opcode = BRW_OPCODE_MOV;
2227 inst->src[0] = inst->src[1];
2228 inst->src[1] = reg_undef;
2229 progress = true;
2230 break;
2231 }
2232
2233 break;
2234 case BRW_OPCODE_ADD:
2235 if (inst->src[1].file != IMM)
2236 continue;
2237
2238 /* a + 0.0 = a */
2239 if (inst->src[1].is_zero()) {
2240 inst->opcode = BRW_OPCODE_MOV;
2241 inst->src[1] = reg_undef;
2242 progress = true;
2243 break;
2244 }
2245 break;
2246 case BRW_OPCODE_OR:
2247 if (inst->src[0].equals(inst->src[1])) {
2248 inst->opcode = BRW_OPCODE_MOV;
2249 inst->src[1] = reg_undef;
2250 progress = true;
2251 break;
2252 }
2253 break;
2254 case BRW_OPCODE_LRP:
2255 if (inst->src[1].equals(inst->src[2])) {
2256 inst->opcode = BRW_OPCODE_MOV;
2257 inst->src[0] = inst->src[1];
2258 inst->src[1] = reg_undef;
2259 inst->src[2] = reg_undef;
2260 progress = true;
2261 break;
2262 }
2263 break;
2264 case BRW_OPCODE_SEL:
2265 if (inst->src[0].equals(inst->src[1])) {
2266 inst->opcode = BRW_OPCODE_MOV;
2267 inst->src[1] = reg_undef;
2268 inst->predicate = BRW_PREDICATE_NONE;
2269 inst->predicate_inverse = false;
2270 progress = true;
2271 } else if (inst->saturate && inst->src[1].file == IMM) {
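/* A saturating SEL against an immediate is a clamped min/max: with
* .sat the result already lands in [0.0, 1.0], so a min (L/LE) against
* an immediate >= 1.0 or a max (G/GE) against an immediate <= 0.0
* changes nothing and the SEL can become a MOV.
*/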
2272 switch (inst->conditional_mod) {
2273 case BRW_CONDITIONAL_LE:
2274 case BRW_CONDITIONAL_L:
2275 switch (inst->src[1].type) {
2276 case BRW_REGISTER_TYPE_F:
2277 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2278 inst->opcode = BRW_OPCODE_MOV;
2279 inst->src[1] = reg_undef;
2280 progress = true;
2281 }
2282 break;
2283 default:
2284 break;
2285 }
2286 break;
2287 case BRW_CONDITIONAL_GE:
2288 case BRW_CONDITIONAL_G:
2289 switch (inst->src[1].type) {
2290 case BRW_REGISTER_TYPE_F:
2291 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2292 inst->opcode = BRW_OPCODE_MOV;
2293 inst->src[1] = reg_undef;
2294 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2295 progress = true;
2296 }
2297 break;
2298 default:
2299 break;
2300 }
2301 default:
2302 break;
2303 }
2304 }
2305 break;
2306 case SHADER_OPCODE_RCP: {
2307 fs_inst *prev = (fs_inst *)inst->prev;
2308 if (prev->opcode == SHADER_OPCODE_SQRT) {
2309 if (inst->src[0].equals(prev->dst)) {
2310 inst->opcode = SHADER_OPCODE_RSQ;
2311 inst->src[0] = prev->src[0];
2312 progress = true;
2313 }
2314 }
2315 break;
2316 }
2317 default:
2318 break;
2319 }
2320 }
2321
2322 return progress;
2323 }
2324
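/**
* Register renaming: when a virtual GRF is completely overwritten (outside
* of control flow) after having already been written, give the new value a
* fresh VGRF number and point later reads at it. Splitting reuses of a
* register apart like this shortens live ranges, which helps later
* coalescing and register allocation.
*/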
2325 bool
2326 fs_visitor::opt_register_renaming()
2327 {
2328 bool progress = false;
2329 int depth = 0;
2330
2331 int remap[virtual_grf_count];
2332 memset(remap, -1, sizeof(int) * virtual_grf_count);
2333
2334 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2335 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2336 depth++;
2337 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2338 inst->opcode == BRW_OPCODE_WHILE) {
2339 depth--;
2340 }
2341
2342 /* Rewrite instruction sources. */
2343 for (int i = 0; i < inst->sources; i++) {
2344 if (inst->src[i].file == GRF &&
2345 remap[inst->src[i].reg] != -1 &&
2346 remap[inst->src[i].reg] != inst->src[i].reg) {
2347 inst->src[i].reg = remap[inst->src[i].reg];
2348 progress = true;
2349 }
2350 }
2351
2352 const int dst = inst->dst.reg;
2353
2354 if (depth == 0 &&
2355 inst->dst.file == GRF &&
2356 virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
2357 !inst->is_partial_write()) {
2358 if (remap[dst] == -1) {
2359 remap[dst] = dst;
2360 } else {
2361 remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
2362 inst->dst.reg = remap[dst];
2363 progress = true;
2364 }
2365 } else if (inst->dst.file == GRF &&
2366 remap[dst] != -1 &&
2367 remap[dst] != dst) {
2368 inst->dst.reg = remap[dst];
2369 progress = true;
2370 }
2371 }
2372
2373 if (progress) {
2374 invalidate_live_intervals();
2375
2376 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2377 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2378 delta_x[i].reg = remap[delta_x[i].reg];
2379 }
2380 }
2381 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2382 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2383 delta_y[i].reg = remap[delta_y[i].reg];
2384 }
2385 }
2386 }
2387
2388 return progress;
2389 }
2390
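/**
* Looks for MOVs from a GRF to an MRF and, where it is safe, rewrites the
* instruction that computed the GRF to write straight into the MRF instead,
* so the MOV can be removed (only useful on Gens that still have MRFs).
*/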
2391 bool
2392 fs_visitor::compute_to_mrf()
2393 {
2394 bool progress = false;
2395 int next_ip = 0;
2396
2397 /* No MRFs on Gen >= 7. */
2398 if (brw->gen >= 7)
2399 return false;
2400
2401 calculate_live_intervals();
2402
2403 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2404 int ip = next_ip;
2405 next_ip++;
2406
2407 if (inst->opcode != BRW_OPCODE_MOV ||
2408 inst->is_partial_write() ||
2409 inst->dst.file != MRF || inst->src[0].file != GRF ||
2410 inst->dst.type != inst->src[0].type ||
2411 inst->src[0].abs || inst->src[0].negate ||
2412 !inst->src[0].is_contiguous() ||
2413 inst->src[0].subreg_offset)
2414 continue;
2415
2416 /* Work out which hardware MRF registers are written by this
2417 * instruction.
2418 */
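/* With COMPR4 addressing, a compressed write lands in mrf and mrf + 4;
* an ordinary SIMD16 write covers two consecutive MRFs.
*/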
2419 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2420 int mrf_high;
2421 if (inst->dst.reg & BRW_MRF_COMPR4) {
2422 mrf_high = mrf_low + 4;
2423 } else if (inst->exec_size == 16) {
2424 mrf_high = mrf_low + 1;
2425 } else {
2426 mrf_high = mrf_low;
2427 }
2428
2429 /* Can't compute-to-MRF this GRF if someone else was going to
2430 * read it later.
2431 */
2432 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2433 continue;
2434
2435 /* Found a move of a GRF to a MRF. Let's see if we can rewrite
2436 * the instruction that produced this GRF to write into the MRF instead.
2437 */
2438 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2439 if (scan_inst->dst.file == GRF &&
2440 scan_inst->dst.reg == inst->src[0].reg) {
2441 /* Found the last instruction to write the register we want to turn
2442 * into a compute-to-MRF.
2443 */
2444
2445 /* If this one instruction didn't populate all the
2446 * channels, bail. We might be able to rewrite everything
2447 * that writes that reg, but it would require smarter
2448 * tracking to delay the rewriting until complete success.
2449 */
2450 if (scan_inst->is_partial_write())
2451 break;
2452
2453 /* Things returning more than one register would need us to
2454 * understand coalescing out more than one MOV at a time.
2455 */
2456 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2457 break;
2458
2459 /* SEND instructions can't have MRF as a destination. */
2460 if (scan_inst->mlen)
2461 break;
2462
2463 if (brw->gen == 6) {
2464 /* gen6 math instructions must have the destination be
2465 * GRF, so no compute-to-MRF for them.
2466 */
2467 if (scan_inst->is_math()) {
2468 break;
2469 }
2470 }
2471
2472 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2473 /* Found the creator of our MRF's source value. */
2474 scan_inst->dst.file = MRF;
2475 scan_inst->dst.reg = inst->dst.reg;
2476 scan_inst->saturate |= inst->saturate;
2477 inst->remove(block);
2478 progress = true;
2479 }
2480 break;
2481 }
2482
2483 /* We don't handle control flow here. Most computation of
2484 * values that end up in MRFs are shortly before the MRF
2485 * write anyway.
2486 */
2487 if (block->start() == scan_inst)
2488 break;
2489
2490 /* You can't read from an MRF, so if someone else reads our
2491 * MRF's source GRF that we wanted to rewrite, that stops us.
2492 */
2493 bool interfered = false;
2494 for (int i = 0; i < scan_inst->sources; i++) {
2495 if (scan_inst->src[i].file == GRF &&
2496 scan_inst->src[i].reg == inst->src[0].reg &&
2497 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2498 interfered = true;
2499 }
2500 }
2501 if (interfered)
2502 break;
2503
2504 if (scan_inst->dst.file == MRF) {
2505 /* If somebody else writes our MRF here, we can't
2506 * compute-to-MRF before that.
2507 */
2508 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2509 int scan_mrf_high;
2510
2511 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2512 scan_mrf_high = scan_mrf_low + 4;
2513 } else if (scan_inst->exec_size == 16) {
2514 scan_mrf_high = scan_mrf_low + 1;
2515 } else {
2516 scan_mrf_high = scan_mrf_low;
2517 }
2518
2519 if (mrf_low == scan_mrf_low ||
2520 mrf_low == scan_mrf_high ||
2521 mrf_high == scan_mrf_low ||
2522 mrf_high == scan_mrf_high) {
2523 break;
2524 }
2525 }
2526
2527 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2528 /* Found a SEND instruction, which means that there are
2529 * live values in MRFs from base_mrf to base_mrf +
2530 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2531 * above it.
2532 */
2533 if (mrf_low >= scan_inst->base_mrf &&
2534 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2535 break;
2536 }
2537 if (mrf_high >= scan_inst->base_mrf &&
2538 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2539 break;
2540 }
2541 }
2542 }
2543 }
2544
2545 if (progress)
2546 invalidate_live_intervals();
2547
2548 return progress;
2549 }
2550
2551 /**
2552 * Emit a dedicated "replicated data" clear shader: a single uniform color
2553 * is broadcast to every render target using FS_OPCODE_REP_FB_WRITE.
2554 */
2555 void
2556 fs_visitor::emit_repclear_shader()
2557 {
2558 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2559 int base_mrf = 1;
2560 int color_mrf = base_mrf + 2;
2561
2562 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2563 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2564 mov->force_writemask_all = true;
2565
2566 fs_inst *write;
2567 if (key->nr_color_regions == 1) {
2568 write = emit(FS_OPCODE_REP_FB_WRITE);
2569 write->saturate = key->clamp_fragment_color;
2570 write->base_mrf = color_mrf;
2571 write->target = 0;
2572 write->header_present = false;
2573 write->mlen = 1;
2574 } else {
2575 assume(key->nr_color_regions > 0);
2576 for (int i = 0; i < key->nr_color_regions; ++i) {
2577 write = emit(FS_OPCODE_REP_FB_WRITE);
2578 write->saturate = key->clamp_fragment_color;
2579 write->base_mrf = base_mrf;
2580 write->target = i;
2581 write->header_present = true;
2582 write->mlen = 3;
2583 }
2584 }
2585 write->eot = true;
2586
2587 calculate_cfg();
2588
2589 assign_constant_locations();
2590 assign_curb_setup();
2591
2592 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2593 assert(mov->src[0].file == HW_REG);
2594 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2595 }
2596
2597 /**
2598 * Walks through basic blocks, looking for repeated MRF writes and
2599 * removing the later ones.
2600 */
2601 bool
2602 fs_visitor::remove_duplicate_mrf_writes()
2603 {
2604 fs_inst *last_mrf_move[16];
2605 bool progress = false;
2606
2607 /* We'd need to update the MRF tracking to handle compressed instructions; skip SIMD16. */
2608 if (dispatch_width == 16)
2609 return false;
2610
2611 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2612
2613 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2614 if (inst->is_control_flow()) {
2615 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2616 }
2617
2618 if (inst->opcode == BRW_OPCODE_MOV &&
2619 inst->dst.file == MRF) {
2620 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2621 if (prev_inst && inst->equals(prev_inst)) {
2622 inst->remove(block);
2623 progress = true;
2624 continue;
2625 }
2626 }
2627
2628 /* Clear out the last-write records for MRFs that were overwritten. */
2629 if (inst->dst.file == MRF) {
2630 last_mrf_move[inst->dst.reg] = NULL;
2631 }
2632
2633 if (inst->mlen > 0 && inst->base_mrf != -1) {
2634 /* Found a SEND instruction, which will include two or fewer
2635 * implied MRF writes. We could do better here.
2636 */
2637 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2638 last_mrf_move[inst->base_mrf + i] = NULL;
2639 }
2640 }
2641
2642 /* Clear out any MRF move records whose sources got overwritten. */
2643 if (inst->dst.file == GRF) {
2644 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2645 if (last_mrf_move[i] &&
2646 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2647 last_mrf_move[i] = NULL;
2648 }
2649 }
2650 }
2651
2652 if (inst->opcode == BRW_OPCODE_MOV &&
2653 inst->dst.file == MRF &&
2654 inst->src[0].file == GRF &&
2655 !inst->is_partial_write()) {
2656 last_mrf_move[inst->dst.reg] = inst;
2657 }
2658 }
2659
2660 if (progress)
2661 invalidate_live_intervals();
2662
2663 return progress;
2664 }
2665
2666 static void
2667 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2668 int first_grf, int grf_len)
2669 {
2670 /* Clear the flag for registers that actually got read (as expected). */
2671 for (int i = 0; i < inst->sources; i++) {
2672 int grf;
2673 if (inst->src[i].file == GRF) {
2674 grf = inst->src[i].reg;
2675 } else if (inst->src[i].file == HW_REG &&
2676 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2677 grf = inst->src[i].fixed_hw_reg.nr;
2678 } else {
2679 continue;
2680 }
2681
2682 if (grf >= first_grf &&
2683 grf < first_grf + grf_len) {
2684 deps[grf - first_grf] = false;
2685 if (inst->exec_size == 16)
2686 deps[grf - first_grf + 1] = false;
2687 }
2688 }
2689 }
2690
2691 /**
2692 * Implements this workaround for the original 965:
2693 *
2694 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2695 * check for post destination dependencies on this instruction, software
2696 * must ensure that there is no destination hazard for the case of ‘write
2697 * followed by a posted write’ shown in the following example.
2698 *
2699 * 1. mov r3 0
2700 * 2. send r3.xy <rest of send instruction>
2701 * 3. mov r2 r3
2702 *
2703 * Due to no post-destination dependency check on the ‘send’, the above
2704 * code sequence could have two instructions (1 and 2) in flight at the
2705 * same time that both consider ‘r3’ as the target of their final writes.
2706 */
2707 void
2708 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2709 fs_inst *inst)
2710 {
2711 int write_len = inst->regs_written;
2712 int first_write_grf = inst->dst.reg;
2713 bool needs_dep[BRW_MAX_MRF];
2714 assert(write_len < (int)sizeof(needs_dep) - 1);
2715
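/* Flag every register this instruction writes, then immediately clear the
* flags for any of those registers that the instruction also reads itself.
*/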
2716 memset(needs_dep, false, sizeof(needs_dep));
2717 memset(needs_dep, true, write_len);
2718
2719 clear_deps_for_inst_src(inst, dispatch_width,
2720 needs_dep, first_write_grf, write_len);
2721
2722 /* Walk backwards looking for writes to registers we're writing which
2723 * aren't read since being written. If we hit the start of the program,
2724 * we assume that there are no outstanding dependencies on entry to the
2725 * program.
2726 */
2727 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2728 /* If we hit control flow, assume that there *are* outstanding
2729 * dependencies, and force their cleanup before our instruction.
2730 */
2731 if (block->start() == scan_inst) {
2732 for (int i = 0; i < write_len; i++) {
2733 if (needs_dep[i]) {
2734 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2735 }
2736 }
2737 return;
2738 }
2739
2740 /* We insert our reads as late as possible, on the assumption that any
2741 * instruction other than a MOV that might have left us an outstanding
2742 * dependency has more latency than a MOV does.
2743 */
2744 if (scan_inst->dst.file == GRF) {
2745 for (int i = 0; i < scan_inst->regs_written; i++) {
2746 int reg = scan_inst->dst.reg + i;
2747
2748 if (reg >= first_write_grf &&
2749 reg < first_write_grf + write_len &&
2750 needs_dep[reg - first_write_grf]) {
2751 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2752 needs_dep[reg - first_write_grf] = false;
2753 if (scan_inst->exec_size == 16)
2754 needs_dep[reg - first_write_grf + 1] = false;
2755 }
2756 }
2757 }
2758
2759 /* Clear the flag for registers that actually got read (as expected). */
2760 clear_deps_for_inst_src(scan_inst, dispatch_width,
2761 needs_dep, first_write_grf, write_len);
2762
2763 /* Continue the loop only if we haven't resolved all the dependencies */
2764 int i;
2765 for (i = 0; i < write_len; i++) {
2766 if (needs_dep[i])
2767 break;
2768 }
2769 if (i == write_len)
2770 return;
2771 }
2772 }
2773
2774 /**
2775 * Implements this workaround for the original 965:
2776 *
2777 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2778 * used as a destination register until after it has been sourced by an
2779 * instruction with a different destination register.
2780 */
2781 void
2782 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2783 {
2784 int write_len = inst->regs_written;
2785 int first_write_grf = inst->dst.reg;
2786 bool needs_dep[BRW_MAX_MRF];
2787 assert(write_len < (int)sizeof(needs_dep) - 1);
2788
2789 memset(needs_dep, false, sizeof(needs_dep));
2790 memset(needs_dep, true, write_len);
2791 /* Walk forwards looking for writes to registers we're writing which aren't
2792 * read before being written.
2793 */
2794 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2795 /* If we hit control flow, force resolve all remaining dependencies. */
2796 if (block->end() == scan_inst) {
2797 for (int i = 0; i < write_len; i++) {
2798 if (needs_dep[i])
2799 scan_inst->insert_before(block,
2800 DEP_RESOLVE_MOV(first_write_grf + i));
2801 }
2802 return;
2803 }
2804
2805 /* Clear the flag for registers that actually got read (as expected). */
2806 clear_deps_for_inst_src(scan_inst, dispatch_width,
2807 needs_dep, first_write_grf, write_len);
2808
2809 /* We insert our reads as late as possible since they're reading the
2810 * result of a SEND, which has massive latency.
2811 */
2812 if (scan_inst->dst.file == GRF &&
2813 scan_inst->dst.reg >= first_write_grf &&
2814 scan_inst->dst.reg < first_write_grf + write_len &&
2815 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2816 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2817 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2818 }
2819
2820 /* Continue the loop only if we haven't resolved all the dependencies */
2821 int i;
2822 for (i = 0; i < write_len; i++) {
2823 if (needs_dep[i])
2824 break;
2825 }
2826 if (i == write_len)
2827 return;
2828 }
2829
2830 /* If we hit the end of the program, resolve all remaining dependencies out
2831 * of paranoia.
2832 */
2833 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2834 assert(last_inst->eot);
2835 for (int i = 0; i < write_len; i++) {
2836 if (needs_dep[i])
2837 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2838 }
2839 }
2840
2841 void
2842 fs_visitor::insert_gen4_send_dependency_workarounds()
2843 {
2844 if (brw->gen != 4 || brw->is_g4x)
2845 return;
2846
2847 bool progress = false;
2848
2849 /* Note that we're done with register allocation, so GRF fs_regs always
2850 * have a .reg_offset of 0.
2851 */
2852
2853 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2854 if (inst->mlen != 0 && inst->dst.file == GRF) {
2855 insert_gen4_pre_send_dependency_workarounds(block, inst);
2856 insert_gen4_post_send_dependency_workarounds(block, inst);
2857 progress = true;
2858 }
2859 }
2860
2861 if (progress)
2862 invalidate_live_intervals();
2863 }
2864
2865 /**
2866 * Turns the generic expression-style uniform pull constant load instruction
2867 * into a hardware-specific series of instructions for loading a pull
2868 * constant.
2869 *
2870 * The expression style allows the CSE pass before this to optimize out
2871 * repeated loads from the same offset, and gives the pre-register-allocation
2872 * scheduling full flexibility, while the conversion to native instructions
2873 * allows the post-register-allocation scheduler the best information
2874 * possible.
2875 *
2876 * Note that execution masking for setting up pull constant loads is special:
2877 * the channels that need to be written are unrelated to the current execution
2878 * mask, since a later instruction will use one of the result channels as a
2879 * source operand for all 8 or 16 of its channels.
2880 */
2881 void
2882 fs_visitor::lower_uniform_pull_constant_loads()
2883 {
2884 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2885 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2886 continue;
2887
2888 if (brw->gen >= 7) {
2889 /* The offset arg before was a vec4-aligned byte offset. We need to
2890 * turn it into a dword offset.
2891 */
2892 fs_reg const_offset_reg = inst->src[1];
2893 assert(const_offset_reg.file == IMM &&
2894 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2895 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2896 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2897
2898 /* This is actually going to be a MOV, but since only the first dword
2899 * is accessed, we have a special opcode to do just that one. Note
2900 * that this needs to be an operation that will be considered a def
2901 * by live variable analysis, or register allocation will explode.
2902 */
2903 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2904 8, payload, const_offset_reg);
2905 setup->force_writemask_all = true;
2906
2907 setup->ir = inst->ir;
2908 setup->annotation = inst->annotation;
2909 inst->insert_before(block, setup);
2910
2911 /* Similarly, this will only populate the first 4 channels of the
2912 * result register (since we only use smear values from 0-3), but we
2913 * don't tell the optimizer.
2914 */
2915 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2916 inst->src[1] = payload;
2917
2918 invalidate_live_intervals();
2919 } else {
2920 /* Before register allocation, we didn't tell the scheduler about the
2921 * MRF we use. We know it's safe to use this MRF because nothing
2922 * else does except for register spill/unspill, which generates and
2923 * uses its MRF within a single IR instruction.
2924 */
2925 inst->base_mrf = 14;
2926 inst->mlen = 1;
2927 }
2928 }
2929 }
2930
2931 bool
2932 fs_visitor::lower_load_payload()
2933 {
2934 bool progress = false;
2935
2936 int vgrf_to_reg[virtual_grf_count];
2937 int reg_count = 16; /* Leave room for MRF */
2938 for (int i = 0; i < virtual_grf_count; ++i) {
2939 vgrf_to_reg[i] = reg_count;
2940 reg_count += virtual_grf_sizes[i];
2941 }
2942
2943 struct {
2944 bool written:1; /* Whether this register has ever been written */
2945 bool force_writemask_all:1;
2946 bool force_sechalf:1;
2947 } metadata[reg_count];
2948 memset(metadata, 0, sizeof(metadata));
2949
2950 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2951 int dst_reg;
2952 if (inst->dst.file == GRF) {
2953 dst_reg = vgrf_to_reg[inst->dst.reg];
2954 } else {
2955 /* MRF */
2956 dst_reg = inst->dst.reg;
2957 }
2958
2959 if (inst->dst.file == MRF || inst->dst.file == GRF) {
2960 bool force_sechalf = inst->force_sechalf;
2961 bool toggle_sechalf = inst->dst.width == 16 &&
2962 type_sz(inst->dst.type) == 4;
2963 for (int i = 0; i < inst->regs_written; ++i) {
2964 metadata[dst_reg + i].written = true;
2965 metadata[dst_reg + i].force_sechalf = force_sechalf;
2966 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
2967 force_sechalf = (toggle_sechalf != force_sechalf);
2968 }
2969 }
2970
2971 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2972 assert(inst->dst.file == MRF || inst->dst.file == GRF);
2973 fs_reg dst = inst->dst;
2974
2975 for (int i = 0; i < inst->sources; i++) {
2976 dst.width = inst->src[i].effective_width;
2977 dst.type = inst->src[i].type;
2978
2979 if (inst->src[i].file == BAD_FILE) {
2980 /* Do nothing but otherwise increment as normal */
2981 } else if (dst.file == MRF &&
2982 dst.width == 8 &&
2983 brw->has_compr4 &&
2984 i + 4 < inst->sources &&
2985 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
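/* src[i] and src[i + 4] are the low and high halves of one SIMD16
* value, so emit a single SIMD16 MOV with COMPR4 MRF addressing
* instead of two SIMD8 MOVs.
*/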
2986 fs_reg compr4_dst = dst;
2987 compr4_dst.reg += BRW_MRF_COMPR4;
2988 compr4_dst.width = 16;
2989 fs_reg compr4_src = inst->src[i];
2990 compr4_src.width = 16;
2991 fs_inst *mov = MOV(compr4_dst, compr4_src);
2992 mov->force_writemask_all = true;
2993 inst->insert_before(block, mov);
2994 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
2995 inst->src[i + 4].file = BAD_FILE;
2996 } else {
2997 fs_inst *mov = MOV(dst, inst->src[i]);
2998 if (inst->src[i].file == GRF) {
2999 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3000 inst->src[i].reg_offset;
3001 mov->force_sechalf = metadata[src_reg].force_sechalf;
3002 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3003 metadata[dst_reg] = metadata[src_reg];
3004 if (dst.width * type_sz(dst.type) > 32) {
3005 assert((!metadata[src_reg].written ||
3006 !metadata[src_reg].force_sechalf) &&
3007 (!metadata[src_reg + 1].written ||
3008 metadata[src_reg + 1].force_sechalf));
3009 metadata[dst_reg + 1] = metadata[src_reg + 1];
3010 }
3011 } else {
3012 metadata[dst_reg].force_writemask_all = false;
3013 metadata[dst_reg].force_sechalf = false;
3014 if (dst.width == 16) {
3015 metadata[dst_reg + 1].force_writemask_all = false;
3016 metadata[dst_reg + 1].force_sechalf = true;
3017 }
3018 }
3019 inst->insert_before(block, mov);
3020 }
3021
3022 dst = offset(dst, 1);
3023 }
3024
3025 inst->remove(block);
3026 progress = true;
3027 }
3028 }
3029
3030 if (progress)
3031 invalidate_live_intervals();
3032
3033 return progress;
3034 }
3035
3036 void
3037 fs_visitor::dump_instructions()
3038 {
3039 dump_instructions(NULL);
3040 }
3041
3042 void
3043 fs_visitor::dump_instructions(const char *name)
3044 {
3045 calculate_register_pressure();
3046 FILE *file = stderr;
3047 if (name && geteuid() != 0) {
3048 file = fopen(name, "w");
3049 if (!file)
3050 file = stderr;
3051 }
3052
3053 int ip = 0, max_pressure = 0;
3054 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3055 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3056 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3057 dump_instruction(inst, file);
3058 ++ip;
3059 }
3060 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3061
3062 if (file != stderr) {
3063 fclose(file);
3064 }
3065 }
3066
3067 void
3068 fs_visitor::dump_instruction(backend_instruction *be_inst)
3069 {
3070 dump_instruction(be_inst, stderr);
3071 }
3072
3073 void
3074 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3075 {
3076 fs_inst *inst = (fs_inst *)be_inst;
3077
3078 if (inst->predicate) {
3079 fprintf(file, "(%cf0.%d) ",
3080 inst->predicate_inverse ? '-' : '+',
3081 inst->flag_subreg);
3082 }
3083
3084 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3085 if (inst->saturate)
3086 fprintf(file, ".sat");
3087 if (inst->conditional_mod) {
3088 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3089 if (!inst->predicate &&
3090 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3091 inst->opcode != BRW_OPCODE_IF &&
3092 inst->opcode != BRW_OPCODE_WHILE))) {
3093 fprintf(file, ".f0.%d", inst->flag_subreg);
3094 }
3095 }
3096 fprintf(file, "(%d) ", inst->exec_size);
3097
3098
3099 switch (inst->dst.file) {
3100 case GRF:
3101 fprintf(file, "vgrf%d", inst->dst.reg);
3102 if (inst->dst.width != dispatch_width)
3103 fprintf(file, "@%d", inst->dst.width);
3104 if (virtual_grf_sizes[inst->dst.reg] != inst->dst.width / 8 ||
3105 inst->dst.subreg_offset)
3106 fprintf(file, "+%d.%d",
3107 inst->dst.reg_offset, inst->dst.subreg_offset);
3108 break;
3109 case MRF:
3110 fprintf(file, "m%d", inst->dst.reg);
3111 break;
3112 case BAD_FILE:
3113 fprintf(file, "(null)");
3114 break;
3115 case UNIFORM:
3116 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3117 break;
3118 case HW_REG:
3119 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3120 switch (inst->dst.fixed_hw_reg.nr) {
3121 case BRW_ARF_NULL:
3122 fprintf(file, "null");
3123 break;
3124 case BRW_ARF_ADDRESS:
3125 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3126 break;
3127 case BRW_ARF_ACCUMULATOR:
3128 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3129 break;
3130 case BRW_ARF_FLAG:
3131 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3132 inst->dst.fixed_hw_reg.subnr);
3133 break;
3134 default:
3135 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3136 inst->dst.fixed_hw_reg.subnr);
3137 break;
3138 }
3139 } else {
3140 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3141 }
3142 if (inst->dst.fixed_hw_reg.subnr)
3143 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3144 break;
3145 default:
3146 fprintf(file, "???");
3147 break;
3148 }
3149 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3150
3151 for (int i = 0; i < inst->sources; i++) {
3152 if (inst->src[i].negate)
3153 fprintf(file, "-");
3154 if (inst->src[i].abs)
3155 fprintf(file, "|");
3156 switch (inst->src[i].file) {
3157 case GRF:
3158 fprintf(file, "vgrf%d", inst->src[i].reg);
3159 if (inst->src[i].width != dispatch_width)
3160 fprintf(file, "@%d", inst->src[i].width);
3161 if (virtual_grf_sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3162 inst->src[i].subreg_offset)
3163 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3164 inst->src[i].subreg_offset);
3165 break;
3166 case MRF:
3167 fprintf(file, "***m%d***", inst->src[i].reg);
3168 break;
3169 case UNIFORM:
3170 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3171 if (inst->src[i].reladdr) {
3172 fprintf(file, "+reladdr");
3173 } else if (inst->src[i].subreg_offset) {
3174 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3175 inst->src[i].subreg_offset);
3176 }
3177 break;
3178 case BAD_FILE:
3179 fprintf(file, "(null)");
3180 break;
3181 case IMM:
3182 switch (inst->src[i].type) {
3183 case BRW_REGISTER_TYPE_F:
3184 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3185 break;
3186 case BRW_REGISTER_TYPE_D:
3187 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3188 break;
3189 case BRW_REGISTER_TYPE_UD:
3190 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3191 break;
3192 default:
3193 fprintf(file, "???");
3194 break;
3195 }
3196 break;
3197 case HW_REG:
3198 if (inst->src[i].fixed_hw_reg.negate)
3199 fprintf(file, "-");
3200 if (inst->src[i].fixed_hw_reg.abs)
3201 fprintf(file, "|");
3202 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3203 switch (inst->src[i].fixed_hw_reg.nr) {
3204 case BRW_ARF_NULL:
3205 fprintf(file, "null");
3206 break;
3207 case BRW_ARF_ADDRESS:
3208 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3209 break;
3210 case BRW_ARF_ACCUMULATOR:
3211 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3212 break;
3213 case BRW_ARF_FLAG:
3214 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3215 inst->src[i].fixed_hw_reg.subnr);
3216 break;
3217 default:
3218 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3219 inst->src[i].fixed_hw_reg.subnr);
3220 break;
3221 }
3222 } else {
3223 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3224 }
3225 if (inst->src[i].fixed_hw_reg.subnr)
3226 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3227 if (inst->src[i].fixed_hw_reg.abs)
3228 fprintf(file, "|");
3229 break;
3230 default:
3231 fprintf(file, "???");
3232 break;
3233 }
3234 if (inst->src[i].abs)
3235 fprintf(file, "|");
3236
3237 if (inst->src[i].file != IMM) {
3238 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3239 }
3240
3241 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3242 fprintf(file, ", ");
3243 }
3244
3245 fprintf(file, " ");
3246
3247 if (dispatch_width == 16 && inst->exec_size == 8) {
3248 if (inst->force_sechalf)
3249 fprintf(file, "2ndhalf ");
3250 else
3251 fprintf(file, "1sthalf ");
3252 }
3253
3254 fprintf(file, "\n");
3255 }
3256
3257 /**
3258 * Possibly returns an instruction that set up @param reg.
3259 *
3260 * Sometimes we want to take the result of some expression/variable
3261 * dereference tree and rewrite the instruction generating the result
3262 * of the tree. When processing the tree, we know that the
3263 * instructions generated are all writing temporaries that are dead
3264 * outside of this tree. So, if we have some instructions that write
3265 * a temporary, we're free to point that temp write somewhere else.
3266 *
3267 * Note that this doesn't guarantee that the returned instruction wrote
3268 * only reg -- it might be the size=4 destination of a texture instruction.
3269 */
3270 fs_inst *
3271 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3272 fs_inst *end,
3273 const fs_reg &reg)
3274 {
3275 if (end == start ||
3276 end->is_partial_write() ||
3277 reg.reladdr ||
3278 !reg.equals(end->dst)) {
3279 return NULL;
3280 } else {
3281 return end;
3282 }
3283 }
3284
3285 void
3286 fs_visitor::setup_payload_gen6()
3287 {
3288 bool uses_depth =
3289 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3290 unsigned barycentric_interp_modes =
3291 (stage == MESA_SHADER_FRAGMENT) ?
3292 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3293
3294 assert(brw->gen >= 6);
3295
3296 /* R0-1: masks, pixel X/Y coordinates. */
3297 payload.num_regs = 2;
3298 /* R2: only for 32-pixel dispatch. */
3299
3300 /* R3-26: barycentric interpolation coordinates. These appear in the
3301 * same order that they appear in the brw_wm_barycentric_interp_mode
3302 * enum. Each set of coordinates occupies 2 registers if dispatch width
3303 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3304 * appear if they were enabled using the "Barycentric Interpolation
3305 * Mode" bits in WM_STATE.
3306 */
3307 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3308 if (barycentric_interp_modes & (1 << i)) {
3309 payload.barycentric_coord_reg[i] = payload.num_regs;
3310 payload.num_regs += 2;
3311 if (dispatch_width == 16) {
3312 payload.num_regs += 2;
3313 }
3314 }
3315 }
3316
3317 /* R27: interpolated depth if uses source depth */
3318 if (uses_depth) {
3319 payload.source_depth_reg = payload.num_regs;
3320 payload.num_regs++;
3321 if (dispatch_width == 16) {
3322 /* R28: interpolated depth if not SIMD8. */
3323 payload.num_regs++;
3324 }
3325 }
3326 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3327 if (uses_depth) {
3328 payload.source_w_reg = payload.num_regs;
3329 payload.num_regs++;
3330 if (dispatch_width == 16) {
3331 /* R30: interpolated W if not SIMD8. */
3332 payload.num_regs++;
3333 }
3334 }
3335
3336 if (stage == MESA_SHADER_FRAGMENT) {
3337 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3338 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3339 prog_data->uses_pos_offset = key->compute_pos_offset;
3340 /* R31: MSAA position offsets. */
3341 if (prog_data->uses_pos_offset) {
3342 payload.sample_pos_reg = payload.num_regs;
3343 payload.num_regs++;
3344 }
3345 }
3346
3347 /* R32: MSAA input coverage mask */
3348 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3349 assert(brw->gen >= 7);
3350 payload.sample_mask_in_reg = payload.num_regs;
3351 payload.num_regs++;
3352 if (dispatch_width == 16) {
3353 /* R33: input coverage mask if not SIMD8. */
3354 payload.num_regs++;
3355 }
3356 }
3357
3358 /* R34-: bary for 32-pixel. */
3359 /* R58-59: interp W for 32-pixel. */
3360
3361 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3362 source_depth_to_render_target = true;
3363 }
3364 }
3365
3366 void
3367 fs_visitor::assign_binding_table_offsets()
3368 {
3369 assert(stage == MESA_SHADER_FRAGMENT);
3370 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3371 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3372 uint32_t next_binding_table_offset = 0;
3373
3374 /* If there are no color regions, we still perform an FB write to a null
3375 * renderbuffer, which we place at surface index 0.
3376 */
3377 prog_data->binding_table.render_target_start = next_binding_table_offset;
3378 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3379
3380 assign_common_binding_table_offsets(next_binding_table_offset);
3381 }
3382
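/**
* Fills in regs_live_at_ip[]: for each instruction, the number of GRFs worth
* of virtual registers whose live intervals cover that IP.
*/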
3383 void
3384 fs_visitor::calculate_register_pressure()
3385 {
3386 invalidate_live_intervals();
3387 calculate_live_intervals();
3388
3389 unsigned num_instructions = 0;
3390 foreach_block(block, cfg)
3391 num_instructions += block->instructions.length();
3392
3393 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3394
3395 for (int reg = 0; reg < virtual_grf_count; reg++) {
3396 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3397 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3398 }
3399 }
3400
3401 /**
3402 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3403 *
3404 * The needs_unlit_centroid_workaround ends up producing one of these per
3405 * channel of centroid input, so it's good to clean them up.
3406 *
3407 * An assumption here is that nothing ever modifies the dispatched pixels
3408 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3409 * dictates that anyway.
3410 */
3411 void
3412 fs_visitor::opt_drop_redundant_mov_to_flags()
3413 {
3414 bool flag_mov_found[2] = {false};
3415
3416 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3417 if (inst->is_control_flow()) {
3418 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3419 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3420 if (!flag_mov_found[inst->flag_subreg])
3421 flag_mov_found[inst->flag_subreg] = true;
3422 else
3423 inst->remove(block);
3424 } else if (inst->writes_flag()) {
3425 flag_mov_found[inst->flag_subreg] = false;
3426 }
3427 }
3428 }
3429
3430 void
3431 fs_visitor::optimize()
3432 {
3433 calculate_cfg();
3434
3435 split_virtual_grfs();
3436
3437 move_uniform_array_access_to_pull_constants();
3438 assign_constant_locations();
3439 demote_pull_constants();
3440
3441 opt_drop_redundant_mov_to_flags();
3442
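/* OPT() runs a pass, folds its result into the per-iteration progress flag,
* and, when DEBUG_OPTIMIZER is enabled, dumps the IR after any pass that
* reported progress so its effect can be inspected.
*/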
3443 #define OPT(pass, args...) do { \
3444 pass_num++; \
3445 bool this_progress = pass(args); \
3446 \
3447 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3448 char filename[64]; \
3449 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3450 dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3451 \
3452 backend_visitor::dump_instructions(filename); \
3453 } \
3454 \
3455 progress = progress || this_progress; \
3456 } while (false)
3457
3458 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3459 char filename[64];
3460 snprintf(filename, 64, "fs%d-%04d-00-start",
3461 dispatch_width, shader_prog ? shader_prog->Name : 0);
3462
3463 backend_visitor::dump_instructions(filename);
3464 }
3465
3466 bool progress;
3467 int iteration = 0;
3468 do {
3469 progress = false;
3470 iteration++;
3471 int pass_num = 0;
3472
3473 OPT(remove_duplicate_mrf_writes);
3474
3475 OPT(opt_algebraic);
3476 OPT(opt_cse);
3477 OPT(opt_copy_propagate);
3478 OPT(opt_peephole_predicated_break);
3479 OPT(dead_code_eliminate);
3480 OPT(opt_peephole_sel);
3481 OPT(dead_control_flow_eliminate, this);
3482 OPT(opt_register_renaming);
3483 OPT(opt_saturate_propagation);
3484 OPT(register_coalesce);
3485 OPT(compute_to_mrf);
3486
3487 OPT(compact_virtual_grfs);
3488 } while (progress);
3489
3490 if (lower_load_payload()) {
3491 split_virtual_grfs();
3492 register_coalesce();
3493 compute_to_mrf();
3494 dead_code_eliminate();
3495 }
3496
3497 lower_uniform_pull_constant_loads();
3498 }
3499
3500 void
3501 fs_visitor::allocate_registers()
3502 {
3503 bool allocated_without_spills;
3504
3505 static enum instruction_scheduler_mode pre_modes[] = {
3506 SCHEDULE_PRE,
3507 SCHEDULE_PRE_NON_LIFO,
3508 SCHEDULE_PRE_LIFO,
3509 };
3510
3511 /* Try each scheduling heuristic to see if it can successfully register
3512 * allocate without spilling. They should be ordered by decreasing
3513 * performance but increasing likelihood of allocating.
3514 */
3515 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3516 schedule_instructions(pre_modes[i]);
3517
3518 if (0) {
3519 assign_regs_trivial();
3520 allocated_without_spills = true;
3521 } else {
3522 allocated_without_spills = assign_regs(false);
3523 }
3524 if (allocated_without_spills)
3525 break;
3526 }
3527
3528 if (!allocated_without_spills) {
3529 /* We assume that any spilling is worse than just dropping back to
3530 * SIMD8. There's probably actually some intermediate point where
3531 * SIMD16 with a couple of spills is still better.
3532 */
3533 if (dispatch_width == 16) {
3534 fail("Failure to register allocate. Reduce number of "
3535 "live scalar values to avoid this.");
3536 } else {
3537 perf_debug("Fragment shader triggered register spilling. "
3538 "Try reducing the number of live scalar values to "
3539 "improve performance.\n");
3540 }
3541
3542 /* Since we're out of heuristics, just go spill registers until we
3543 * get an allocation.
3544 */
3545 while (!assign_regs(true)) {
3546 if (failed)
3547 break;
3548 }
3549 }
3550
3551 /* This must come after all optimization and register allocation, since
3552 * it inserts dead code that happens to have side effects, and it does
3553 * so based on the actual physical registers in use.
3554 */
3555 insert_gen4_send_dependency_workarounds();
3556
3557 if (failed)
3558 return;
3559
3560 if (!allocated_without_spills)
3561 schedule_instructions(SCHEDULE_POST);
3562
3563 if (last_scratch > 0)
3564 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3565 }
3566
3567 bool
3568 fs_visitor::run()
3569 {
3570 sanity_param_count = prog->Parameters->NumParameters;
3571
3572 assign_binding_table_offsets();
3573
3574 if (brw->gen >= 6)
3575 setup_payload_gen6();
3576 else
3577 setup_payload_gen4();
3578
3579 if (0) {
3580 emit_dummy_fs();
3581 } else if (brw->use_rep_send && dispatch_width == 16) {
3582 emit_repclear_shader();
3583 } else {
3584 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3585 emit_shader_time_begin();
3586
3587 calculate_urb_setup();
3588 if (prog->InputsRead > 0) {
3589 if (brw->gen < 6)
3590 emit_interpolation_setup_gen4();
3591 else
3592 emit_interpolation_setup_gen6();
3593 }
3594
3595 /* We handle discards by keeping track of the still-live pixels in f0.1.
3596 * Initialize it with the dispatched pixels.
3597 */
3598 bool uses_kill =
3599 (stage == MESA_SHADER_FRAGMENT) &&
3600 ((brw_wm_prog_data*) this->prog_data)->uses_kill;
3601 bool alpha_test_func =
3602 (stage == MESA_SHADER_FRAGMENT) &&
3603 ((brw_wm_prog_key*) this->key)->alpha_test_func;
3604 if (uses_kill || alpha_test_func) {
3605 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3606 discard_init->flag_subreg = 1;
3607 }
3608
3609 /* Generate FS IR for main(). (the visitor only descends into
3610 * functions called "main").
3611 */
3612 if (shader) {
3613 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3614 base_ir = ir;
3615 this->result = reg_undef;
3616 ir->accept(this);
3617 }
3618 } else {
3619 emit_fragment_program_code();
3620 }
3621 base_ir = NULL;
3622 if (failed)
3623 return false;
3624
3625 emit(FS_OPCODE_PLACEHOLDER_HALT);
3626
3627 if (alpha_test_func)
3628 emit_alpha_test();
3629
3630 emit_fb_writes();
3631
3632 optimize();
3633
3634 assign_curb_setup();
3635 assign_urb_setup();
3636
3637 allocate_registers();
3638
3639 if (failed)
3640 return false;
3641 }
3642
3643 if (stage == MESA_SHADER_FRAGMENT) {
3644 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3645 if (dispatch_width == 8)
3646 prog_data->reg_blocks = brw_register_blocks(grf_used);
3647 else
3648 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3649 }
3650
3651 /* If any state parameters were appended, then ParameterValues could have
3652 * been realloced, in which case the driver uniform storage set up by
3653 * _mesa_associate_uniform_storage() would point to freed memory. Make
3654 * sure that didn't happen.
3655 */
3656 assert(sanity_param_count == prog->Parameters->NumParameters);
3657
3658 return !failed;
3659 }
3660
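/**
* Compile the fragment shader to native code: always run a SIMD8 compile,
* additionally attempt a SIMD16 compile when the hardware and shader allow
* it, and hand the surviving CFGs to the generator.
*/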
3661 const unsigned *
3662 brw_wm_fs_emit(struct brw_context *brw,
3663 void *mem_ctx,
3664 const struct brw_wm_prog_key *key,
3665 struct brw_wm_prog_data *prog_data,
3666 struct gl_fragment_program *fp,
3667 struct gl_shader_program *prog,
3668 unsigned *final_assembly_size)
3669 {
3670 bool start_busy = false;
3671 double start_time = 0;
3672
3673 if (unlikely(brw->perf_debug)) {
3674 start_busy = (brw->batch.last_bo &&
3675 drm_intel_bo_busy(brw->batch.last_bo));
3676 start_time = get_time();
3677 }
3678
3679 struct brw_shader *shader = NULL;
3680 if (prog)
3681 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3682
3683 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3684 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3685
3686 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3687 */
3688 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3689 if (!v.run()) {
3690 if (prog) {
3691 prog->LinkStatus = false;
3692 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3693 }
3694
3695 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3696 v.fail_msg);
3697
3698 return NULL;
3699 }
3700
3701 cfg_t *simd16_cfg = NULL;
3702 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3703 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3704 brw->use_rep_send)) {
3705 if (!v.simd16_unsupported) {
3706 /* Try a SIMD16 compile */
3707 v2.import_uniforms(&v);
3708 if (!v2.run()) {
3709 perf_debug("SIMD16 shader failed to compile, falling back to "
3710 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3711 } else {
3712 simd16_cfg = v2.cfg;
3713 }
3714 } else {
3715 perf_debug("SIMD16 shader unsupported, falling back to "
3716 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3717 }
3718 }
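   /* A SIMD16 kernel shades twice as many pixels per thread as SIMD8, so it
    * is worth attempting whenever the SIMD8 compile did not rule it out; if
    * the 16-wide compile fails, the SIMD8 kernel alone is used at the
    * performance cost mentioned in the perf_debug messages above.
    */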
3719
3720 cfg_t *simd8_cfg;
3721 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3722 if (no_simd8 && simd16_cfg) {
3723 simd8_cfg = NULL;
3724 prog_data->no_8 = true;
3725 } else {
3726 simd8_cfg = v.cfg;
3727 prog_data->no_8 = false;
3728 }
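   /* prog_data->no_8 tells the hardware state setup that only the SIMD16
    * kernel exists for this program, so 8-wide dispatch must not be
    * enabled.
    */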
3729
3730 fs_generator g(brw, mem_ctx, key, prog_data, prog, fp,
3731 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3732 if (simd8_cfg)
3733 g.generate_code(simd8_cfg, 8);
3734 if (simd16_cfg)
3735 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
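   /* Both kernels end up in a single assembly blob: the SIMD8 code (if any)
    * starts at offset 0 and the SIMD16 code starts at
    * prog_data->prog_offset_16, which the state setup later uses as the
    * second kernel start pointer.
    */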
3736
3737 if (unlikely(brw->perf_debug) && shader) {
3738 if (shader->compiled_once)
3739 brw_wm_debug_recompile(brw, prog, key);
3740 shader->compiled_once = true;
3741
3742 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3743 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3744 (get_time() - start_time) * 1000);
3745 }
3746 }
3747
3748 return g.get_assembly(final_assembly_size);
3749 }
3750
3751 extern "C" bool
3752 brw_fs_precompile(struct gl_context *ctx,
3753 struct gl_shader_program *shader_prog,
3754 struct gl_program *prog)
3755 {
3756 struct brw_context *brw = brw_context(ctx);
3757 struct brw_wm_prog_key key;
3758
3759 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
3760 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3761 bool program_uses_dfdy = fp->UsesDFdy;
3762
3763 memset(&key, 0, sizeof(key));
3764
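   /* The precompile guesses the most likely draw-time state below (depth
    * test enabled, default texture swizzles, and so on) so that the key
    * built here usually matches the one computed at draw time, and the
    * shader is already sitting in the program cache by the first draw.
    */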
3765 if (brw->gen < 6) {
3766 if (fp->UsesKill)
3767 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3768
3769 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3770 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3771
3772 /* Just assume depth testing. */
3773 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3774 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3775 }
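   /* On Gen4-5 the depth/stencil configuration affects the pixel shader
    * thread payload (the "IZ" setup), so it has to be part of the compile
    * key; the bits above guess the common case.
    */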
3776
3777 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3778 BRW_FS_VARYING_INPUT_MASK) > 16)
3779 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3780
3781 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3782 for (unsigned i = 0; i < sampler_count; i++) {
3783 if (fp->Base.ShadowSamplers & (1 << i)) {
3784 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3785 key.tex.swizzles[i] =
3786 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3787 } else {
3788 /* Color sampler: assume no swizzling. */
3789 key.tex.swizzles[i] = SWIZZLE_XYZW;
3790 }
3791 }
3792
3793 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3794 key.drawable_height = ctx->DrawBuffer->Height;
3795 }
3796
3797 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3798 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3799 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3800
3801 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3802 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3803 key.nr_color_regions > 1;
3804 }
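   /* drawable_height and render_to_fbo exist because window-system
    * framebuffers are y-flipped relative to user FBOs: gl_FragCoord and
    * dFdy() need the orientation (and, when flipped, the surface height)
    * baked into the compiled code.
    */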
3805
3806 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3807 * quality of the derivatives is likely to be determined by the driconf
3808 * option.
3809 */
3810 key.high_quality_derivatives = brw->disable_derivative_optimization;
3811
3812 key.program_string_id = bfp->id;
3813
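   /* do_wm_prog() updates brw->wm.base.prog_offset and brw->wm.prog_data as
    * a side effect, so save and restore them: a precompile must not disturb
    * whatever program state is currently bound.
    */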
3814 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3815 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3816
3817 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
3818
3819 brw->wm.base.prog_offset = old_prog_offset;
3820 brw->wm.prog_data = old_prog_data;
3821
3822 return success;
3823 }
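
/* With the extern "C" linkage above, C code elsewhere in the driver can call
 * the precompile directly at link time.  A hypothetical caller, for
 * illustration only (only brw_fs_precompile() itself is taken from this
 * file):
 *
 *    if (prog->Target == GL_FRAGMENT_PROGRAM_ARB &&
 *        !brw_fs_precompile(ctx, shader_prog, prog))
 *       return false;
 */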