src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 extern "C" {
  32
  33 #include <sys/types.h>
  34
  35 #include "util/hash_table.h"
  36 #include "main/macros.h"
  37 #include "main/shaderobj.h"
  38 #include "main/fbobject.h"
  39 #include "program/prog_parameter.h"
  40 #include "program/prog_print.h"
  41 #include "util/register_allocate.h"
  42 #include "program/hash_table.h"
  43 #include "brw_context.h"
  44 #include "brw_eu.h"
  45 #include "brw_wm.h"
  46 }
  47 #include "brw_fs.h"
  48 #include "brw_cfg.h"
  49 #include "brw_dead_control_flow.h"
  50 #include "main/uniforms.h"
  51 #include "brw_fs_live_variables.h"
  52 #include "glsl/glsl_types.h"
  53 #include "program/sampler.h"
  54
  55 void
  56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
  57               fs_reg *src, int sources)
  58 {
  59    memset(this, 0, sizeof(*this));
  60
  61    this->opcode = opcode;
  62    this->dst = dst;
  63    this->src = src;
  64    this->sources = sources;
  65    this->exec_size = exec_size;
  66
  67    assert(dst.file != IMM && dst.file != UNIFORM);
  68
  69    /* If exec_size == 0, try to guess it from the registers.  Since all
  70     * manner of things may use hardware registers, we first try to guess
  71     * based on GRF registers.  If this fails, we will go ahead and take the
  72     * width from the destination register.
  73     */
  74    if (this->exec_size == 0) {
  75       if (dst.file == GRF) {
  76          this->exec_size = dst.width;
  77       } else {
  78          for (int i = 0; i < sources; ++i) {
  79             if (src[i].file != GRF && src[i].file != ATTR)
  80                continue;
  81
  82             if (this->exec_size <= 1)
  83                this->exec_size = src[i].width;
  84             assert(src[i].width == 1 || src[i].width == this->exec_size);
  85          }
  86       }
  87
  88       if (this->exec_size == 0 && dst.file != BAD_FILE)
  89          this->exec_size = dst.width;
  90    }
  91    assert(this->exec_size != 0);
  92
  93    for (int i = 0; i < sources; ++i) {
  94       switch (this->src[i].file) {
  95       case BAD_FILE:
  96          this->src[i].effective_width = 8;
  97          break;
  98       case GRF:
  99       case HW_REG:
 100       case ATTR:
 101          assert(this->src[i].width > 0);
 102          if (this->src[i].width == 1) {
 103             this->src[i].effective_width = this->exec_size;
 104          } else {
 105             this->src[i].effective_width = this->src[i].width;
 106          }
 107          break;
 108       case IMM:
 109       case UNIFORM:
 110          this->src[i].effective_width = this->exec_size;
 111          break;
 112       default:
 113          unreachable("Invalid source register file");
 114       }
 115    }
 116    this->dst.effective_width = this->exec_size;
 117
 118    this->conditional_mod = BRW_CONDITIONAL_NONE;
 119
 120    /* This will be the case for almost all instructions. */
 121    switch (dst.file) {
 122    case GRF:
 123    case HW_REG:
 124    case MRF:
 125    case ATTR:
 126       this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
 127       break;
 128    case BAD_FILE:
 129       this->regs_written = 0;
 130       break;
 131    case IMM:
 132    case UNIFORM:
 133       unreachable("Invalid destination register file");
 134    default:
 135       unreachable("Invalid register file");
 136    }
 137
 138    this->writes_accumulator = false;
 139 }
 140
 141 fs_inst::fs_inst()
 142 {
 143    fs_reg *src = ralloc_array(this, fs_reg, 3);
 144    init(BRW_OPCODE_NOP, 8, dst, src, 0);
 145 }
 146
 147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
 148 {
 149    fs_reg *src = ralloc_array(this, fs_reg, 3);
 150    init(opcode, exec_size, reg_undef, src, 0);
 151 }
 152
 153 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
 154 {
 155    fs_reg *src = ralloc_array(this, fs_reg, 3);
 156    init(opcode, 0, dst, src, 0);
 157 }
 158
 159 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 160                  const fs_reg &src0)
 161 {
 162    fs_reg *src = ralloc_array(this, fs_reg, 3);
 163    src[0] = src0;
 164    init(opcode, exec_size, dst, src, 1);
 165 }
 166
 167 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
 168 {
 169    fs_reg *src = ralloc_array(this, fs_reg, 3);
 170    src[0] = src0;
 171    init(opcode, 0, dst, src, 1);
 172 }
 173
 174 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 175                  const fs_reg &src0, const fs_reg &src1)
 176 {
 177    fs_reg *src = ralloc_array(this, fs_reg, 3);
 178    src[0] = src0;
 179    src[1] = src1;
 180    init(opcode, exec_size, dst, src, 2);
 181 }
 182
 183 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 184                  const fs_reg &src1)
 185 {
 186    fs_reg *src = ralloc_array(this, fs_reg, 3);
 187    src[0] = src0;
 188    src[1] = src1;
 189    init(opcode, 0, dst, src, 2);
 190 }
 191
 192 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 193                  const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
 194 {
 195    fs_reg *src = ralloc_array(this, fs_reg, 3);
 196    src[0] = src0;
 197    src[1] = src1;
 198    src[2] = src2;
 199    init(opcode, exec_size, dst, src, 3);
 200 }
 201
 202 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 203                  const fs_reg &src1, const fs_reg &src2)
 204 {
 205    fs_reg *src = ralloc_array(this, fs_reg, 3);
 206    src[0] = src0;
 207    src[1] = src1;
 208    src[2] = src2;
 209    init(opcode, 0, dst, src, 3);
 210 }
 211
 212 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
 213 {
 214    init(opcode, 0, dst, src, sources);
 215 }
 216
 217 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
 218                  fs_reg src[], int sources)
 219 {
 220    init(opcode, exec_width, dst, src, sources);
 221 }
 222
 223 fs_inst::fs_inst(const fs_inst &that)
 224 {
 225    memcpy(this, &that, sizeof(that));
 226
 227    this->src = ralloc_array(this, fs_reg, that.sources);
 228
 229    for (int i = 0; i < that.sources; i++)
 230       this->src[i] = that.src[i];
 231 }
 232
 233 void
 234 fs_inst::resize_sources(uint8_t num_sources)
 235 {
 236    if (this->sources != num_sources) {
 237       this->src = reralloc(this, this->src, fs_reg, num_sources);
 238       this->sources = num_sources;
 239    }
 240 }
 241
 242 #define ALU1(op)                                                        \
 243    fs_inst *                                                            \
 244    fs_visitor::op(const fs_reg &dst, const fs_reg &src0)                \
 245    {                                                                    \
 246       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 247    }
 248
 249 #define ALU2(op)                                                        \
 250    fs_inst *                                                            \
 251    fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
 252                   const fs_reg &src1)                                   \
 253    {                                                                    \
 254       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 255    }
 256
 257 #define ALU2_ACC(op)                                                    \
 258    fs_inst *                                                            \
 259    fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
 260                   const fs_reg &src1)                                   \
 261    {                                                                    \
 262       fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
 263       inst->writes_accumulator = true;                                  \
 264       return inst;                                                      \
 265    }
 266
 267 #define ALU3(op)                                                        \
 268    fs_inst *                                                            \
 269    fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
 270                   const fs_reg &src1, const fs_reg &src2)               \
 271    {                                                                    \
 272       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
 273    }
 274
/* Instantiate the fs_visitor instruction builders.  Each line expands to a
 * fs_visitor::<OP>() member that constructs a BRW_OPCODE_<OP> instruction
 * with one (ALU1), two (ALU2 / ALU2_ACC) or three (ALU3) sources.  The
 * ALU2_ACC variants also set writes_accumulator on the new instruction.
 */
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(SEL)
ALU2(MAC)
 303
 304 /** Gen4 predicated IF. */
 305 fs_inst *
 306 fs_visitor::IF(enum brw_predicate predicate)
 307 {
 308    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
 309    inst->predicate = predicate;
 310    return inst;
 311 }
 312
 313 /** Gen6 IF with embedded comparison. */
 314 fs_inst *
 315 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
 316                enum brw_conditional_mod condition)
 317 {
 318    assert(brw->gen == 6);
 319    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
 320                                         reg_null_d, src0, src1);
 321    inst->conditional_mod = condition;
 322    return inst;
 323 }
 324
 325 /**
 326  * CMP: Sets the low bit of the destination channels with the result
 327  * of the comparison, while the upper bits are undefined, and updates
 328  * the flag register with the packed 16 bits of the result.
 329  */
 330 fs_inst *
 331 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
 332                 enum brw_conditional_mod condition)
 333 {
 334    fs_inst *inst;
 335
 336    /* Take the instruction:
 337     *
 338     * CMP null<d> src0<f> src1<f>
 339     *
 340     * Original gen4 does type conversion to the destination type before
 341     * comparison, producing garbage results for floating point comparisons.
 342     *
 343     * The destination type doesn't matter on newer generations, so we set the
 344     * type to match src0 so we can compact the instruction.
 345     */
 346    dst.type = src0.type;
 347    if (dst.file == HW_REG)
 348       dst.fixed_hw_reg.type = dst.type;
 349
 350    resolve_ud_negate(&src0);
 351    resolve_ud_negate(&src1);
 352
 353    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 354    inst->conditional_mod = condition;
 355
 356    return inst;
 357 }
 358
 359 fs_inst *
 360 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
 361 {
 362    uint8_t exec_size = dst.width;
 363    for (int i = 0; i < sources; ++i) {
 364       assert(src[i].width % dst.width == 0);
 365       if (src[i].width > exec_size)
 366          exec_size = src[i].width;
 367    }
 368
 369    fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
 370                                         dst, src, sources);
 371    inst->regs_written = 0;
 372    for (int i = 0; i < sources; ++i) {
 373       /* The LOAD_PAYLOAD instruction only really makes sense if we are
 374        * dealing with whole registers.  If this ever changes, we can deal
 375        * with it later.
 376        */
 377       int size = src[i].effective_width * type_sz(src[i].type);
 378       assert(size % 32 == 0);
 379       inst->regs_written += (size + 31) / 32;
 380    }
 381
 382    return inst;
 383 }
 384
 385 exec_list
 386 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
 387                                        const fs_reg &surf_index,
 388                                        const fs_reg &varying_offset,
 389                                        uint32_t const_offset)
 390 {
 391    exec_list instructions;
 392    fs_inst *inst;
 393
 394    /* We have our constant surface use a pitch of 4 bytes, so our index can
 395     * be any component of a vector, and then we load 4 contiguous
 396     * components starting from that.
 397     *
 398     * We break down the const_offset to a portion added to the variable
 399     * offset and a portion done using reg_offset, which means that if you
 400     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 401     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 402     * CSE can later notice that those loads are all the same and eliminate
 403     * the redundant ones.
 404     */
 405    fs_reg vec4_offset = vgrf(glsl_type::int_type);
 406    instructions.push_tail(ADD(vec4_offset,
 407                               varying_offset, fs_reg(const_offset & ~3)));
 408
 409    int scale = 1;
 410    if (brw->gen == 4 && dst.width == 8) {
 411       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 412        * u, v, r) as parameters, or we can just use the SIMD16 message
 413        * consisting of (header, u).  We choose the second, at the cost of a
 414        * longer return length.
 415        */
 416       scale = 2;
 417    }
 418
 419    enum opcode op;
 420    if (brw->gen >= 7)
 421       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 422    else
 423       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 424
 425    assert(dst.width % 8 == 0);
 426    int regs_written = 4 * (dst.width / 8) * scale;
 427    fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
 428                                dst.type, dst.width);
 429    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
 430    inst->regs_written = regs_written;
 431    instructions.push_tail(inst);
 432
 433    if (brw->gen < 7) {
 434       inst->base_mrf = 13;
 435       inst->header_present = true;
 436       if (brw->gen == 4)
 437          inst->mlen = 3;
 438       else
 439          inst->mlen = 1 + dispatch_width / 8;
 440    }
 441
 442    fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
 443    instructions.push_tail(MOV(dst, result));
 444
 445    return instructions;
 446 }
 447
 448 /**
 449  * A helper for MOV generation for fixing up broken hardware SEND dependency
 450  * handling.
 451  */
 452 fs_inst *
 453 fs_visitor::DEP_RESOLVE_MOV(int grf)
 454 {
 455    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 456
 457    inst->ir = NULL;
 458    inst->annotation = "send dependency resolve";
 459
 460    /* The caller always wants uncompressed to emit the minimal extra
 461     * dependencies, and to avoid having to deal with aligning its regs to 2.
 462     */
 463    inst->exec_size = 8;
 464
 465    return inst;
 466 }
 467
 468 bool
 469 fs_inst::equals(fs_inst *inst) const
 470 {
 471    return (opcode == inst->opcode &&
 472            dst.equals(inst->dst) &&
 473            src[0].equals(inst->src[0]) &&
 474            src[1].equals(inst->src[1]) &&
 475            src[2].equals(inst->src[2]) &&
 476            saturate == inst->saturate &&
 477            predicate == inst->predicate &&
 478            conditional_mod == inst->conditional_mod &&
 479            mlen == inst->mlen &&
 480            base_mrf == inst->base_mrf &&
 481            target == inst->target &&
 482            eot == inst->eot &&
 483            header_present == inst->header_present &&
 484            shadow_compare == inst->shadow_compare &&
 485            exec_size == inst->exec_size &&
 486            offset == inst->offset);
 487 }
 488
 489 bool
 490 fs_inst::overwrites_reg(const fs_reg &reg) const
 491 {
 492    return (reg.file == dst.file &&
 493            reg.reg == dst.reg &&
 494            reg.reg_offset >= dst.reg_offset  &&
 495            reg.reg_offset < dst.reg_offset + regs_written);
 496 }
 497
 498 bool
 499 fs_inst::is_send_from_grf() const
 500 {
 501    switch (opcode) {
 502    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
 503    case SHADER_OPCODE_SHADER_TIME_ADD:
 504    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
 505    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
 506    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
 507    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 508    case SHADER_OPCODE_UNTYPED_ATOMIC:
 509    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 510    case SHADER_OPCODE_URB_WRITE_SIMD8:
 511       return true;
 512    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 513       return src[1].file == GRF;
 514    case FS_OPCODE_FB_WRITE:
 515       return src[0].file == GRF;
 516    default:
 517       if (is_tex())
 518          return src[0].file == GRF;
 519
 520       return false;
 521    }
 522 }
 523
 524 bool
 525 fs_inst::can_do_source_mods(struct brw_context *brw)
 526 {
 527    if (brw->gen == 6 && is_math())
 528       return false;
 529
 530    if (is_send_from_grf())
 531       return false;
 532
 533    if (!backend_instruction::can_do_source_mods())
 534       return false;
 535
 536    return true;
 537 }
 538
 539 void
 540 fs_reg::init()
 541 {
 542    memset(this, 0, sizeof(*this));
 543    stride = 1;
 544 }
 545
 546 /** Generic unset register constructor. */
 547 fs_reg::fs_reg()
 548 {
 549    init();
 550    this->file = BAD_FILE;
 551 }
 552
 553 /** Immediate value constructor. */
 554 fs_reg::fs_reg(float f)
 555 {
 556    init();
 557    this->file = IMM;
 558    this->type = BRW_REGISTER_TYPE_F;
 559    this->fixed_hw_reg.dw1.f = f;
 560    this->width = 1;
 561 }
 562
 563 /** Immediate value constructor. */
 564 fs_reg::fs_reg(int32_t i)
 565 {
 566    init();
 567    this->file = IMM;
 568    this->type = BRW_REGISTER_TYPE_D;
 569    this->fixed_hw_reg.dw1.d = i;
 570    this->width = 1;
 571 }
 572
 573 /** Immediate value constructor. */
 574 fs_reg::fs_reg(uint32_t u)
 575 {
 576    init();
 577    this->file = IMM;
 578    this->type = BRW_REGISTER_TYPE_UD;
 579    this->fixed_hw_reg.dw1.ud = u;
 580    this->width = 1;
 581 }
 582
 583 /** Vector float immediate value constructor. */
 584 fs_reg::fs_reg(uint8_t vf[4])
 585 {
 586    init();
 587    this->file = IMM;
 588    this->type = BRW_REGISTER_TYPE_VF;
 589    memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
 590 }
 591
 592 /** Vector float immediate value constructor. */
 593 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
 594 {
 595    init();
 596    this->file = IMM;
 597    this->type = BRW_REGISTER_TYPE_VF;
 598    this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
 599                                (vf1 <<  8) |
 600                                (vf2 << 16) |
 601                                (vf3 << 24);
 602 }
 603
 604 /** Fixed brw_reg. */
 605 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 606 {
 607    init();
 608    this->file = HW_REG;
 609    this->fixed_hw_reg = fixed_hw_reg;
 610    this->type = fixed_hw_reg.type;
 611    this->width = 1 << fixed_hw_reg.width;
 612 }
 613
 614 bool
 615 fs_reg::equals(const fs_reg &r) const
 616 {
 617    return (file == r.file &&
 618            reg == r.reg &&
 619            reg_offset == r.reg_offset &&
 620            subreg_offset == r.subreg_offset &&
 621            type == r.type &&
 622            negate == r.negate &&
 623            abs == r.abs &&
 624            !reladdr && !r.reladdr &&
 625            memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
 626            width == r.width &&
 627            stride == r.stride);
 628 }
 629
 630 fs_reg &
 631 fs_reg::set_smear(unsigned subreg)
 632 {
 633    assert(file != HW_REG && file != IMM);
 634    subreg_offset = subreg * type_sz(type);
 635    stride = 0;
 636    return *this;
 637 }
 638
 639 bool
 640 fs_reg::is_contiguous() const
 641 {
 642    return stride == 1;
 643 }
 644
 645 int
 646 fs_visitor::type_size(const struct glsl_type *type)
 647 {
 648    unsigned int size, i;
 649
 650    switch (type->base_type) {
 651    case GLSL_TYPE_UINT:
 652    case GLSL_TYPE_INT:
 653    case GLSL_TYPE_FLOAT:
 654    case GLSL_TYPE_BOOL:
 655       return type->components();
 656    case GLSL_TYPE_ARRAY:
 657       return type_size(type->fields.array) * type->length;
 658    case GLSL_TYPE_STRUCT:
 659       size = 0;
 660       for (i = 0; i < type->length; i++) {
 661          size += type_size(type->fields.structure[i].type);
 662       }
 663       return size;
 664    case GLSL_TYPE_SAMPLER:
 665       /* Samplers take up no register space, since they're baked in at
 666        * link time.
 667        */
 668       return 0;
 669    case GLSL_TYPE_ATOMIC_UINT:
 670       return 0;
 671    case GLSL_TYPE_IMAGE:
 672    case GLSL_TYPE_VOID:
 673    case GLSL_TYPE_ERROR:
 674    case GLSL_TYPE_INTERFACE:
 675       unreachable("not reached");
 676    }
 677
 678    return 0;
 679 }
 680
 681 fs_reg
 682 fs_visitor::get_timestamp()
 683 {
 684    assert(brw->gen >= 7);
 685
 686    fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 687                                           BRW_ARF_TIMESTAMP,
 688                                           0),
 689                              BRW_REGISTER_TYPE_UD));
 690
 691    fs_reg dst = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 4);
 692
 693    fs_inst *mov = emit(MOV(dst, ts));
 694    /* We want to read the 3 fields we care about even if it's not enabled in
 695     * the dispatch.
 696     */
 697    mov->force_writemask_all = true;
 698
 699    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 700     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 701     * which is plenty of time for our purposes.  It is identical across the
 702     * EUs, but since it's tracking GPU core speed it will increment at a
 703     * varying rate as render P-states change.
 704     *
 705     * The caller could also check if render P-states have changed (or anything
 706     * else that might disrupt timing) by setting smear to 2 and checking if
 707     * that field is != 0.
 708     */
 709    dst.set_smear(0);
 710
 711    return dst;
 712 }
 713
 714 void
 715 fs_visitor::emit_shader_time_begin()
 716 {
 717    current_annotation = "shader time start";
 718    shader_start_time = get_timestamp();
 719 }
 720
 721 void
 722 fs_visitor::emit_shader_time_end()
 723 {
 724    current_annotation = "shader time end";
 725
 726    enum shader_time_shader_type type, written_type, reset_type;
 727    if (dispatch_width == 8) {
 728       type = ST_FS8;
 729       written_type = ST_FS8_WRITTEN;
 730       reset_type = ST_FS8_RESET;
 731    } else {
 732       assert(dispatch_width == 16);
 733       type = ST_FS16;
 734       written_type = ST_FS16_WRITTEN;
 735       reset_type = ST_FS16_RESET;
 736    }
 737
 738    fs_reg shader_end_time = get_timestamp();
 739
 740    /* Check that there weren't any timestamp reset events (assuming these
 741     * were the only two timestamp reads that happened).
 742     */
 743    fs_reg reset = shader_end_time;
 744    reset.set_smear(2);
 745    fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
 746    test->conditional_mod = BRW_CONDITIONAL_Z;
 747    emit(IF(BRW_PREDICATE_NORMAL));
 748
 749    fs_reg start = shader_start_time;
 750    start.negate = true;
 751    fs_reg diff = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 1);
 752    emit(ADD(diff, start, shader_end_time));
 753
 754    /* If there were no instructions between the two timestamp gets, the diff
 755     * is 2 cycles.  Remove that overhead, so I can forget about that when
 756     * trying to determine the time taken for single instructions.
 757     */
 758    emit(ADD(diff, diff, fs_reg(-2u)));
 759
 760    emit_shader_time_write(type, diff);
 761    emit_shader_time_write(written_type, fs_reg(1u));
 762    emit(BRW_OPCODE_ELSE);
 763    emit_shader_time_write(reset_type, fs_reg(1u));
 764    emit(BRW_OPCODE_ENDIF);
 765 }
 766
 767 void
 768 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 769                                    fs_reg value)
 770 {
 771    int shader_time_index =
 772       brw_get_shader_time_index(brw, shader_prog, prog, type);
 773    fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
 774
 775    fs_reg payload;
 776    if (dispatch_width == 8)
 777       payload = vgrf(glsl_type::uvec2_type);
 778    else
 779       payload = vgrf(glsl_type::uint_type);
 780
 781    emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
 782                              fs_reg(), payload, offset, value));
 783 }
 784
 785 void
 786 fs_visitor::vfail(const char *format, va_list va)
 787 {
 788    char *msg;
 789
 790    if (failed)
 791       return;
 792
 793    failed = true;
 794
 795    msg = ralloc_vasprintf(mem_ctx, format, va);
 796    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
 797
 798    this->fail_msg = msg;
 799
 800    if (INTEL_DEBUG & DEBUG_WM) {
 801       fprintf(stderr, "%s",  msg);
 802    }
 803 }
 804
 805 void
 806 fs_visitor::fail(const char *format, ...)
 807 {
 808    va_list va;
 809
 810    va_start(va, format);
 811    vfail(format, va);
 812    va_end(va);
 813 }
 814
 815 /**
 816  * Mark this program as impossible to compile in SIMD16 mode.
 817  *
 818  * During the SIMD8 compile (which happens first), we can detect and flag
 819  * things that are unsupported in SIMD16 mode, so the compiler can skip
 820  * the SIMD16 compile altogether.
 821  *
 822  * During a SIMD16 compile (if one happens anyway), this just calls fail().
 823  */
 824 void
 825 fs_visitor::no16(const char *format, ...)
 826 {
 827    va_list va;
 828
 829    va_start(va, format);
 830
 831    if (dispatch_width == 16) {
 832       vfail(format, va);
 833    } else {
 834       simd16_unsupported = true;
 835
 836       if (brw->perf_debug) {
 837          if (no16_msg)
 838             ralloc_vasprintf_append(&no16_msg, format, va);
 839          else
 840             no16_msg = ralloc_vasprintf(mem_ctx, format, va);
 841       }
 842    }
 843
 844    va_end(va);
 845 }
 846
 847 fs_inst *
 848 fs_visitor::emit(enum opcode opcode)
 849 {
 850    return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
 851 }
 852
 853 fs_inst *
 854 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
 855 {
 856    return emit(new(mem_ctx) fs_inst(opcode, dst));
 857 }
 858
 859 fs_inst *
 860 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
 861 {
 862    return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
 863 }
 864
 865 fs_inst *
 866 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 867                  const fs_reg &src1)
 868 {
 869    return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
 870 }
 871
 872 fs_inst *
 873 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 874                  const fs_reg &src1, const fs_reg &src2)
 875 {
 876    return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
 877 }
 878
 879 fs_inst *
 880 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
 881                  fs_reg src[], int sources)
 882 {
 883    return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
 884 }
 885
 886 /**
 887  * Returns true if the instruction has a flag that means it won't
 888  * update an entire destination register.
 889  *
 890  * For example, dead code elimination and live variable analysis want to know
 891  * when a write to a variable screens off any preceding values that were in
 892  * it.
 893  */
 894 bool
 895 fs_inst::is_partial_write() const
 896 {
 897    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 898            (this->dst.width * type_sz(this->dst.type)) < 32 ||
 899            !this->dst.is_contiguous());
 900 }
 901
 902 int
 903 fs_inst::regs_read(fs_visitor *v, int arg) const
 904 {
 905    if (is_tex() && arg == 0 && src[0].file == GRF) {
 906       return mlen;
 907    } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
 908       return mlen;
 909    } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
 910       return mlen;
 911    } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
 912       return mlen;
 913    } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
 914       return mlen;
 915    } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
 916       return mlen;
 917    }
 918
 919    switch (src[arg].file) {
 920    case BAD_FILE:
 921    case UNIFORM:
 922    case IMM:
 923       return 1;
 924    case GRF:
 925    case HW_REG:
 926       if (src[arg].stride == 0) {
 927          return 1;
 928       } else {
 929          int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
 930          return (size + 31) / 32;
 931       }
 932    case MRF:
 933       unreachable("MRF registers are not allowed as sources");
 934    default:
 935       unreachable("Invalid register file");
 936    }
 937 }
 938
 939 bool
 940 fs_inst::reads_flag() const
 941 {
 942    return predicate;
 943 }
 944
 945 bool
 946 fs_inst::writes_flag() const
 947 {
 948    return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
 949                                opcode != BRW_OPCODE_IF &&
 950                                opcode != BRW_OPCODE_WHILE)) ||
 951           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 952 }
 953
 954 /**
 955  * Returns how many MRFs an FS opcode will write over.
 956  *
 957  * Note that this is not the 0 or 1 implied writes in an actual gen
 958  * instruction -- the FS opcodes often generate MOVs in addition.
 959  */
 960 int
 961 fs_visitor::implied_mrf_writes(fs_inst *inst)
 962 {
 963    if (inst->mlen == 0)
 964       return 0;
 965
 966    if (inst->base_mrf == -1)
 967       return 0;
 968
 969    switch (inst->opcode) {
 970    case SHADER_OPCODE_RCP:
 971    case SHADER_OPCODE_RSQ:
 972    case SHADER_OPCODE_SQRT:
 973    case SHADER_OPCODE_EXP2:
 974    case SHADER_OPCODE_LOG2:
 975    case SHADER_OPCODE_SIN:
 976    case SHADER_OPCODE_COS:
 977       return 1 * dispatch_width / 8;
 978    case SHADER_OPCODE_POW:
 979    case SHADER_OPCODE_INT_QUOTIENT:
 980    case SHADER_OPCODE_INT_REMAINDER:
 981       return 2 * dispatch_width / 8;
 982    case SHADER_OPCODE_TEX:
 983    case FS_OPCODE_TXB:
 984    case SHADER_OPCODE_TXD:
 985    case SHADER_OPCODE_TXF:
 986    case SHADER_OPCODE_TXF_CMS:
 987    case SHADER_OPCODE_TXF_MCS:
 988    case SHADER_OPCODE_TG4:
 989    case SHADER_OPCODE_TG4_OFFSET:
 990    case SHADER_OPCODE_TXL:
 991    case SHADER_OPCODE_TXS:
 992    case SHADER_OPCODE_LOD:
 993       return 1;
 994    case FS_OPCODE_FB_WRITE:
 995       return 2;
 996    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 997    case SHADER_OPCODE_GEN4_SCRATCH_READ:
 998       return 1;
 999    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1000       return inst->mlen;
1001    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1002       return 2;
1003    case SHADER_OPCODE_UNTYPED_ATOMIC:
1004    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1005    case SHADER_OPCODE_URB_WRITE_SIMD8:
1006    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1007    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1008    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1009    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1010       return 0;
1011    default:
1012       unreachable("not reached");
1013    }
1014 }
1015
1016 int
1017 fs_visitor::virtual_grf_alloc(int size)
1018 {
1019    if (virtual_grf_array_size <= virtual_grf_count) {
1020       if (virtual_grf_array_size == 0)
1021          virtual_grf_array_size = 16;
1022       else
1023          virtual_grf_array_size *= 2;
1024       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
1025                                    virtual_grf_array_size);
1026    }
1027    virtual_grf_sizes[virtual_grf_count] = size;
1028    return virtual_grf_count++;
1029 }
1030
1031 fs_reg
1032 fs_visitor::vgrf(const glsl_type *const type)
1033 {
1034    int reg_width = dispatch_width / 8;
1035    return fs_reg(GRF, virtual_grf_alloc(type_size(type) * reg_width),
1036                  brw_type_for_base_type(type), dispatch_width);
1037 }
1038
1039 fs_reg
1040 fs_visitor::vgrf(int num_components)
1041 {
1042    int reg_width = dispatch_width / 8;
1043    return fs_reg(GRF, virtual_grf_alloc(num_components * reg_width),
1044                  BRW_REGISTER_TYPE_F, dispatch_width);
1045 }
1046
1047 /** Fixed HW reg constructor. */
1048 fs_reg::fs_reg(enum register_file file, int reg)
1049 {
1050    init();
1051    this->file = file;
1052    this->reg = reg;
1053    this->type = BRW_REGISTER_TYPE_F;
1054
1055    switch (file) {
1056    case UNIFORM:
1057       this->width = 1;
1058       break;
1059    default:
1060       this->width = 8;
1061    }
1062 }
1063
1064 /** Fixed HW reg constructor. */
1065 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1066 {
1067    init();
1068    this->file = file;
1069    this->reg = reg;
1070    this->type = type;
1071
1072    switch (file) {
1073    case UNIFORM:
1074       this->width = 1;
1075       break;
1076    default:
1077       this->width = 8;
1078    }
1079 }
1080
1081 /** Fixed HW reg constructor. */
1082 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1083                uint8_t width)
1084 {
1085    init();
1086    this->file = file;
1087    this->reg = reg;
1088    this->type = type;
1089    this->width = width;
1090 }
1091
1092 fs_reg *
1093 fs_visitor::variable_storage(ir_variable *var)
1094 {
1095    return (fs_reg *)hash_table_find(this->variable_ht, var);
1096 }
1097
1098 void
1099 import_uniforms_callback(const void *key,
1100                          void *data,
1101                          void *closure)
1102 {
1103    struct hash_table *dst_ht = (struct hash_table *)closure;
1104    const fs_reg *reg = (const fs_reg *)data;
1105
1106    if (reg->file != UNIFORM)
1107       return;
1108
1109    hash_table_insert(dst_ht, data, key);
1110 }
1111
1112 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1113  * This brings in those uniform definitions
1114  */
1115 void
1116 fs_visitor::import_uniforms(fs_visitor *v)
1117 {
1118    hash_table_call_foreach(v->variable_ht,
1119                            import_uniforms_callback,
1120                            variable_ht);
1121    this->push_constant_loc = v->push_constant_loc;
1122    this->pull_constant_loc = v->pull_constant_loc;
1123    this->uniforms = v->uniforms;
1124    this->param_size = v->param_size;
1125 }
1126
1127 /* Our support for uniforms is piggy-backed on the struct
1128  * gl_fragment_program, because that's where the values actually
1129  * get stored, rather than in some global gl_shader_program uniform
1130  * store.
1131  */
1132 void
1133 fs_visitor::setup_uniform_values(ir_variable *ir)
1134 {
1135    int namelen = strlen(ir->name);
1136
1137    /* The data for our (non-builtin) uniforms is stored in a series of
1138     * gl_uniform_driver_storage structs for each subcomponent that
1139     * glGetUniformLocation() could name.  We know it's been set up in the same
1140     * order we'd walk the type, so walk the list of storage and find anything
1141     * with our name, or the prefix of a component that starts with our name.
1142     */
1143    unsigned params_before = uniforms;
1144    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1145       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1146
1147       if (strncmp(ir->name, storage->name, namelen) != 0 ||
1148           (storage->name[namelen] != 0 &&
1149            storage->name[namelen] != '.' &&
1150            storage->name[namelen] != '[')) {
1151          continue;
1152       }
1153
1154       unsigned slots = storage->type->component_slots();
1155       if (storage->array_elements)
1156          slots *= storage->array_elements;
1157
1158       for (unsigned i = 0; i < slots; i++) {
1159          stage_prog_data->param[uniforms++] = &storage->storage[i];
1160       }
1161    }
1162
1163    /* Make sure we actually initialized the right amount of stuff here. */
1164    assert(params_before + ir->type->component_slots() == uniforms);
1165    (void)params_before;
1166 }
1167
1168
1169 /* Our support for builtin uniforms is even scarier than non-builtin.
1170  * It sits on top of the PROG_STATE_VAR parameters that are
1171  * automatically updated from GL context state.
1172  */
1173 void
1174 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1175 {
1176    const ir_state_slot *const slots = ir->get_state_slots();
1177    assert(slots != NULL);
1178
1179    for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1180       /* This state reference has already been setup by ir_to_mesa, but we'll
1181        * get the same index back here.
1182        */
1183       int index = _mesa_add_state_reference(this->prog->Parameters,
1184                                             (gl_state_index *)slots[i].tokens);
1185
1186       /* Add each of the unique swizzles of the element as a parameter.
1187        * This'll end up matching the expected layout of the
1188        * array/matrix/structure we're trying to fill in.
1189        */
1190       int last_swiz = -1;
1191       for (unsigned int j = 0; j < 4; j++) {
1192          int swiz = GET_SWZ(slots[i].swizzle, j);
1193          if (swiz == last_swiz)
1194             break;
1195          last_swiz = swiz;
1196
1197          stage_prog_data->param[uniforms++] =
1198             &prog->Parameters->ParameterValues[index][swiz];
1199       }
1200    }
1201 }
1202
1203 fs_reg *
1204 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1205                                          bool origin_upper_left)
1206 {
1207    assert(stage == MESA_SHADER_FRAGMENT);
1208    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1209    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1210    fs_reg wpos = *reg;
1211    bool flip = !origin_upper_left ^ key->render_to_fbo;
1212
1213    /* gl_FragCoord.x */
1214    if (pixel_center_integer) {
1215       emit(MOV(wpos, this->pixel_x));
1216    } else {
1217       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1218    }
1219    wpos = offset(wpos, 1);
1220
1221    /* gl_FragCoord.y */
1222    if (!flip && pixel_center_integer) {
1223       emit(MOV(wpos, this->pixel_y));
1224    } else {
1225       fs_reg pixel_y = this->pixel_y;
1226       float offset = (pixel_center_integer ? 0.0 : 0.5);
1227
1228       if (flip) {
1229          pixel_y.negate = true;
1230          offset += key->drawable_height - 1.0;
1231       }
1232
1233       emit(ADD(wpos, pixel_y, fs_reg(offset)));
1234    }
1235    wpos = offset(wpos, 1);
1236
1237    /* gl_FragCoord.z */
1238    if (brw->gen >= 6) {
1239       emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1240    } else {
1241       emit(FS_OPCODE_LINTERP, wpos,
1242            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1243            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1244            interp_reg(VARYING_SLOT_POS, 2));
1245    }
1246    wpos = offset(wpos, 1);
1247
1248    /* gl_FragCoord.w: Already set up in emit_interpolation */
1249    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1250
1251    return reg;
1252 }
1253
1254 fs_inst *
1255 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1256                          glsl_interp_qualifier interpolation_mode,
1257                          bool is_centroid, bool is_sample)
1258 {
1259    brw_wm_barycentric_interp_mode barycoord_mode;
1260    if (brw->gen >= 6) {
1261       if (is_centroid) {
1262          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1263             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1264          else
1265             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1266       } else if (is_sample) {
1267           if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1268             barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1269          else
1270             barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1271       } else {
1272          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1273             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1274          else
1275             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1276       }
1277    } else {
1278       /* On Ironlake and below, there is only one interpolation mode.
1279        * Centroid interpolation doesn't mean anything on this hardware --
1280        * there is no multisampling.
1281        */
1282       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1283    }
1284    return emit(FS_OPCODE_LINTERP, attr,
1285                this->delta_x[barycoord_mode],
1286                this->delta_y[barycoord_mode], interp);
1287 }
1288
1289 void
1290 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1291                                        const glsl_type *type,
1292                                        glsl_interp_qualifier interpolation_mode,
1293                                        int location, bool mod_centroid,
1294                                        bool mod_sample)
1295 {
1296    attr.type = brw_type_for_base_type(type->get_scalar_type());
1297
1298    assert(stage == MESA_SHADER_FRAGMENT);
1299    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1300    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1301
1302    unsigned int array_elements;
1303
1304    if (type->is_array()) {
1305       array_elements = type->length;
1306       if (array_elements == 0) {
1307          fail("dereferenced array '%s' has length 0\n", name);
1308       }
1309       type = type->fields.array;
1310    } else {
1311       array_elements = 1;
1312    }
1313
1314    if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1315       bool is_gl_Color =
1316          location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1317       if (key->flat_shade && is_gl_Color) {
1318          interpolation_mode = INTERP_QUALIFIER_FLAT;
1319       } else {
1320          interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1321       }
1322    }
1323
1324    for (unsigned int i = 0; i < array_elements; i++) {
1325       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1326          if (prog_data->urb_setup[location] == -1) {
1327             /* If there's no incoming setup data for this slot, don't
1328              * emit interpolation for it.
1329              */
1330             attr = offset(attr, type->vector_elements);
1331             location++;
1332             continue;
1333          }
1334
1335          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1336             /* Constant interpolation (flat shading) case. The SF has
1337              * handed us defined values in only the constant offset
1338              * field of the setup reg.
1339              */
1340             for (unsigned int k = 0; k < type->vector_elements; k++) {
1341                struct brw_reg interp = interp_reg(location, k);
1342                interp = suboffset(interp, 3);
1343                interp.type = attr.type;
1344                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1345                attr = offset(attr, 1);
1346             }
1347          } else {
1348             /* Smooth/noperspective interpolation case. */
1349             for (unsigned int k = 0; k < type->vector_elements; k++) {
1350                struct brw_reg interp = interp_reg(location, k);
1351                if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1352                   /* Get the pixel/sample mask into f0 so that we know
1353                    * which pixels are lit.  Then, for each channel that is
1354                    * unlit, replace the centroid data with non-centroid
1355                    * data.
1356                    */
1357                   emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1358
1359                   fs_inst *inst;
1360                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1361                                       false, false);
1362                   inst->predicate = BRW_PREDICATE_NORMAL;
1363                   inst->predicate_inverse = true;
1364                   if (brw->has_pln)
1365                      inst->no_dd_clear = true;
1366
1367                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1368                                       mod_centroid && !key->persample_shading,
1369                                       mod_sample || key->persample_shading);
1370                   inst->predicate = BRW_PREDICATE_NORMAL;
1371                   inst->predicate_inverse = false;
1372                   if (brw->has_pln)
1373                      inst->no_dd_check = true;
1374
1375                } else {
1376                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
1377                                mod_centroid && !key->persample_shading,
1378                                mod_sample || key->persample_shading);
1379                }
1380                if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1381                   emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1382                }
1383                attr = offset(attr, 1);
1384             }
1385
1386          }
1387          location++;
1388       }
1389    }
1390 }
1391
1392 fs_reg *
1393 fs_visitor::emit_frontfacing_interpolation()
1394 {
1395    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1396
1397    if (brw->gen >= 6) {
1398       /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1399        * a boolean result from this (~0/true or 0/false).
1400        *
1401        * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1402        * this task in only one instruction:
1403        *    - a negation source modifier will flip the bit; and
1404        *    - a W -> D type conversion will sign extend the bit into the high
1405        *      word of the destination.
1406        *
1407        * An ASR 15 fills the low word of the destination.
1408        */
1409       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1410       g0.negate = true;
1411
1412       emit(ASR(*reg, g0, fs_reg(15)));
1413    } else {
1414       /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1415        * a boolean result from this (1/true or 0/false).
1416        *
1417        * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1418        * the negation source modifier to flip it. Unfortunately the SHR
1419        * instruction only operates on UD (or D with an abs source modifier)
1420        * sources without negation.
1421        *
1422        * Instead, use ASR (which will give ~0/true or 0/false).
1423        */
1424       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1425       g1_6.negate = true;
1426
1427       emit(ASR(*reg, g1_6, fs_reg(31)));
1428    }
1429
1430    return reg;
1431 }
1432
1433 void
1434 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1435 {
1436    assert(stage == MESA_SHADER_FRAGMENT);
1437    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1438    assert(dst.type == BRW_REGISTER_TYPE_F);
1439
1440    if (key->compute_pos_offset) {
1441       /* Convert int_sample_pos to floating point */
1442       emit(MOV(dst, int_sample_pos));
1443       /* Scale to the range [0, 1] */
1444       emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1445    }
1446    else {
1447       /* From ARB_sample_shading specification:
1448        * "When rendering to a non-multisample buffer, or if multisample
1449        *  rasterization is disabled, gl_SamplePosition will always be
1450        *  (0.5, 0.5).
1451        */
1452       emit(MOV(dst, fs_reg(0.5f)));
1453    }
1454 }
1455
1456 fs_reg *
1457 fs_visitor::emit_samplepos_setup()
1458 {
1459    assert(brw->gen >= 6);
1460
1461    this->current_annotation = "compute sample position";
1462    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1463    fs_reg pos = *reg;
1464    fs_reg int_sample_x = vgrf(glsl_type::int_type);
1465    fs_reg int_sample_y = vgrf(glsl_type::int_type);
1466
1467    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1468     * mode will be enabled.
1469     *
1470     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1471     * R31.1:0         Position Offset X/Y for Slot[3:0]
1472     * R31.3:2         Position Offset X/Y for Slot[7:4]
1473     * .....
1474     *
1475     * The X, Y sample positions come in as bytes in  thread payload. So, read
1476     * the positions using vstride=16, width=8, hstride=2.
1477     */
1478    struct brw_reg sample_pos_reg =
1479       stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1480                     BRW_REGISTER_TYPE_B), 16, 8, 2);
1481
1482    if (dispatch_width == 8) {
1483       emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1484    } else {
1485       emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1486       emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1487          ->force_sechalf = true;
1488    }
1489    /* Compute gl_SamplePosition.x */
1490    compute_sample_position(pos, int_sample_x);
1491    pos = offset(pos, 1);
1492    if (dispatch_width == 8) {
1493       emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1494    } else {
1495       emit(MOV(half(int_sample_y, 0),
1496                fs_reg(suboffset(sample_pos_reg, 1))));
1497       emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1498          ->force_sechalf = true;
1499    }
1500    /* Compute gl_SamplePosition.y */
1501    compute_sample_position(pos, int_sample_y);
1502    return reg;
1503 }
1504
1505 fs_reg *
1506 fs_visitor::emit_sampleid_setup()
1507 {
1508    assert(stage == MESA_SHADER_FRAGMENT);
1509    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1510    assert(brw->gen >= 6);
1511
1512    this->current_annotation = "compute sample id";
1513    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1514
1515    if (key->compute_sample_id) {
1516       fs_reg t1 = vgrf(glsl_type::int_type);
1517       fs_reg t2 = vgrf(glsl_type::int_type);
1518       t2.type = BRW_REGISTER_TYPE_UW;
1519
1520       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1521        * 8x multisampling, subspan 0 will represent sample N (where N
1522        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1523        * 7. We can find the value of N by looking at R0.0 bits 7:6
1524        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1525        * (since samples are always delivered in pairs). That is, we
1526        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1527        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1528        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1529        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1530        * populating a temporary variable with the sequence (0, 1, 2, 3),
1531        * and then reading from it using vstride=1, width=4, hstride=0.
1532        * These computations hold good for 4x multisampling as well.
1533        *
1534        * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1535        * the first four slots are sample 0 of subspan 0; the next four
1536        * are sample 1 of subspan 0; the third group is sample 0 of
1537        * subspan 1, and finally sample 1 of subspan 1.
1538        */
1539       fs_inst *inst;
1540       inst = emit(BRW_OPCODE_AND, t1,
1541                   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1542                   fs_reg(0xc0));
1543       inst->force_writemask_all = true;
1544       inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1545       inst->force_writemask_all = true;
1546       /* This works for both SIMD8 and SIMD16 */
1547       inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1548       inst->force_writemask_all = true;
1549       /* This special instruction takes care of setting vstride=1,
1550        * width=4, hstride=0 of t2 during an ADD instruction.
1551        */
1552       emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1553    } else {
1554       /* As per GL_ARB_sample_shading specification:
1555        * "When rendering to a non-multisample buffer, or if multisample
1556        *  rasterization is disabled, gl_SampleID will always be zero."
1557        */
1558       emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1559    }
1560
1561    return reg;
1562 }
1563
1564 fs_reg
1565 fs_visitor::fix_math_operand(fs_reg src)
1566 {
1567    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1568     * might be able to do better by doing execsize = 1 math and then
1569     * expanding that result out, but we would need to be careful with
1570     * masking.
1571     *
1572     * The hardware ignores source modifiers (negate and abs) on math
1573     * instructions, so we also move to a temp to set those up.
1574     */
1575    if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1576        !src.abs && !src.negate)
1577       return src;
1578
1579    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1580     * operands to math
1581     */
1582    if (brw->gen >= 7 && src.file != IMM)
1583       return src;
1584
1585    fs_reg expanded = vgrf(glsl_type::float_type);
1586    expanded.type = src.type;
1587    emit(BRW_OPCODE_MOV, expanded, src);
1588    return expanded;
1589 }
1590
1591 fs_inst *
1592 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1593 {
1594    switch (opcode) {
1595    case SHADER_OPCODE_RCP:
1596    case SHADER_OPCODE_RSQ:
1597    case SHADER_OPCODE_SQRT:
1598    case SHADER_OPCODE_EXP2:
1599    case SHADER_OPCODE_LOG2:
1600    case SHADER_OPCODE_SIN:
1601    case SHADER_OPCODE_COS:
1602       break;
1603    default:
1604       unreachable("not reached: bad math opcode");
1605    }
1606
1607    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
1608     * might be able to do better by doing execsize = 1 math and then
1609     * expanding that result out, but we would need to be careful with
1610     * masking.
1611     *
1612     * Gen 6 hardware ignores source modifiers (negate and abs) on math
1613     * instructions, so we also move to a temp to set those up.
1614     */
1615    if (brw->gen == 6 || brw->gen == 7)
1616       src = fix_math_operand(src);
1617
1618    fs_inst *inst = emit(opcode, dst, src);
1619
1620    if (brw->gen < 6) {
1621       inst->base_mrf = 2;
1622       inst->mlen = dispatch_width / 8;
1623    }
1624
1625    return inst;
1626 }
1627
1628 fs_inst *
1629 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1630 {
1631    int base_mrf = 2;
1632    fs_inst *inst;
1633
1634    if (brw->gen >= 8) {
1635       inst = emit(opcode, dst, src0, src1);
1636    } else if (brw->gen >= 6) {
1637       src0 = fix_math_operand(src0);
1638       src1 = fix_math_operand(src1);
1639
1640       inst = emit(opcode, dst, src0, src1);
1641    } else {
1642       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1643        * "Message Payload":
1644        *
1645        * "Operand0[7].  For the INT DIV functions, this operand is the
1646        *  denominator."
1647        *  ...
1648        * "Operand1[7].  For the INT DIV functions, this operand is the
1649        *  numerator."
1650        */
1651       bool is_int_div = opcode != SHADER_OPCODE_POW;
1652       fs_reg &op0 = is_int_div ? src1 : src0;
1653       fs_reg &op1 = is_int_div ? src0 : src1;
1654
1655       emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1656       inst = emit(opcode, dst, op0, reg_null_f);
1657
1658       inst->base_mrf = base_mrf;
1659       inst->mlen = 2 * dispatch_width / 8;
1660    }
1661    return inst;
1662 }
1663
1664 void
1665 fs_visitor::assign_curb_setup()
1666 {
1667    if (dispatch_width == 8) {
1668       prog_data->dispatch_grf_start_reg = payload.num_regs;
1669    } else {
1670       assert(stage == MESA_SHADER_FRAGMENT);
1671       brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1672       prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1673    }
1674
1675    prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1676
1677    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1678    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1679       for (unsigned int i = 0; i < inst->sources; i++) {
1680          if (inst->src[i].file == UNIFORM) {
1681             int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1682             int constant_nr;
1683             if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1684                constant_nr = push_constant_loc[uniform_nr];
1685             } else {
1686                /* Section 5.11 of the OpenGL 4.1 spec says:
1687                 * "Out-of-bounds reads return undefined values, which include
1688                 *  values from other variables of the active program or zero."
1689                 * Just return the first push constant.
1690                 */
1691                constant_nr = 0;
1692             }
1693
1694             struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1695                                                   constant_nr / 8,
1696                                                   constant_nr % 8);
1697
1698             inst->src[i].file = HW_REG;
1699             inst->src[i].fixed_hw_reg = byte_offset(
1700                retype(brw_reg, inst->src[i].type),
1701                inst->src[i].subreg_offset);
1702          }
1703       }
1704    }
1705 }
1706
1707 void
1708 fs_visitor::calculate_urb_setup()
1709 {
1710    assert(stage == MESA_SHADER_FRAGMENT);
1711    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1712    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1713
1714    memset(prog_data->urb_setup, -1,
1715           sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1716
1717    int urb_next = 0;
1718    /* Figure out where each of the incoming setup attributes lands. */
1719    if (brw->gen >= 6) {
1720       if (_mesa_bitcount_64(prog->InputsRead &
1721                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1722          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1723           * first 16 varying inputs, so we can put them wherever we want.
1724           * Just put them in order.
1725           *
1726           * This is useful because it means that (a) inputs not used by the
1727           * fragment shader won't take up valuable register space, and (b) we
1728           * won't have to recompile the fragment shader if it gets paired with
1729           * a different vertex (or geometry) shader.
1730           */
1731          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1732             if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1733                 BITFIELD64_BIT(i)) {
1734                prog_data->urb_setup[i] = urb_next++;
1735             }
1736          }
1737       } else {
1738          /* We have enough input varyings that the SF/SBE pipeline stage can't
1739           * arbitrarily rearrange them to suit our whim; we have to put them
1740           * in an order that matches the output of the previous pipeline stage
1741           * (geometry or vertex shader).
1742           */
1743          struct brw_vue_map prev_stage_vue_map;
1744          brw_compute_vue_map(brw, &prev_stage_vue_map,
1745                              key->input_slots_valid);
1746          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1747          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1748          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1749               slot++) {
1750             int varying = prev_stage_vue_map.slot_to_varying[slot];
1751             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1752              * unused.
1753              */
1754             if (varying != BRW_VARYING_SLOT_COUNT &&
1755                 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1756                  BITFIELD64_BIT(varying))) {
1757                prog_data->urb_setup[varying] = slot - first_slot;
1758             }
1759          }
1760          urb_next = prev_stage_vue_map.num_slots - first_slot;
1761       }
1762    } else {
1763       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1764       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1765          /* Point size is packed into the header, not as a general attribute */
1766          if (i == VARYING_SLOT_PSIZ)
1767             continue;
1768
1769          if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1770             /* The back color slot is skipped when the front color is
1771              * also written to.  In addition, some slots can be
1772              * written in the vertex shader and not read in the
1773              * fragment shader.  So the register number must always be
1774              * incremented, mapped or not.
1775              */
1776             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1777                prog_data->urb_setup[i] = urb_next;
1778             urb_next++;
1779          }
1780       }
1781
1782       /*
1783        * It's a FS only attribute, and we did interpolation for this attribute
1784        * in SF thread. So, count it here, too.
1785        *
1786        * See compile_sf_prog() for more info.
1787        */
1788       if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1789          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1790    }
1791
1792    prog_data->num_varying_inputs = urb_next;
1793 }
1794
1795 void
1796 fs_visitor::assign_urb_setup()
1797 {
1798    assert(stage == MESA_SHADER_FRAGMENT);
1799    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1800
1801    int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1802
1803    /* Offset all the urb_setup[] index by the actual position of the
1804     * setup regs, now that the location of the constants has been chosen.
1805     */
1806    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1807       if (inst->opcode == FS_OPCODE_LINTERP) {
1808          assert(inst->src[2].file == HW_REG);
1809          inst->src[2].fixed_hw_reg.nr += urb_start;
1810       }
1811
1812       if (inst->opcode == FS_OPCODE_CINTERP) {
1813          assert(inst->src[0].file == HW_REG);
1814          inst->src[0].fixed_hw_reg.nr += urb_start;
1815       }
1816    }
1817
1818    /* Each attribute is 4 setup channels, each of which is half a reg. */
1819    this->first_non_payload_grf =
1820       urb_start + prog_data->num_varying_inputs * 2;
1821 }
1822
1823 void
1824 fs_visitor::assign_vs_urb_setup()
1825 {
1826    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1827    int grf, count, slot, channel, attr;
1828
1829    assert(stage == MESA_SHADER_VERTEX);
1830    count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1831    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1832       count++;
1833
1834    /* Each attribute is 4 regs. */
1835    this->first_non_payload_grf =
1836       payload.num_regs + prog_data->curb_read_length + count * 4;
1837
1838    unsigned vue_entries =
1839       MAX2(count, vs_prog_data->base.vue_map.num_slots);
1840
1841    vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1842    vs_prog_data->base.urb_read_length = (count + 1) / 2;
1843
1844    assert(vs_prog_data->base.urb_read_length <= 15);
1845
1846    /* Rewrite all ATTR file references to the hw grf that they land in. */
1847    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1848       for (int i = 0; i < inst->sources; i++) {
1849          if (inst->src[i].file == ATTR) {
1850
1851             if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1852                slot = count - 1;
1853             } else {
1854                /* Attributes come in in a contiguous block, ordered by their
1855                 * gl_vert_attrib value.  That means we can compute the slot
1856                 * number for an attribute by masking out the enabled
1857                 * attributes before it and counting the bits.
1858                 */
1859                attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1860                slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1861                                         BITFIELD64_MASK(attr));
1862             }
1863
1864             channel = inst->src[i].reg_offset & 3;
1865
1866             grf = payload.num_regs +
1867                prog_data->curb_read_length +
1868                slot * 4 + channel;
1869
1870             inst->src[i].file = HW_REG;
1871             inst->src[i].fixed_hw_reg =
1872                retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1873          }
1874       }
1875    }
1876 }
1877
1878 /**
1879  * Split large virtual GRFs into separate components if we can.
1880  *
1881  * This is mostly duplicated with what brw_fs_vector_splitting does,
1882  * but that's really conservative because it's afraid of doing
1883  * splitting that doesn't result in real progress after the rest of
1884  * the optimization phases, which would cause infinite looping in
1885  * optimization.  We can do it once here, safely.  This also has the
1886  * opportunity to split interpolated values, or maybe even uniforms,
1887  * which we don't have at the IR level.
1888  *
1889  * We want to split, because virtual GRFs are what we register
1890  * allocate and spill (due to contiguousness requirements for some
1891  * instructions), and they're what we naturally generate in the
1892  * codegen process, but most virtual GRFs don't actually need to be
1893  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1894  * live intervals and better dead code elimination and coalescing.
1895  */
1896 void
1897 fs_visitor::split_virtual_grfs()
1898 {
1899    int num_vars = this->virtual_grf_count;
1900
1901    /* Count the total number of registers */
1902    int reg_count = 0;
1903    int vgrf_to_reg[num_vars];
1904    for (int i = 0; i < num_vars; i++) {
1905       vgrf_to_reg[i] = reg_count;
1906       reg_count += virtual_grf_sizes[i];
1907    }
1908
1909    /* An array of "split points".  For each register slot, this indicates
1910     * if this slot can be separated from the previous slot.  Every time an
1911     * instruction uses multiple elements of a register (as a source or
1912     * destination), we mark the used slots as inseparable.  Then we go
1913     * through and split the registers into the smallest pieces we can.
1914     */
1915    bool split_points[reg_count];
1916    memset(split_points, 0, sizeof(split_points));
1917
1918    /* Mark all used registers as fully splittable */
1919    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1920       if (inst->dst.file == GRF) {
1921          int reg = vgrf_to_reg[inst->dst.reg];
1922          for (int j = 1; j < this->virtual_grf_sizes[inst->dst.reg]; j++)
1923             split_points[reg + j] = true;
1924       }
1925
1926       for (int i = 0; i < inst->sources; i++) {
1927          if (inst->src[i].file == GRF) {
1928             int reg = vgrf_to_reg[inst->src[i].reg];
1929             for (int j = 1; j < this->virtual_grf_sizes[inst->src[i].reg]; j++)
1930                split_points[reg + j] = true;
1931          }
1932       }
1933    }
1934
1935    if (brw->has_pln &&
1936        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1937       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1938        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1939        * Gen6, that was the only supported interpolation mode, and since Gen6,
1940        * delta_x and delta_y are in fixed hardware registers.
1941        */
1942       int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1943       split_points[vgrf_to_reg[vgrf] + 1] = false;
1944    }
1945
1946    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1947       if (inst->dst.file == GRF) {
1948          int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1949          for (int j = 1; j < inst->regs_written; j++)
1950             split_points[reg + j] = false;
1951       }
1952       for (int i = 0; i < inst->sources; i++) {
1953          if (inst->src[i].file == GRF) {
1954             int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1955             for (int j = 1; j < inst->regs_read(this, i); j++)
1956                split_points[reg + j] = false;
1957          }
1958       }
1959    }
1960
1961    int new_virtual_grf[reg_count];
1962    int new_reg_offset[reg_count];
1963
1964    int reg = 0;
1965    for (int i = 0; i < num_vars; i++) {
1966       /* The first one should always be 0 as a quick sanity check. */
1967       assert(split_points[reg] == false);
1968
1969       /* j = 0 case */
1970       new_reg_offset[reg] = 0;
1971       reg++;
1972       int offset = 1;
1973
1974       /* j > 0 case */
1975       for (int j = 1; j < virtual_grf_sizes[i]; j++) {
1976          /* If this is a split point, reset the offset to 0 and allocate a
1977           * new virtual GRF for the previous offset many registers
1978           */
1979          if (split_points[reg]) {
1980             assert(offset <= MAX_VGRF_SIZE);
1981             int grf = virtual_grf_alloc(offset);
1982             for (int k = reg - offset; k < reg; k++)
1983                new_virtual_grf[k] = grf;
1984             offset = 0;
1985          }
1986          new_reg_offset[reg] = offset;
1987          offset++;
1988          reg++;
1989       }
1990
1991       /* The last one gets the original register number */
1992       assert(offset <= MAX_VGRF_SIZE);
1993       virtual_grf_sizes[i] = offset;
1994       for (int k = reg - offset; k < reg; k++)
1995          new_virtual_grf[k] = i;
1996    }
1997    assert(reg == reg_count);
1998
1999    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2000       if (inst->dst.file == GRF) {
2001          reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2002          inst->dst.reg = new_virtual_grf[reg];
2003          inst->dst.reg_offset = new_reg_offset[reg];
2004          assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
2005       }
2006       for (int i = 0; i < inst->sources; i++) {
2007          if (inst->src[i].file == GRF) {
2008             reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2009             inst->src[i].reg = new_virtual_grf[reg];
2010             inst->src[i].reg_offset = new_reg_offset[reg];
2011             assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
2012          }
2013       }
2014    }
2015    invalidate_live_intervals();
2016 }
2017
2018 /**
2019  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2020  *
2021  * During code generation, we create tons of temporary variables, many of
2022  * which get immediately killed and are never used again.  Yet, in later
2023  * optimization and analysis passes, such as compute_live_intervals, we need
2024  * to loop over all the virtual GRFs.  Compacting them can save a lot of
2025  * overhead.
2026  */
2027 bool
2028 fs_visitor::compact_virtual_grfs()
2029 {
2030    bool progress = false;
2031    int remap_table[this->virtual_grf_count];
2032    memset(remap_table, -1, sizeof(remap_table));
2033
2034    /* Mark which virtual GRFs are used. */
2035    foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2036       if (inst->dst.file == GRF)
2037          remap_table[inst->dst.reg] = 0;
2038
2039       for (int i = 0; i < inst->sources; i++) {
2040          if (inst->src[i].file == GRF)
2041             remap_table[inst->src[i].reg] = 0;
2042       }
2043    }
2044
2045    /* Compact the GRF arrays. */
2046    int new_index = 0;
2047    for (int i = 0; i < this->virtual_grf_count; i++) {
2048       if (remap_table[i] == -1) {
2049          /* We just found an unused register.  This means that we are
2050           * actually going to compact something.
2051           */
2052          progress = true;
2053       } else {
2054          remap_table[i] = new_index;
2055          virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
2056          invalidate_live_intervals();
2057          ++new_index;
2058       }
2059    }
2060
2061    this->virtual_grf_count = new_index;
2062
2063    /* Patch all the instructions to use the newly renumbered registers */
2064    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2065       if (inst->dst.file == GRF)
2066          inst->dst.reg = remap_table[inst->dst.reg];
2067
2068       for (int i = 0; i < inst->sources; i++) {
2069          if (inst->src[i].file == GRF)
2070             inst->src[i].reg = remap_table[inst->src[i].reg];
2071       }
2072    }
2073
2074    /* Patch all the references to delta_x/delta_y, since they're used in
2075     * register allocation.  If they're unused, switch them to BAD_FILE so
2076     * we don't think some random VGRF is delta_x/delta_y.
2077     */
2078    for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2079       if (delta_x[i].file == GRF) {
2080          if (remap_table[delta_x[i].reg] != -1) {
2081             delta_x[i].reg = remap_table[delta_x[i].reg];
2082          } else {
2083             delta_x[i].file = BAD_FILE;
2084          }
2085       }
2086    }
2087    for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2088       if (delta_y[i].file == GRF) {
2089          if (remap_table[delta_y[i].reg] != -1) {
2090             delta_y[i].reg = remap_table[delta_y[i].reg];
2091          } else {
2092             delta_y[i].file = BAD_FILE;
2093          }
2094       }
2095    }
2096
2097    return progress;
2098 }
2099
2100 /*
2101  * Implements array access of uniforms by inserting a
2102  * PULL_CONSTANT_LOAD instruction.
2103  *
2104  * Unlike temporary GRF array access (where we don't support it due to
2105  * the difficulty of doing relative addressing on instruction
2106  * destinations), we could potentially do array access of uniforms
2107  * that were loaded in GRF space as push constants.  In real-world
2108  * usage we've seen, though, the arrays being used are always larger
2109  * than we could load as push constants, so just always move all
2110  * uniform array access out to a pull constant buffer.
2111  */
2112 void
2113 fs_visitor::move_uniform_array_access_to_pull_constants()
2114 {
2115    if (dispatch_width != 8)
2116       return;
2117
2118    pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2119    memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2120
2121    /* Walk through and find array access of uniforms.  Put a copy of that
2122     * uniform in the pull constant buffer.
2123     *
2124     * Note that we don't move constant-indexed accesses to arrays.  No
2125     * testing has been done of the performance impact of this choice.
2126     */
2127    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2128       for (int i = 0 ; i < inst->sources; i++) {
2129          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2130             continue;
2131
2132          int uniform = inst->src[i].reg;
2133
2134          /* If this array isn't already present in the pull constant buffer,
2135           * add it.
2136           */
2137          if (pull_constant_loc[uniform] == -1) {
2138             const gl_constant_value **values = &stage_prog_data->param[uniform];
2139
2140             assert(param_size[uniform]);
2141
2142             for (int j = 0; j < param_size[uniform]; j++) {
2143                pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2144
2145                stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2146                   values[j];
2147             }
2148          }
2149       }
2150    }
2151 }
2152
2153 /**
2154  * Assign UNIFORM file registers to either push constants or pull constants.
2155  *
2156  * We allow a fragment shader to have more than the specified minimum
2157  * maximum number of fragment shader uniform components (64).  If
2158  * there are too many of these, they'd fill up all of register space.
2159  * So, this will push some of them out to the pull constant buffer and
2160  * update the program to load them.
2161  */
2162 void
2163 fs_visitor::assign_constant_locations()
2164 {
2165    /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2166    if (dispatch_width != 8)
2167       return;
2168
2169    /* Find which UNIFORM registers are still in use. */
2170    bool is_live[uniforms];
2171    for (unsigned int i = 0; i < uniforms; i++) {
2172       is_live[i] = false;
2173    }
2174
2175    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2176       for (int i = 0; i < inst->sources; i++) {
2177          if (inst->src[i].file != UNIFORM)
2178             continue;
2179
2180          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2181          if (constant_nr >= 0 && constant_nr < (int) uniforms)
2182             is_live[constant_nr] = true;
2183       }
2184    }
2185
2186    /* Only allow 16 registers (128 uniform components) as push constants.
2187     *
2188     * Just demote the end of the list.  We could probably do better
2189     * here, demoting things that are rarely used in the program first.
2190     *
2191     * If changing this value, note the limitation about total_regs in
2192     * brw_curbe.c.
2193     */
2194    unsigned int max_push_components = 16 * 8;
2195    unsigned int num_push_constants = 0;
2196
2197    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2198
2199    for (unsigned int i = 0; i < uniforms; i++) {
2200       if (!is_live[i] || pull_constant_loc[i] != -1) {
2201          /* This UNIFORM register is either dead, or has already been demoted
2202           * to a pull const.  Mark it as no longer living in the param[] array.
2203           */
2204          push_constant_loc[i] = -1;
2205          continue;
2206       }
2207
2208       if (num_push_constants < max_push_components) {
2209          /* Retain as a push constant.  Record the location in the params[]
2210           * array.
2211           */
2212          push_constant_loc[i] = num_push_constants++;
2213       } else {
2214          /* Demote to a pull constant. */
2215          push_constant_loc[i] = -1;
2216
2217          int pull_index = stage_prog_data->nr_pull_params++;
2218          stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2219          pull_constant_loc[i] = pull_index;
2220       }
2221    }
2222
2223    stage_prog_data->nr_params = num_push_constants;
2224
2225    /* Up until now, the param[] array has been indexed by reg + reg_offset
2226     * of UNIFORM registers.  Condense it to only contain the uniforms we
2227     * chose to upload as push constants.
2228     */
2229    for (unsigned int i = 0; i < uniforms; i++) {
2230       int remapped = push_constant_loc[i];
2231
2232       if (remapped == -1)
2233          continue;
2234
2235       assert(remapped <= (int)i);
2236       stage_prog_data->param[remapped] = stage_prog_data->param[i];
2237    }
2238 }
2239
2240 /**
2241  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2242  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2243  */
2244 void
2245 fs_visitor::demote_pull_constants()
2246 {
2247    foreach_block_and_inst (block, fs_inst, inst, cfg) {
2248       for (int i = 0; i < inst->sources; i++) {
2249          if (inst->src[i].file != UNIFORM)
2250             continue;
2251
2252          int pull_index = pull_constant_loc[inst->src[i].reg +
2253                                             inst->src[i].reg_offset];
2254          if (pull_index == -1)
2255             continue;
2256
2257          /* Set up the annotation tracking for new generated instructions. */
2258          base_ir = inst->ir;
2259          current_annotation = inst->annotation;
2260
2261          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2262          fs_reg dst = vgrf(glsl_type::float_type);
2263
2264          /* Generate a pull load into dst. */
2265          if (inst->src[i].reladdr) {
2266             exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2267                                                         surf_index,
2268                                                         *inst->src[i].reladdr,
2269                                                         pull_index);
2270             inst->insert_before(block, &list);
2271             inst->src[i].reladdr = NULL;
2272          } else {
2273             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2274             fs_inst *pull =
2275                new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2276                                     dst, surf_index, offset);
2277             inst->insert_before(block, pull);
2278             inst->src[i].set_smear(pull_index & 3);
2279          }
2280
2281          /* Rewrite the instruction to use the temporary VGRF. */
2282          inst->src[i].file = GRF;
2283          inst->src[i].reg = dst.reg;
2284          inst->src[i].reg_offset = 0;
2285          inst->src[i].width = dispatch_width;
2286       }
2287    }
2288    invalidate_live_intervals();
2289 }
2290
2291 bool
2292 fs_visitor::opt_algebraic()
2293 {
2294    bool progress = false;
2295
2296    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2297       switch (inst->opcode) {
2298       case BRW_OPCODE_MOV:
2299          if (inst->src[0].file != IMM)
2300             break;
2301
2302          if (inst->saturate) {
2303             if (inst->dst.type != inst->src[0].type)
2304                assert(!"unimplemented: saturate mixed types");
2305
2306             if (brw_saturate_immediate(inst->dst.type,
2307                                        &inst->src[0].fixed_hw_reg)) {
2308                inst->saturate = false;
2309                progress = true;
2310             }
2311          }
2312          break;
2313
2314       case BRW_OPCODE_MUL:
2315          if (inst->src[1].file != IMM)
2316             continue;
2317
2318          /* a * 1.0 = a */
2319          if (inst->src[1].is_one()) {
2320             inst->opcode = BRW_OPCODE_MOV;
2321             inst->src[1] = reg_undef;
2322             progress = true;
2323             break;
2324          }
2325
2326          /* a * 0.0 = 0.0 */
2327          if (inst->src[1].is_zero()) {
2328             inst->opcode = BRW_OPCODE_MOV;
2329             inst->src[0] = inst->src[1];
2330             inst->src[1] = reg_undef;
2331             progress = true;
2332             break;
2333          }
2334
2335          break;
2336       case BRW_OPCODE_ADD:
2337          if (inst->src[1].file != IMM)
2338             continue;
2339
2340          /* a + 0.0 = a */
2341          if (inst->src[1].is_zero()) {
2342             inst->opcode = BRW_OPCODE_MOV;
2343             inst->src[1] = reg_undef;
2344             progress = true;
2345             break;
2346          }
2347          break;
2348       case BRW_OPCODE_OR:
2349          if (inst->src[0].equals(inst->src[1])) {
2350             inst->opcode = BRW_OPCODE_MOV;
2351             inst->src[1] = reg_undef;
2352             progress = true;
2353             break;
2354          }
2355          break;
2356       case BRW_OPCODE_LRP:
2357          if (inst->src[1].equals(inst->src[2])) {
2358             inst->opcode = BRW_OPCODE_MOV;
2359             inst->src[0] = inst->src[1];
2360             inst->src[1] = reg_undef;
2361             inst->src[2] = reg_undef;
2362             progress = true;
2363             break;
2364          }
2365          break;
2366       case BRW_OPCODE_CMP:
2367          if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2368              inst->src[0].abs &&
2369              inst->src[0].negate &&
2370              inst->src[1].is_zero()) {
2371             inst->src[0].abs = false;
2372             inst->src[0].negate = false;
2373             inst->conditional_mod = BRW_CONDITIONAL_Z;
2374             progress = true;
2375             break;
2376          }
2377          break;
2378       case BRW_OPCODE_SEL:
2379          if (inst->src[0].equals(inst->src[1])) {
2380             inst->opcode = BRW_OPCODE_MOV;
2381             inst->src[1] = reg_undef;
2382             inst->predicate = BRW_PREDICATE_NONE;
2383             inst->predicate_inverse = false;
2384             progress = true;
2385          } else if (inst->saturate && inst->src[1].file == IMM) {
2386             switch (inst->conditional_mod) {
2387             case BRW_CONDITIONAL_LE:
2388             case BRW_CONDITIONAL_L:
2389                switch (inst->src[1].type) {
2390                case BRW_REGISTER_TYPE_F:
2391                   if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2392                      inst->opcode = BRW_OPCODE_MOV;
2393                      inst->src[1] = reg_undef;
2394                      progress = true;
2395                   }
2396                   break;
2397                default:
2398                   break;
2399                }
2400                break;
2401             case BRW_CONDITIONAL_GE:
2402             case BRW_CONDITIONAL_G:
2403                switch (inst->src[1].type) {
2404                case BRW_REGISTER_TYPE_F:
2405                   if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2406                      inst->opcode = BRW_OPCODE_MOV;
2407                      inst->src[1] = reg_undef;
2408                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2409                      progress = true;
2410                   }
2411                   break;
2412                default:
2413                   break;
2414                }
2415             default:
2416                break;
2417             }
2418          }
2419          break;
2420       case SHADER_OPCODE_RCP: {
2421          fs_inst *prev = (fs_inst *)inst->prev;
2422          if (prev->opcode == SHADER_OPCODE_SQRT) {
2423             if (inst->src[0].equals(prev->dst)) {
2424                inst->opcode = SHADER_OPCODE_RSQ;
2425                inst->src[0] = prev->src[0];
2426                progress = true;
2427             }
2428          }
2429          break;
2430       }
2431       default:
2432          break;
2433       }
2434    }
2435
2436    return progress;
2437 }
2438
2439 bool
2440 fs_visitor::opt_register_renaming()
2441 {
2442    bool progress = false;
2443    int depth = 0;
2444
2445    int remap[virtual_grf_count];
2446    memset(remap, -1, sizeof(int) * virtual_grf_count);
2447
2448    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2449       if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2450          depth++;
2451       } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2452                  inst->opcode == BRW_OPCODE_WHILE) {
2453          depth--;
2454       }
2455
2456       /* Rewrite instruction sources. */
2457       for (int i = 0; i < inst->sources; i++) {
2458          if (inst->src[i].file == GRF &&
2459              remap[inst->src[i].reg] != -1 &&
2460              remap[inst->src[i].reg] != inst->src[i].reg) {
2461             inst->src[i].reg = remap[inst->src[i].reg];
2462             progress = true;
2463          }
2464       }
2465
2466       const int dst = inst->dst.reg;
2467
2468       if (depth == 0 &&
2469           inst->dst.file == GRF &&
2470           virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
2471           !inst->is_partial_write()) {
2472          if (remap[dst] == -1) {
2473             remap[dst] = dst;
2474          } else {
2475             remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
2476             inst->dst.reg = remap[dst];
2477             progress = true;
2478          }
2479       } else if (inst->dst.file == GRF &&
2480                  remap[dst] != -1 &&
2481                  remap[dst] != dst) {
2482          inst->dst.reg = remap[dst];
2483          progress = true;
2484       }
2485    }
2486
2487    if (progress) {
2488       invalidate_live_intervals();
2489
2490       for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2491          if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2492             delta_x[i].reg = remap[delta_x[i].reg];
2493          }
2494       }
2495       for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2496          if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2497             delta_y[i].reg = remap[delta_y[i].reg];
2498          }
2499       }
2500    }
2501
2502    return progress;
2503 }
2504
2505 bool
2506 fs_visitor::compute_to_mrf()
2507 {
2508    bool progress = false;
2509    int next_ip = 0;
2510
2511    /* No MRFs on Gen >= 7. */
2512    if (brw->gen >= 7)
2513       return false;
2514
2515    calculate_live_intervals();
2516
2517    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2518       int ip = next_ip;
2519       next_ip++;
2520
2521       if (inst->opcode != BRW_OPCODE_MOV ||
2522           inst->is_partial_write() ||
2523           inst->dst.file != MRF || inst->src[0].file != GRF ||
2524           inst->dst.type != inst->src[0].type ||
2525           inst->src[0].abs || inst->src[0].negate ||
2526           !inst->src[0].is_contiguous() ||
2527           inst->src[0].subreg_offset)
2528          continue;
2529
2530       /* Work out which hardware MRF registers are written by this
2531        * instruction.
2532        */
2533       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2534       int mrf_high;
2535       if (inst->dst.reg & BRW_MRF_COMPR4) {
2536          mrf_high = mrf_low + 4;
2537       } else if (inst->exec_size == 16) {
2538          mrf_high = mrf_low + 1;
2539       } else {
2540          mrf_high = mrf_low;
2541       }
2542
2543       /* Can't compute-to-MRF this GRF if someone else was going to
2544        * read it later.
2545        */
2546       if (this->virtual_grf_end[inst->src[0].reg] > ip)
2547          continue;
2548
2549       /* Found a move of a GRF to a MRF.  Let's see if we can go
2550        * rewrite the thing that made this GRF to write into the MRF.
2551        */
2552       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2553          if (scan_inst->dst.file == GRF &&
2554              scan_inst->dst.reg == inst->src[0].reg) {
2555             /* Found the last thing to write our reg we want to turn
2556              * into a compute-to-MRF.
2557              */
2558
2559             /* If this one instruction didn't populate all the
2560              * channels, bail.  We might be able to rewrite everything
2561              * that writes that reg, but it would require smarter
2562              * tracking to delay the rewriting until complete success.
2563              */
2564             if (scan_inst->is_partial_write())
2565                break;
2566
2567             /* Things returning more than one register would need us to
2568              * understand coalescing out more than one MOV at a time.
2569              */
2570             if (scan_inst->regs_written > scan_inst->dst.width / 8)
2571                break;
2572
2573             /* SEND instructions can't have MRF as a destination. */
2574             if (scan_inst->mlen)
2575                break;
2576
2577             if (brw->gen == 6) {
2578                /* gen6 math instructions must have the destination be
2579                 * GRF, so no compute-to-MRF for them.
2580                 */
2581                if (scan_inst->is_math()) {
2582                   break;
2583                }
2584             }
2585
2586             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2587                /* Found the creator of our MRF's source value. */
2588                scan_inst->dst.file = MRF;
2589                scan_inst->dst.reg = inst->dst.reg;
2590                scan_inst->saturate |= inst->saturate;
2591                inst->remove(block);
2592                progress = true;
2593             }
2594             break;
2595          }
2596
2597          /* We don't handle control flow here.  Most computation of
2598           * values that end up in MRFs are shortly before the MRF
2599           * write anyway.
2600           */
2601          if (block->start() == scan_inst)
2602             break;
2603
2604          /* You can't read from an MRF, so if someone else reads our
2605           * MRF's source GRF that we wanted to rewrite, that stops us.
2606           */
2607          bool interfered = false;
2608          for (int i = 0; i < scan_inst->sources; i++) {
2609             if (scan_inst->src[i].file == GRF &&
2610                 scan_inst->src[i].reg == inst->src[0].reg &&
2611                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2612                interfered = true;
2613             }
2614          }
2615          if (interfered)
2616             break;
2617
2618          if (scan_inst->dst.file == MRF) {
2619             /* If somebody else writes our MRF here, we can't
2620              * compute-to-MRF before that.
2621              */
2622             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2623             int scan_mrf_high;
2624
2625             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2626                scan_mrf_high = scan_mrf_low + 4;
2627             } else if (scan_inst->exec_size == 16) {
2628                scan_mrf_high = scan_mrf_low + 1;
2629             } else {
2630                scan_mrf_high = scan_mrf_low;
2631             }
2632
2633             if (mrf_low == scan_mrf_low ||
2634                 mrf_low == scan_mrf_high ||
2635                 mrf_high == scan_mrf_low ||
2636                 mrf_high == scan_mrf_high) {
2637                break;
2638             }
2639          }
2640
2641          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2642             /* Found a SEND instruction, which means that there are
2643              * live values in MRFs from base_mrf to base_mrf +
2644              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2645              * above it.
2646              */
2647             if (mrf_low >= scan_inst->base_mrf &&
2648                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2649                break;
2650             }
2651             if (mrf_high >= scan_inst->base_mrf &&
2652                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2653                break;
2654             }
2655          }
2656       }
2657    }
2658
2659    if (progress)
2660       invalidate_live_intervals();
2661
2662    return progress;
2663 }
2664
2665 /**
2666  * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2667  * instructions to FS_OPCODE_REP_FB_WRITE.
2668  */
2669 void
2670 fs_visitor::emit_repclear_shader()
2671 {
2672    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2673    int base_mrf = 1;
2674    int color_mrf = base_mrf + 2;
2675
2676    fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2677                            fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2678    mov->force_writemask_all = true;
2679
2680    fs_inst *write;
2681    if (key->nr_color_regions == 1) {
2682       write = emit(FS_OPCODE_REP_FB_WRITE);
2683       write->saturate = key->clamp_fragment_color;
2684       write->base_mrf = color_mrf;
2685       write->target = 0;
2686       write->header_present = false;
2687       write->mlen = 1;
2688    } else {
2689       assume(key->nr_color_regions > 0);
2690       for (int i = 0; i < key->nr_color_regions; ++i) {
2691          write = emit(FS_OPCODE_REP_FB_WRITE);
2692          write->saturate = key->clamp_fragment_color;
2693          write->base_mrf = base_mrf;
2694          write->target = i;
2695          write->header_present = true;
2696          write->mlen = 3;
2697       }
2698    }
2699    write->eot = true;
2700
2701    calculate_cfg();
2702
2703    assign_constant_locations();
2704    assign_curb_setup();
2705
2706    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2707    assert(mov->src[0].file == HW_REG);
2708    mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2709 }
2710
2711 /**
2712  * Walks through basic blocks, looking for repeated MRF writes and
2713  * removing the later ones.
2714  */
2715 bool
2716 fs_visitor::remove_duplicate_mrf_writes()
2717 {
2718    fs_inst *last_mrf_move[16];
2719    bool progress = false;
2720
2721    /* Need to update the MRF tracking for compressed instructions. */
2722    if (dispatch_width == 16)
2723       return false;
2724
2725    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2726
2727    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2728       if (inst->is_control_flow()) {
2729          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2730       }
2731
2732       if (inst->opcode == BRW_OPCODE_MOV &&
2733           inst->dst.file == MRF) {
2734          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2735          if (prev_inst && inst->equals(prev_inst)) {
2736             inst->remove(block);
2737             progress = true;
2738             continue;
2739          }
2740       }
2741
2742       /* Clear out the last-write records for MRFs that were overwritten. */
2743       if (inst->dst.file == MRF) {
2744          last_mrf_move[inst->dst.reg] = NULL;
2745       }
2746
2747       if (inst->mlen > 0 && inst->base_mrf != -1) {
2748          /* Found a SEND instruction, which will include two or fewer
2749           * implied MRF writes.  We could do better here.
2750           */
2751          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2752             last_mrf_move[inst->base_mrf + i] = NULL;
2753          }
2754       }
2755
2756       /* Clear out any MRF move records whose sources got overwritten. */
2757       if (inst->dst.file == GRF) {
2758          for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2759             if (last_mrf_move[i] &&
2760                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2761                last_mrf_move[i] = NULL;
2762             }
2763          }
2764       }
2765
2766       if (inst->opcode == BRW_OPCODE_MOV &&
2767           inst->dst.file == MRF &&
2768           inst->src[0].file == GRF &&
2769           !inst->is_partial_write()) {
2770          last_mrf_move[inst->dst.reg] = inst;
2771       }
2772    }
2773
2774    if (progress)
2775       invalidate_live_intervals();
2776
2777    return progress;
2778 }
2779
2780 static void
2781 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2782                         int first_grf, int grf_len)
2783 {
2784    /* Clear the flag for registers that actually got read (as expected). */
2785    for (int i = 0; i < inst->sources; i++) {
2786       int grf;
2787       if (inst->src[i].file == GRF) {
2788          grf = inst->src[i].reg;
2789       } else if (inst->src[i].file == HW_REG &&
2790                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2791          grf = inst->src[i].fixed_hw_reg.nr;
2792       } else {
2793          continue;
2794       }
2795
2796       if (grf >= first_grf &&
2797           grf < first_grf + grf_len) {
2798          deps[grf - first_grf] = false;
2799          if (inst->exec_size == 16)
2800             deps[grf - first_grf + 1] = false;
2801       }
2802    }
2803 }
2804
2805 /**
2806  * Implements this workaround for the original 965:
2807  *
2808  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2809  *      check for post destination dependencies on this instruction, software
2810  *      must ensure that there is no destination hazard for the case of ‘write
2811  *      followed by a posted write’ shown in the following example.
2812  *
2813  *      1. mov r3 0
2814  *      2. send r3.xy <rest of send instruction>
2815  *      3. mov r2 r3
2816  *
2817  *      Due to no post-destination dependency check on the ‘send’, the above
2818  *      code sequence could have two instructions (1 and 2) in flight at the
2819  *      same time that both consider ‘r3’ as the target of their final writes.
2820  */
2821 void
2822 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2823                                                         fs_inst *inst)
2824 {
2825    int write_len = inst->regs_written;
2826    int first_write_grf = inst->dst.reg;
2827    bool needs_dep[BRW_MAX_MRF];
2828    assert(write_len < (int)sizeof(needs_dep) - 1);
2829
2830    memset(needs_dep, false, sizeof(needs_dep));
2831    memset(needs_dep, true, write_len);
2832
2833    clear_deps_for_inst_src(inst, dispatch_width,
2834                            needs_dep, first_write_grf, write_len);
2835
2836    /* Walk backwards looking for writes to registers we're writing which
2837     * aren't read since being written.  If we hit the start of the program,
2838     * we assume that there are no outstanding dependencies on entry to the
2839     * program.
2840     */
2841    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2842       /* If we hit control flow, assume that there *are* outstanding
2843        * dependencies, and force their cleanup before our instruction.
2844        */
2845       if (block->start() == scan_inst) {
2846          for (int i = 0; i < write_len; i++) {
2847             if (needs_dep[i]) {
2848                inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2849             }
2850          }
2851          return;
2852       }
2853
2854       /* We insert our reads as late as possible on the assumption that any
2855        * instruction but a MOV that might have left us an outstanding
2856        * dependency has more latency than a MOV.
2857        */
2858       if (scan_inst->dst.file == GRF) {
2859          for (int i = 0; i < scan_inst->regs_written; i++) {
2860             int reg = scan_inst->dst.reg + i;
2861
2862             if (reg >= first_write_grf &&
2863                 reg < first_write_grf + write_len &&
2864                 needs_dep[reg - first_write_grf]) {
2865                inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2866                needs_dep[reg - first_write_grf] = false;
2867                if (scan_inst->exec_size == 16)
2868                   needs_dep[reg - first_write_grf + 1] = false;
2869             }
2870          }
2871       }
2872
2873       /* Clear the flag for registers that actually got read (as expected). */
2874       clear_deps_for_inst_src(scan_inst, dispatch_width,
2875                               needs_dep, first_write_grf, write_len);
2876
2877       /* Continue the loop only if we haven't resolved all the dependencies */
2878       int i;
2879       for (i = 0; i < write_len; i++) {
2880          if (needs_dep[i])
2881             break;
2882       }
2883       if (i == write_len)
2884          return;
2885    }
2886 }
2887
2888 /**
2889  * Implements this workaround for the original 965:
2890  *
2891  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2892  *      used as a destination register until after it has been sourced by an
2893  *      instruction with a different destination register.
2894  */
2895 void
2896 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2897 {
2898    int write_len = inst->regs_written;
2899    int first_write_grf = inst->dst.reg;
2900    bool needs_dep[BRW_MAX_MRF];
2901    assert(write_len < (int)sizeof(needs_dep) - 1);
2902
2903    memset(needs_dep, false, sizeof(needs_dep));
2904    memset(needs_dep, true, write_len);
2905    /* Walk forwards looking for writes to registers we're writing which aren't
2906     * read before being written.
2907     */
2908    foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2909       /* If we hit control flow, force resolve all remaining dependencies. */
2910       if (block->end() == scan_inst) {
2911          for (int i = 0; i < write_len; i++) {
2912             if (needs_dep[i])
2913                scan_inst->insert_before(block,
2914                                         DEP_RESOLVE_MOV(first_write_grf + i));
2915          }
2916          return;
2917       }
2918
2919       /* Clear the flag for registers that actually got read (as expected). */
2920       clear_deps_for_inst_src(scan_inst, dispatch_width,
2921                               needs_dep, first_write_grf, write_len);
2922
2923       /* We insert our reads as late as possible since they're reading the
2924        * result of a SEND, which has massive latency.
2925        */
2926       if (scan_inst->dst.file == GRF &&
2927           scan_inst->dst.reg >= first_write_grf &&
2928           scan_inst->dst.reg < first_write_grf + write_len &&
2929           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2930          scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2931          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2932       }
2933
2934       /* Continue the loop only if we haven't resolved all the dependencies */
2935       int i;
2936       for (i = 0; i < write_len; i++) {
2937          if (needs_dep[i])
2938             break;
2939       }
2940       if (i == write_len)
2941          return;
2942    }
2943
2944    /* If we hit the end of the program, resolve all remaining dependencies out
2945     * of paranoia.
2946     */
2947    fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2948    assert(last_inst->eot);
2949    for (int i = 0; i < write_len; i++) {
2950       if (needs_dep[i])
2951          last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2952    }
2953 }
2954
2955 void
2956 fs_visitor::insert_gen4_send_dependency_workarounds()
2957 {
2958    if (brw->gen != 4 || brw->is_g4x)
2959       return;
2960
2961    bool progress = false;
2962
2963    /* Note that we're done with register allocation, so GRF fs_regs always
2964     * have a .reg_offset of 0.
2965     */
2966
2967    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2968       if (inst->mlen != 0 && inst->dst.file == GRF) {
2969          insert_gen4_pre_send_dependency_workarounds(block, inst);
2970          insert_gen4_post_send_dependency_workarounds(block, inst);
2971          progress = true;
2972       }
2973    }
2974
2975    if (progress)
2976       invalidate_live_intervals();
2977 }
2978
2979 /**
2980  * Turns the generic expression-style uniform pull constant load instruction
2981  * into a hardware-specific series of instructions for loading a pull
2982  * constant.
2983  *
2984  * The expression style allows the CSE pass before this to optimize out
2985  * repeated loads from the same offset, and gives the pre-register-allocation
2986  * scheduling full flexibility, while the conversion to native instructions
2987  * allows the post-register-allocation scheduler the best information
2988  * possible.
2989  *
2990  * Note that execution masking for setting up pull constant loads is special:
2991  * the channels that need to be written are unrelated to the current execution
2992  * mask, since a later instruction will use one of the result channels as a
2993  * source operand for all 8 or 16 of its channels.
2994  */
2995 void
2996 fs_visitor::lower_uniform_pull_constant_loads()
2997 {
2998    foreach_block_and_inst (block, fs_inst, inst, cfg) {
2999       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3000          continue;
3001
3002       if (brw->gen >= 7) {
3003          /* The offset arg before was a vec4-aligned byte offset.  We need to
3004           * turn it into a dword offset.
3005           */
3006          fs_reg const_offset_reg = inst->src[1];
3007          assert(const_offset_reg.file == IMM &&
3008                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3009          const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3010          fs_reg payload = vgrf(glsl_type::uint_type);
3011
3012          /* We have to use a message header on Skylake to get SIMD4x2 mode.
3013           * Reserve space for the register.
3014           */
3015          if (brw->gen >= 9) {
3016             payload.reg_offset++;
3017             virtual_grf_sizes[payload.reg] = 2;
3018          }
3019
3020          /* This is actually going to be a MOV, but since only the first dword
3021           * is accessed, we have a special opcode to do just that one.  Note
3022           * that this needs to be an operation that will be considered a def
3023           * by live variable analysis, or register allocation will explode.
3024           */
3025          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3026                                                8, payload, const_offset_reg);
3027          setup->force_writemask_all = true;
3028
3029          setup->ir = inst->ir;
3030          setup->annotation = inst->annotation;
3031          inst->insert_before(block, setup);
3032
3033          /* Similarly, this will only populate the first 4 channels of the
3034           * result register (since we only use smear values from 0-3), but we
3035           * don't tell the optimizer.
3036           */
3037          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3038          inst->src[1] = payload;
3039
3040          invalidate_live_intervals();
3041       } else {
3042          /* Before register allocation, we didn't tell the scheduler about the
3043           * MRF we use.  We know it's safe to use this MRF because nothing
3044           * else does except for register spill/unspill, which generates and
3045           * uses its MRF within a single IR instruction.
3046           */
3047          inst->base_mrf = 14;
3048          inst->mlen = 1;
3049       }
3050    }
3051 }
3052
3053 bool
3054 fs_visitor::lower_load_payload()
3055 {
3056    bool progress = false;
3057
3058    int vgrf_to_reg[virtual_grf_count];
3059    int reg_count = 16; /* Leave room for MRF */
3060    for (int i = 0; i < virtual_grf_count; ++i) {
3061       vgrf_to_reg[i] = reg_count;
3062       reg_count += virtual_grf_sizes[i];
3063    }
3064
3065    struct {
3066       bool written:1; /* Whether this register has ever been written */
3067       bool force_writemask_all:1;
3068       bool force_sechalf:1;
3069    } metadata[reg_count];
3070    memset(metadata, 0, sizeof(metadata));
3071
3072    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3073       int dst_reg;
3074       if (inst->dst.file == GRF) {
3075          dst_reg = vgrf_to_reg[inst->dst.reg];
3076       } else {
3077          /* MRF */
3078          dst_reg = inst->dst.reg;
3079       }
3080
3081       if (inst->dst.file == MRF || inst->dst.file == GRF) {
3082          bool force_sechalf = inst->force_sechalf;
3083          bool toggle_sechalf = inst->dst.width == 16 &&
3084                                type_sz(inst->dst.type) == 4;
3085          for (int i = 0; i < inst->regs_written; ++i) {
3086             metadata[dst_reg + i].written = true;
3087             metadata[dst_reg + i].force_sechalf = force_sechalf;
3088             metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3089             force_sechalf = (toggle_sechalf != force_sechalf);
3090          }
3091       }
3092
3093       if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3094          assert(inst->dst.file == MRF || inst->dst.file == GRF);
3095          fs_reg dst = inst->dst;
3096
3097          for (int i = 0; i < inst->sources; i++) {
3098             dst.width = inst->src[i].effective_width;
3099             dst.type = inst->src[i].type;
3100
3101             if (inst->src[i].file == BAD_FILE) {
3102                /* Do nothing but otherwise increment as normal */
3103             } else if (dst.file == MRF &&
3104                        dst.width == 8 &&
3105                        brw->has_compr4 &&
3106                        i + 4 < inst->sources &&
3107                        inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3108                fs_reg compr4_dst = dst;
3109                compr4_dst.reg += BRW_MRF_COMPR4;
3110                compr4_dst.width = 16;
3111                fs_reg compr4_src = inst->src[i];
3112                compr4_src.width = 16;
3113                fs_inst *mov = MOV(compr4_dst, compr4_src);
3114                mov->force_writemask_all = true;
3115                inst->insert_before(block, mov);
3116                /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3117                inst->src[i + 4].file = BAD_FILE;
3118             } else {
3119                fs_inst *mov = MOV(dst, inst->src[i]);
3120                if (inst->src[i].file == GRF) {
3121                   int src_reg = vgrf_to_reg[inst->src[i].reg] +
3122                                 inst->src[i].reg_offset;
3123                   mov->force_sechalf = metadata[src_reg].force_sechalf;
3124                   mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3125                   metadata[dst_reg] = metadata[src_reg];
3126                   if (dst.width * type_sz(dst.type) > 32) {
3127                      assert((!metadata[src_reg].written ||
3128                              !metadata[src_reg].force_sechalf) &&
3129                             (!metadata[src_reg + 1].written ||
3130                              metadata[src_reg + 1].force_sechalf));
3131                      metadata[dst_reg + 1] = metadata[src_reg + 1];
3132                   }
3133                } else {
3134                   metadata[dst_reg].force_writemask_all = false;
3135                   metadata[dst_reg].force_sechalf = false;
3136                   if (dst.width == 16) {
3137                      metadata[dst_reg + 1].force_writemask_all = false;
3138                      metadata[dst_reg + 1].force_sechalf = true;
3139                   }
3140                }
3141                inst->insert_before(block, mov);
3142             }
3143
3144             dst = offset(dst, 1);
3145          }
3146
3147          inst->remove(block);
3148          progress = true;
3149       }
3150    }
3151
3152    if (progress)
3153       invalidate_live_intervals();
3154
3155    return progress;
3156 }
3157
3158 void
3159 fs_visitor::dump_instructions()
3160 {
3161    dump_instructions(NULL);
3162 }
3163
3164 void
3165 fs_visitor::dump_instructions(const char *name)
3166 {
3167    calculate_register_pressure();
3168    FILE *file = stderr;
3169    if (name && geteuid() != 0) {
3170       file = fopen(name, "w");
3171       if (!file)
3172          file = stderr;
3173    }
3174
3175    int ip = 0, max_pressure = 0;
3176    foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3177       max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3178       fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3179       dump_instruction(inst, file);
3180       ++ip;
3181    }
3182    fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3183
3184    if (file != stderr) {
3185       fclose(file);
3186    }
3187 }
3188
3189 void
3190 fs_visitor::dump_instruction(backend_instruction *be_inst)
3191 {
3192    dump_instruction(be_inst, stderr);
3193 }
3194
3195 void
3196 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3197 {
3198    fs_inst *inst = (fs_inst *)be_inst;
3199
3200    if (inst->predicate) {
3201       fprintf(file, "(%cf0.%d) ",
3202              inst->predicate_inverse ? '-' : '+',
3203              inst->flag_subreg);
3204    }
3205
3206    fprintf(file, "%s", brw_instruction_name(inst->opcode));
3207    if (inst->saturate)
3208       fprintf(file, ".sat");
3209    if (inst->conditional_mod) {
3210       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3211       if (!inst->predicate &&
3212           (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3213                               inst->opcode != BRW_OPCODE_IF &&
3214                               inst->opcode != BRW_OPCODE_WHILE))) {
3215          fprintf(file, ".f0.%d", inst->flag_subreg);
3216       }
3217    }
3218    fprintf(file, "(%d) ", inst->exec_size);
3219
3220
3221    switch (inst->dst.file) {
3222    case GRF:
3223       fprintf(file, "vgrf%d", inst->dst.reg);
3224       if (inst->dst.width != dispatch_width)
3225          fprintf(file, "@%d", inst->dst.width);
3226       if (virtual_grf_sizes[inst->dst.reg] != inst->dst.width / 8 ||
3227           inst->dst.subreg_offset)
3228          fprintf(file, "+%d.%d",
3229                  inst->dst.reg_offset, inst->dst.subreg_offset);
3230       break;
3231    case MRF:
3232       fprintf(file, "m%d", inst->dst.reg);
3233       break;
3234    case BAD_FILE:
3235       fprintf(file, "(null)");
3236       break;
3237    case UNIFORM:
3238       fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3239       break;
3240    case ATTR:
3241       fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3242       break;
3243    case HW_REG:
3244       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3245          switch (inst->dst.fixed_hw_reg.nr) {
3246          case BRW_ARF_NULL:
3247             fprintf(file, "null");
3248             break;
3249          case BRW_ARF_ADDRESS:
3250             fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3251             break;
3252          case BRW_ARF_ACCUMULATOR:
3253             fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3254             break;
3255          case BRW_ARF_FLAG:
3256             fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3257                              inst->dst.fixed_hw_reg.subnr);
3258             break;
3259          default:
3260             fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3261                                inst->dst.fixed_hw_reg.subnr);
3262             break;
3263          }
3264       } else {
3265          fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3266       }
3267       if (inst->dst.fixed_hw_reg.subnr)
3268          fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3269       break;
3270    default:
3271       fprintf(file, "???");
3272       break;
3273    }
3274    fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3275
3276    for (int i = 0; i < inst->sources; i++) {
3277       if (inst->src[i].negate)
3278          fprintf(file, "-");
3279       if (inst->src[i].abs)
3280          fprintf(file, "|");
3281       switch (inst->src[i].file) {
3282       case GRF:
3283          fprintf(file, "vgrf%d", inst->src[i].reg);
3284          if (inst->src[i].width != dispatch_width)
3285             fprintf(file, "@%d", inst->src[i].width);
3286          if (virtual_grf_sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3287              inst->src[i].subreg_offset)
3288             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3289                     inst->src[i].subreg_offset);
3290          break;
3291       case MRF:
3292          fprintf(file, "***m%d***", inst->src[i].reg);
3293          break;
3294       case ATTR:
3295          fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3296          break;
3297       case UNIFORM:
3298          fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3299          if (inst->src[i].reladdr) {
3300             fprintf(file, "+reladdr");
3301          } else if (inst->src[i].subreg_offset) {
3302             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3303                     inst->src[i].subreg_offset);
3304          }
3305          break;
3306       case BAD_FILE:
3307          fprintf(file, "(null)");
3308          break;
3309       case IMM:
3310          switch (inst->src[i].type) {
3311          case BRW_REGISTER_TYPE_F:
3312             fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3313             break;
3314          case BRW_REGISTER_TYPE_D:
3315             fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3316             break;
3317          case BRW_REGISTER_TYPE_UD:
3318             fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3319             break;
3320          case BRW_REGISTER_TYPE_VF:
3321             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3322                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
3323                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
3324                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3325                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3326             break;
3327          default:
3328             fprintf(file, "???");
3329             break;
3330          }
3331          break;
3332       case HW_REG:
3333          if (inst->src[i].fixed_hw_reg.negate)
3334             fprintf(file, "-");
3335          if (inst->src[i].fixed_hw_reg.abs)
3336             fprintf(file, "|");
3337          if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3338             switch (inst->src[i].fixed_hw_reg.nr) {
3339             case BRW_ARF_NULL:
3340                fprintf(file, "null");
3341                break;
3342             case BRW_ARF_ADDRESS:
3343                fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3344                break;
3345             case BRW_ARF_ACCUMULATOR:
3346                fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3347                break;
3348             case BRW_ARF_FLAG:
3349                fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3350                                 inst->src[i].fixed_hw_reg.subnr);
3351                break;
3352             default:
3353                fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3354                                   inst->src[i].fixed_hw_reg.subnr);
3355                break;
3356             }
3357          } else {
3358             fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3359          }
3360          if (inst->src[i].fixed_hw_reg.subnr)
3361             fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3362          if (inst->src[i].fixed_hw_reg.abs)
3363             fprintf(file, "|");
3364          break;
3365       default:
3366          fprintf(file, "???");
3367          break;
3368       }
3369       if (inst->src[i].abs)
3370          fprintf(file, "|");
3371
3372       if (inst->src[i].file != IMM) {
3373          fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3374       }
3375
3376       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3377          fprintf(file, ", ");
3378    }
3379
3380    fprintf(file, " ");
3381
3382    if (dispatch_width == 16 && inst->exec_size == 8) {
3383       if (inst->force_sechalf)
3384          fprintf(file, "2ndhalf ");
3385       else
3386          fprintf(file, "1sthalf ");
3387    }
3388
3389    fprintf(file, "\n");
3390 }
3391
3392 /**
3393  * Possibly returns an instruction that set up @param reg.
3394  *
3395  * Sometimes we want to take the result of some expression/variable
3396  * dereference tree and rewrite the instruction generating the result
3397  * of the tree.  When processing the tree, we know that the
3398  * instructions generated are all writing temporaries that are dead
3399  * outside of this tree.  So, if we have some instructions that write
3400  * a temporary, we're free to point that temp write somewhere else.
3401  *
3402  * Note that this doesn't guarantee that the instruction generated
3403  * only reg -- it might be the size=4 destination of a texture instruction.
3404  */
3405 fs_inst *
3406 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3407                                            fs_inst *end,
3408                                            const fs_reg &reg)
3409 {
3410    if (end == start ||
3411        end->is_partial_write() ||
3412        reg.reladdr ||
3413        !reg.equals(end->dst)) {
3414       return NULL;
3415    } else {
3416       return end;
3417    }
3418 }
3419
3420 void
3421 fs_visitor::setup_payload_gen6()
3422 {
3423    bool uses_depth =
3424       (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3425    unsigned barycentric_interp_modes =
3426       (stage == MESA_SHADER_FRAGMENT) ?
3427       ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3428
3429    assert(brw->gen >= 6);
3430
3431    /* R0-1: masks, pixel X/Y coordinates. */
3432    payload.num_regs = 2;
3433    /* R2: only for 32-pixel dispatch.*/
3434
3435    /* R3-26: barycentric interpolation coordinates.  These appear in the
3436     * same order that they appear in the brw_wm_barycentric_interp_mode
3437     * enum.  Each set of coordinates occupies 2 registers if dispatch width
3438     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
3439     * appear if they were enabled using the "Barycentric Interpolation
3440     * Mode" bits in WM_STATE.
3441     */
3442    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3443       if (barycentric_interp_modes & (1 << i)) {
3444          payload.barycentric_coord_reg[i] = payload.num_regs;
3445          payload.num_regs += 2;
3446          if (dispatch_width == 16) {
3447             payload.num_regs += 2;
3448          }
3449       }
3450    }
3451
3452    /* R27: interpolated depth if uses source depth */
3453    if (uses_depth) {
3454       payload.source_depth_reg = payload.num_regs;
3455       payload.num_regs++;
3456       if (dispatch_width == 16) {
3457          /* R28: interpolated depth if not SIMD8. */
3458          payload.num_regs++;
3459       }
3460    }
3461    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3462    if (uses_depth) {
3463       payload.source_w_reg = payload.num_regs;
3464       payload.num_regs++;
3465       if (dispatch_width == 16) {
3466          /* R30: interpolated W if not SIMD8. */
3467          payload.num_regs++;
3468       }
3469    }
3470
3471    if (stage == MESA_SHADER_FRAGMENT) {
3472       brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3473       brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3474       prog_data->uses_pos_offset = key->compute_pos_offset;
3475       /* R31: MSAA position offsets. */
3476       if (prog_data->uses_pos_offset) {
3477          payload.sample_pos_reg = payload.num_regs;
3478          payload.num_regs++;
3479       }
3480    }
3481
3482    /* R32: MSAA input coverage mask */
3483    if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3484       assert(brw->gen >= 7);
3485       payload.sample_mask_in_reg = payload.num_regs;
3486       payload.num_regs++;
3487       if (dispatch_width == 16) {
3488          /* R33: input coverage mask if not SIMD8. */
3489          payload.num_regs++;
3490       }
3491    }
3492
3493    /* R34-: bary for 32-pixel. */
3494    /* R58-59: interp W for 32-pixel. */
3495
3496    if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3497       source_depth_to_render_target = true;
3498    }
3499 }
3500
3501 void
3502 fs_visitor::setup_vs_payload()
3503 {
3504    /* R0: thread header, R1: urb handles */
3505    payload.num_regs = 2;
3506 }
3507
3508 void
3509 fs_visitor::assign_binding_table_offsets()
3510 {
3511    assert(stage == MESA_SHADER_FRAGMENT);
3512    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3513    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3514    uint32_t next_binding_table_offset = 0;
3515
3516    /* If there are no color regions, we still perform an FB write to a null
3517     * renderbuffer, which we place at surface index 0.
3518     */
3519    prog_data->binding_table.render_target_start = next_binding_table_offset;
3520    next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3521
3522    assign_common_binding_table_offsets(next_binding_table_offset);
3523 }
3524
3525 void
3526 fs_visitor::calculate_register_pressure()
3527 {
3528    invalidate_live_intervals();
3529    calculate_live_intervals();
3530
3531    unsigned num_instructions = 0;
3532    foreach_block(block, cfg)
3533       num_instructions += block->instructions.length();
3534
3535    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3536
3537    for (int reg = 0; reg < virtual_grf_count; reg++) {
3538       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3539          regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3540    }
3541 }
3542
3543 void
3544 fs_visitor::optimize()
3545 {
3546    const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3547
3548    calculate_cfg();
3549
3550    split_virtual_grfs();
3551
3552    move_uniform_array_access_to_pull_constants();
3553    assign_constant_locations();
3554    demote_pull_constants();
3555
3556 #define OPT(pass, args...) ({                                           \
3557       pass_num++;                                                       \
3558       bool this_progress = pass(args);                                  \
3559                                                                         \
3560       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
3561          char filename[64];                                             \
3562          snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,              \
3563                   stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3564                                                                         \
3565          backend_visitor::dump_instructions(filename);                  \
3566       }                                                                 \
3567                                                                         \
3568       progress = progress || this_progress;                             \
3569       this_progress;                                                    \
3570    })
3571
3572    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3573       char filename[64];
3574       snprintf(filename, 64, "%s%d-%04d-00-start",
3575                stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3576
3577       backend_visitor::dump_instructions(filename);
3578    }
3579
3580    bool progress;
3581    int iteration = 0;
3582    int pass_num = 0;
3583    do {
3584       progress = false;
3585       pass_num = 0;
3586       iteration++;
3587
3588       OPT(remove_duplicate_mrf_writes);
3589
3590       OPT(opt_algebraic);
3591       OPT(opt_cse);
3592       OPT(opt_copy_propagate);
3593       OPT(opt_peephole_predicated_break);
3594       OPT(opt_cmod_propagation);
3595       OPT(dead_code_eliminate);
3596       OPT(opt_peephole_sel);
3597       OPT(dead_control_flow_eliminate, this);
3598       OPT(opt_register_renaming);
3599       OPT(opt_saturate_propagation);
3600       OPT(register_coalesce);
3601       OPT(compute_to_mrf);
3602
3603       OPT(compact_virtual_grfs);
3604    } while (progress);
3605
3606    pass_num = 0;
3607
3608    if (OPT(lower_load_payload)) {
3609       split_virtual_grfs();
3610       OPT(register_coalesce);
3611       OPT(compute_to_mrf);
3612       OPT(dead_code_eliminate);
3613    }
3614
3615    lower_uniform_pull_constant_loads();
3616 }
3617
3618 /**
3619  * Three source instruction must have a GRF/MRF destination register.
3620  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
3621  */
3622 void
3623 fs_visitor::fixup_3src_null_dest()
3624 {
3625    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3626       if (inst->is_3src() && inst->dst.is_null()) {
3627          inst->dst = fs_reg(GRF, virtual_grf_alloc(dispatch_width / 8),
3628                             inst->dst.type);
3629       }
3630    }
3631 }
3632
3633 void
3634 fs_visitor::allocate_registers()
3635 {
3636    bool allocated_without_spills;
3637
3638    static const enum instruction_scheduler_mode pre_modes[] = {
3639       SCHEDULE_PRE,
3640       SCHEDULE_PRE_NON_LIFO,
3641       SCHEDULE_PRE_LIFO,
3642    };
3643
3644    /* Try each scheduling heuristic to see if it can successfully register
3645     * allocate without spilling.  They should be ordered by decreasing
3646     * performance but increasing likelihood of allocating.
3647     */
3648    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3649       schedule_instructions(pre_modes[i]);
3650
3651       if (0) {
3652          assign_regs_trivial();
3653          allocated_without_spills = true;
3654       } else {
3655          allocated_without_spills = assign_regs(false);
3656       }
3657       if (allocated_without_spills)
3658          break;
3659    }
3660
3661    if (!allocated_without_spills) {
3662       const char *stage_name = stage == MESA_SHADER_VERTEX ?
3663          "Vertex" : "Fragment";
3664
3665       /* We assume that any spilling is worse than just dropping back to
3666        * SIMD8.  There's probably actually some intermediate point where
3667        * SIMD16 with a couple of spills is still better.
3668        */
3669       if (dispatch_width == 16) {
3670          fail("Failure to register allocate.  Reduce number of "
3671               "live scalar values to avoid this.");
3672       } else {
3673          perf_debug("%s shader triggered register spilling.  "
3674                     "Try reducing the number of live scalar values to "
3675                     "improve performance.\n", stage_name);
3676       }
3677
3678       /* Since we're out of heuristics, just go spill registers until we
3679        * get an allocation.
3680        */
3681       while (!assign_regs(true)) {
3682          if (failed)
3683             break;
3684       }
3685    }
3686
3687    /* This must come after all optimization and register allocation, since
3688     * it inserts dead code that happens to have side effects, and it does
3689     * so based on the actual physical registers in use.
3690     */
3691    insert_gen4_send_dependency_workarounds();
3692
3693    if (failed)
3694       return;
3695
3696    if (!allocated_without_spills)
3697       schedule_instructions(SCHEDULE_POST);
3698
3699    if (last_scratch > 0)
3700       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3701 }
3702
3703 bool
3704 fs_visitor::run_vs()
3705 {
3706    assert(stage == MESA_SHADER_VERTEX);
3707
3708    assign_common_binding_table_offsets(0);
3709    setup_vs_payload();
3710
3711    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3712       emit_shader_time_begin();
3713
3714    foreach_in_list(ir_instruction, ir, shader->base.ir) {
3715       base_ir = ir;
3716       this->result = reg_undef;
3717       ir->accept(this);
3718    }
3719    base_ir = NULL;
3720    if (failed)
3721       return false;
3722
3723    emit_urb_writes();
3724
3725    optimize();
3726
3727    assign_curb_setup();
3728    assign_vs_urb_setup();
3729
3730    fixup_3src_null_dest();
3731    allocate_registers();
3732
3733    return !failed;
3734 }
3735
3736 bool
3737 fs_visitor::run_fs()
3738 {
3739    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3740    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3741
3742    assert(stage == MESA_SHADER_FRAGMENT);
3743
3744    sanity_param_count = prog->Parameters->NumParameters;
3745
3746    assign_binding_table_offsets();
3747
3748    if (brw->gen >= 6)
3749       setup_payload_gen6();
3750    else
3751       setup_payload_gen4();
3752
3753    if (0) {
3754       emit_dummy_fs();
3755    } else if (brw->use_rep_send && dispatch_width == 16) {
3756       emit_repclear_shader();
3757    } else {
3758       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3759          emit_shader_time_begin();
3760
3761       calculate_urb_setup();
3762       if (prog->InputsRead > 0) {
3763          if (brw->gen < 6)
3764             emit_interpolation_setup_gen4();
3765          else
3766             emit_interpolation_setup_gen6();
3767       }
3768
3769       /* We handle discards by keeping track of the still-live pixels in f0.1.
3770        * Initialize it with the dispatched pixels.
3771        */
3772       if (wm_prog_data->uses_kill) {
3773          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3774          discard_init->flag_subreg = 1;
3775       }
3776
3777       /* Generate FS IR for main().  (the visitor only descends into
3778        * functions called "main").
3779        */
3780       if (shader) {
3781          if (getenv("INTEL_USE_NIR") != NULL) {
3782             emit_nir_code();
3783          } else {
3784             foreach_in_list(ir_instruction, ir, shader->base.ir) {
3785                base_ir = ir;
3786                this->result = reg_undef;
3787                ir->accept(this);
3788             }
3789          }
3790       } else {
3791          emit_fragment_program_code();
3792       }
3793       base_ir = NULL;
3794       if (failed)
3795          return false;
3796
3797       emit(FS_OPCODE_PLACEHOLDER_HALT);
3798
3799       if (wm_key->alpha_test_func)
3800          emit_alpha_test();
3801
3802       emit_fb_writes();
3803
3804       optimize();
3805
3806       assign_curb_setup();
3807       assign_urb_setup();
3808
3809       fixup_3src_null_dest();
3810       allocate_registers();
3811
3812       if (failed)
3813          return false;
3814    }
3815
3816    if (dispatch_width == 8)
3817       wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3818    else
3819       wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3820
3821    /* If any state parameters were appended, then ParameterValues could have
3822     * been realloced, in which case the driver uniform storage set up by
3823     * _mesa_associate_uniform_storage() would point to freed memory.  Make
3824     * sure that didn't happen.
3825     */
3826    assert(sanity_param_count == prog->Parameters->NumParameters);
3827
3828    return !failed;
3829 }
3830
3831 const unsigned *
3832 brw_wm_fs_emit(struct brw_context *brw,
3833                void *mem_ctx,
3834                const struct brw_wm_prog_key *key,
3835                struct brw_wm_prog_data *prog_data,
3836                struct gl_fragment_program *fp,
3837                struct gl_shader_program *prog,
3838                unsigned *final_assembly_size)
3839 {
3840    bool start_busy = false;
3841    double start_time = 0;
3842
3843    if (unlikely(brw->perf_debug)) {
3844       start_busy = (brw->batch.last_bo &&
3845                     drm_intel_bo_busy(brw->batch.last_bo));
3846       start_time = get_time();
3847    }
3848
3849    struct brw_shader *shader = NULL;
3850    if (prog)
3851       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3852
3853    if (unlikely(INTEL_DEBUG & DEBUG_WM))
3854       brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3855
3856    /* Now the main event: Visit the shader IR and generate our FS IR for it.
3857     */
3858    fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3859    if (!v.run_fs()) {
3860       if (prog) {
3861          prog->LinkStatus = false;
3862          ralloc_strcat(&prog->InfoLog, v.fail_msg);
3863       }
3864
3865       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3866                     v.fail_msg);
3867
3868       return NULL;
3869    }
3870
3871    cfg_t *simd16_cfg = NULL;
3872    fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3873    if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3874                                brw->use_rep_send)) {
3875       if (!v.simd16_unsupported) {
3876          /* Try a SIMD16 compile */
3877          v2.import_uniforms(&v);
3878          if (!v2.run_fs()) {
3879             perf_debug("SIMD16 shader failed to compile, falling back to "
3880                        "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3881          } else {
3882             simd16_cfg = v2.cfg;
3883          }
3884       } else {
3885          perf_debug("SIMD16 shader unsupported, falling back to "
3886                     "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3887       }
3888    }
3889
3890    cfg_t *simd8_cfg;
3891    int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3892    if (no_simd8 && simd16_cfg) {
3893       simd8_cfg = NULL;
3894       prog_data->no_8 = true;
3895    } else {
3896       simd8_cfg = v.cfg;
3897       prog_data->no_8 = false;
3898    }
3899
3900    fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
3901                   &fp->Base, v.runtime_check_aads_emit, "FS");
3902
3903    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3904       char *name;
3905       if (prog)
3906          name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
3907                                 prog->Label ? prog->Label : "unnamed",
3908                                 prog->Name);
3909       else
3910          name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
3911
3912       g.enable_debug(name);
3913    }
3914
3915    if (simd8_cfg)
3916       g.generate_code(simd8_cfg, 8);
3917    if (simd16_cfg)
3918       prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
3919
3920    if (unlikely(brw->perf_debug) && shader) {
3921       if (shader->compiled_once)
3922          brw_wm_debug_recompile(brw, prog, key);
3923       shader->compiled_once = true;
3924
3925       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3926          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3927                     (get_time() - start_time) * 1000);
3928       }
3929    }
3930
3931    return g.get_assembly(final_assembly_size);
3932 }
3933
3934 extern "C" bool
3935 brw_fs_precompile(struct gl_context *ctx,
3936                   struct gl_shader_program *shader_prog,
3937                   struct gl_program *prog)
3938 {
3939    struct brw_context *brw = brw_context(ctx);
3940    struct brw_wm_prog_key key;
3941
3942    struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
3943    struct brw_fragment_program *bfp = brw_fragment_program(fp);
3944    bool program_uses_dfdy = fp->UsesDFdy;
3945
3946    memset(&key, 0, sizeof(key));
3947
3948    if (brw->gen < 6) {
3949       if (fp->UsesKill)
3950          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3951
3952       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3953          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3954
3955       /* Just assume depth testing. */
3956       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3957       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3958    }
3959
3960    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3961                                          BRW_FS_VARYING_INPUT_MASK) > 16)
3962       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3963
3964    const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
3965    unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3966    for (unsigned i = 0; i < sampler_count; i++) {
3967       if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
3968          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3969          key.tex.swizzles[i] =
3970             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3971       } else {
3972          /* Color sampler: assume no swizzling. */
3973          key.tex.swizzles[i] = SWIZZLE_XYZW;
3974       }
3975    }
3976
3977    if (fp->Base.InputsRead & VARYING_BIT_POS) {
3978       key.drawable_height = ctx->DrawBuffer->Height;
3979    }
3980
3981    key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3982          ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3983          BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3984
3985    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3986       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3987                           key.nr_color_regions > 1;
3988    }
3989
3990    key.program_string_id = bfp->id;
3991
3992    uint32_t old_prog_offset = brw->wm.base.prog_offset;
3993    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3994
3995    bool success = do_wm_prog(brw, shader_prog, bfp, &key);
3996
3997    brw->wm.base.prog_offset = old_prog_offset;
3998    brw->wm.prog_data = old_prog_data;
3999
4000    return success;
4001 }