src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 extern "C" {
  32
  33 #include <sys/types.h>
  34
  35 #include "util/hash_table.h"
  36 #include "main/macros.h"
  37 #include "main/shaderobj.h"
  38 #include "main/fbobject.h"
  39 #include "program/prog_parameter.h"
  40 #include "program/prog_print.h"
  41 #include "util/register_allocate.h"
  42 #include "program/hash_table.h"
  43 #include "brw_context.h"
  44 #include "brw_eu.h"
  45 #include "brw_wm.h"
  46 }
  47 #include "brw_fs.h"
  48 #include "brw_cfg.h"
  49 #include "brw_dead_control_flow.h"
  50 #include "main/uniforms.h"
  51 #include "brw_fs_live_variables.h"
  52 #include "glsl/glsl_types.h"
  53 #include "program/sampler.h"
  54
/**
 * Common initializer shared by the fs_inst constructors.
 *
 * Clears the whole instruction, stores the opcode, destination and source
 * array, resolves the execution size (inferring it from the registers when
 * the caller passes 0), fills in effective_width for every source and for the
 * destination, and derives regs_written from the destination register.
 *
 * \param opcode     opcode stored in the instruction
 * \param exec_size  execution size, or 0 to infer it from dst/src widths
 * \param dst        destination register; asserted not to be IMM or UNIFORM
 * \param src        source array; the pointer itself is stored, not a copy
 * \param sources    number of entries of \p src that are in use
 */
void
fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
              fs_reg *src, int sources)
{
   /* Everything is cleared first, so an argument that aliases a member of
    * *this is read back only after it has been zeroed.
    */
   memset(this, 0, sizeof(*this));

   this->opcode = opcode;
   this->dst = dst;
   this->src = src;
   this->sources = sources;
   this->exec_size = exec_size;

   assert(dst.file != IMM && dst.file != UNIFORM);

   /* If exec_size == 0, try to guess it from the registers.  Since all
    * manner of things may use hardware registers, we first try to guess
    * based on GRF registers.  If this fails, we will go ahead and take the
    * width from the destination register.
    */
   if (this->exec_size == 0) {
      if (dst.file == GRF) {
         this->exec_size = dst.width;
      } else {
         for (int i = 0; i < sources; ++i) {
            /* Only GRF and ATTR sources take part in the guess. */
            if (src[i].file != GRF && src[i].file != ATTR)
               continue;

            /* A width of 0 or 1 seen so far is replaced by this source's
             * width; every source must then be width 1 or match it.
             */
            if (this->exec_size <= 1)
               this->exec_size = src[i].width;
            assert(src[i].width == 1 || src[i].width == this->exec_size);
         }
      }

      /* Last resort: fall back to the destination width. */
      if (this->exec_size == 0 && dst.file != BAD_FILE)
         this->exec_size = dst.width;
   }
   assert(this->exec_size != 0);

   /* Record the effective width of each source.  Note that this writes
    * into the caller-provided source array.
    */
   for (int i = 0; i < sources; ++i) {
      switch (this->src[i].file) {
      case BAD_FILE:
         this->src[i].effective_width = 8;
         break;
      case GRF:
      case HW_REG:
      case ATTR:
         assert(this->src[i].width > 0);
         /* Width-1 registers take the instruction's execution size. */
         if (this->src[i].width == 1) {
            this->src[i].effective_width = this->exec_size;
         } else {
            this->src[i].effective_width = this->src[i].width;
         }
         break;
      case IMM:
      case UNIFORM:
         this->src[i].effective_width = this->exec_size;
         break;
      default:
         unreachable("Invalid source register file");
      }
   }
   this->dst.effective_width = this->exec_size;

   this->conditional_mod = BRW_CONDITIONAL_NONE;

   /* This will be the case for almost all instructions. */
   switch (dst.file) {
   case GRF:
   case HW_REG:
   case MRF:
   case ATTR:
      /* Bytes covered by the destination, rounded up to 32-byte units. */
      this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
      break;
   case BAD_FILE:
      this->regs_written = 0;
      break;
   case IMM:
   case UNIFORM:
      unreachable("Invalid destination register file");
   default:
      unreachable("Invalid register file");
   }

   this->writes_accumulator = false;
}
 140
 141 fs_inst::fs_inst()
 142 {
 143    fs_reg *src = ralloc_array(this, fs_reg, 3);
 144    init(BRW_OPCODE_NOP, 8, dst, src, 0);
 145 }
 146
 147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
 148 {
 149    fs_reg *src = ralloc_array(this, fs_reg, 3);
 150    init(opcode, exec_size, reg_undef, src, 0);
 151 }
 152
 153 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
 154 {
 155    fs_reg *src = ralloc_array(this, fs_reg, 3);
 156    init(opcode, 0, dst, src, 0);
 157 }
 158
 159 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 160                  const fs_reg &src0)
 161 {
 162    fs_reg *src = ralloc_array(this, fs_reg, 3);
 163    src[0] = src0;
 164    init(opcode, exec_size, dst, src, 1);
 165 }
 166
 167 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
 168 {
 169    fs_reg *src = ralloc_array(this, fs_reg, 3);
 170    src[0] = src0;
 171    init(opcode, 0, dst, src, 1);
 172 }
 173
 174 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 175                  const fs_reg &src0, const fs_reg &src1)
 176 {
 177    fs_reg *src = ralloc_array(this, fs_reg, 3);
 178    src[0] = src0;
 179    src[1] = src1;
 180    init(opcode, exec_size, dst, src, 2);
 181 }
 182
 183 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 184                  const fs_reg &src1)
 185 {
 186    fs_reg *src = ralloc_array(this, fs_reg, 3);
 187    src[0] = src0;
 188    src[1] = src1;
 189    init(opcode, 0, dst, src, 2);
 190 }
 191
 192 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 193                  const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
 194 {
 195    fs_reg *src = ralloc_array(this, fs_reg, 3);
 196    src[0] = src0;
 197    src[1] = src1;
 198    src[2] = src2;
 199    init(opcode, exec_size, dst, src, 3);
 200 }
 201
 202 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 203                  const fs_reg &src1, const fs_reg &src2)
 204 {
 205    fs_reg *src = ralloc_array(this, fs_reg, 3);
 206    src[0] = src0;
 207    src[1] = src1;
 208    src[2] = src2;
 209    init(opcode, 0, dst, src, 3);
 210 }
 211
 212 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
 213 {
 214    init(opcode, 0, dst, src, sources);
 215 }
 216
 217 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
 218                  fs_reg src[], int sources)
 219 {
 220    init(opcode, exec_width, dst, src, sources);
 221 }
 222
 223 fs_inst::fs_inst(const fs_inst &that)
 224 {
 225    memcpy(this, &that, sizeof(that));
 226
 227    this->src = ralloc_array(this, fs_reg, that.sources);
 228
 229    for (int i = 0; i < that.sources; i++)
 230       this->src[i] = that.src[i];
 231 }
 232
 233 void
 234 fs_inst::resize_sources(uint8_t num_sources)
 235 {
 236    if (this->sources != num_sources) {
 237       this->src = reralloc(this, this->src, fs_reg, num_sources);
 238       this->sources = num_sources;
 239    }
 240 }
 241
/* The ALU* macros below each define an fs_visitor member function named
 * after the opcode.  The function allocates (from mem_ctx) and returns a new
 * fs_inst for BRW_OPCODE_<op>; it does not emit the instruction.
 */

/* One-source ALU instruction builder. */
#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0)                \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

/* Two-source ALU instruction builder. */
#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1)                                   \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

/* Two-source builder that additionally sets writes_accumulator on the
 * returned instruction.
 */
#define ALU2_ACC(op)                                                    \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1)                                   \
   {                                                                    \
      fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

/* Three-source ALU instruction builder. */
#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1, const fs_reg &src2)               \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }

/* Instantiate one builder per opcode. */
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(SEL)
ALU2(MAC)
 303
 304 /** Gen4 predicated IF. */
 305 fs_inst *
 306 fs_visitor::IF(enum brw_predicate predicate)
 307 {
 308    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
 309    inst->predicate = predicate;
 310    return inst;
 311 }
 312
 313 /** Gen6 IF with embedded comparison. */
 314 fs_inst *
 315 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
 316                enum brw_conditional_mod condition)
 317 {
 318    assert(brw->gen == 6);
 319    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
 320                                         reg_null_d, src0, src1);
 321    inst->conditional_mod = condition;
 322    return inst;
 323 }
 324
 325 /**
 326  * CMP: Sets the low bit of the destination channels with the result
 327  * of the comparison, while the upper bits are undefined, and updates
 328  * the flag register with the packed 16 bits of the result.
 329  */
 330 fs_inst *
 331 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
 332                 enum brw_conditional_mod condition)
 333 {
 334    fs_inst *inst;
 335
 336    /* Take the instruction:
 337     *
 338     * CMP null<d> src0<f> src1<f>
 339     *
 340     * Original gen4 does type conversion to the destination type before
 341     * comparison, producing garbage results for floating point comparisons.
 342     *
 343     * The destination type doesn't matter on newer generations, so we set the
 344     * type to match src0 so we can compact the instruction.
 345     */
 346    dst.type = src0.type;
 347    if (dst.file == HW_REG)
 348       dst.fixed_hw_reg.type = dst.type;
 349
 350    resolve_ud_negate(&src0);
 351    resolve_ud_negate(&src1);
 352
 353    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 354    inst->conditional_mod = condition;
 355
 356    return inst;
 357 }
 358
 359 fs_inst *
 360 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
 361 {
 362    uint8_t exec_size = dst.width;
 363    for (int i = 0; i < sources; ++i) {
 364       assert(src[i].width % dst.width == 0);
 365       if (src[i].width > exec_size)
 366          exec_size = src[i].width;
 367    }
 368
 369    fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
 370                                         dst, src, sources);
 371    inst->regs_written = 0;
 372    for (int i = 0; i < sources; ++i) {
 373       /* The LOAD_PAYLOAD instruction only really makes sense if we are
 374        * dealing with whole registers.  If this ever changes, we can deal
 375        * with it later.
 376        */
 377       int size = src[i].effective_width * type_sz(src[i].type);
 378       assert(size % 32 == 0);
 379       inst->regs_written += (size + 31) / 32;
 380    }
 381
 382    return inst;
 383 }
 384
 385 exec_list
 386 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
 387                                        const fs_reg &surf_index,
 388                                        const fs_reg &varying_offset,
 389                                        uint32_t const_offset)
 390 {
 391    exec_list instructions;
 392    fs_inst *inst;
 393
 394    /* We have our constant surface use a pitch of 4 bytes, so our index can
 395     * be any component of a vector, and then we load 4 contiguous
 396     * components starting from that.
 397     *
 398     * We break down the const_offset to a portion added to the variable
 399     * offset and a portion done using reg_offset, which means that if you
 400     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 401     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 402     * CSE can later notice that those loads are all the same and eliminate
 403     * the redundant ones.
 404     */
 405    fs_reg vec4_offset = vgrf(glsl_type::int_type);
 406    instructions.push_tail(ADD(vec4_offset,
 407                               varying_offset, fs_reg(const_offset & ~3)));
 408
 409    int scale = 1;
 410    if (brw->gen == 4 && dst.width == 8) {
 411       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 412        * u, v, r) as parameters, or we can just use the SIMD16 message
 413        * consisting of (header, u).  We choose the second, at the cost of a
 414        * longer return length.
 415        */
 416       scale = 2;
 417    }
 418
 419    enum opcode op;
 420    if (brw->gen >= 7)
 421       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 422    else
 423       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 424
 425    assert(dst.width % 8 == 0);
 426    int regs_written = 4 * (dst.width / 8) * scale;
 427    fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
 428                                dst.type, dst.width);
 429    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
 430    inst->regs_written = regs_written;
 431    instructions.push_tail(inst);
 432
 433    if (brw->gen < 7) {
 434       inst->base_mrf = 13;
 435       inst->header_present = true;
 436       if (brw->gen == 4)
 437          inst->mlen = 3;
 438       else
 439          inst->mlen = 1 + dispatch_width / 8;
 440    }
 441
 442    fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
 443    instructions.push_tail(MOV(dst, result));
 444
 445    return instructions;
 446 }
 447
 448 /**
 449  * A helper for MOV generation for fixing up broken hardware SEND dependency
 450  * handling.
 451  */
 452 fs_inst *
 453 fs_visitor::DEP_RESOLVE_MOV(int grf)
 454 {
 455    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 456
 457    inst->ir = NULL;
 458    inst->annotation = "send dependency resolve";
 459
 460    /* The caller always wants uncompressed to emit the minimal extra
 461     * dependencies, and to avoid having to deal with aligning its regs to 2.
 462     */
 463    inst->exec_size = 8;
 464
 465    return inst;
 466 }
 467
 468 bool
 469 fs_inst::equals(fs_inst *inst) const
 470 {
 471    return (opcode == inst->opcode &&
 472            dst.equals(inst->dst) &&
 473            src[0].equals(inst->src[0]) &&
 474            src[1].equals(inst->src[1]) &&
 475            src[2].equals(inst->src[2]) &&
 476            saturate == inst->saturate &&
 477            predicate == inst->predicate &&
 478            conditional_mod == inst->conditional_mod &&
 479            mlen == inst->mlen &&
 480            base_mrf == inst->base_mrf &&
 481            target == inst->target &&
 482            eot == inst->eot &&
 483            header_present == inst->header_present &&
 484            shadow_compare == inst->shadow_compare &&
 485            exec_size == inst->exec_size &&
 486            offset == inst->offset);
 487 }
 488
 489 bool
 490 fs_inst::overwrites_reg(const fs_reg &reg) const
 491 {
 492    return (reg.file == dst.file &&
 493            reg.reg == dst.reg &&
 494            reg.reg_offset >= dst.reg_offset  &&
 495            reg.reg_offset < dst.reg_offset + regs_written);
 496 }
 497
 498 bool
 499 fs_inst::is_send_from_grf() const
 500 {
 501    switch (opcode) {
 502    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
 503    case SHADER_OPCODE_SHADER_TIME_ADD:
 504    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
 505    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
 506    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
 507    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 508    case SHADER_OPCODE_UNTYPED_ATOMIC:
 509    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 510    case SHADER_OPCODE_URB_WRITE_SIMD8:
 511       return true;
 512    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 513       return src[1].file == GRF;
 514    case FS_OPCODE_FB_WRITE:
 515       return src[0].file == GRF;
 516    default:
 517       if (is_tex())
 518          return src[0].file == GRF;
 519
 520       return false;
 521    }
 522 }
 523
 524 bool
 525 fs_inst::can_do_source_mods(struct brw_context *brw)
 526 {
 527    if (brw->gen == 6 && is_math())
 528       return false;
 529
 530    if (is_send_from_grf())
 531       return false;
 532
 533    if (!backend_instruction::can_do_source_mods())
 534       return false;
 535
 536    return true;
 537 }
 538
 539 void
 540 fs_reg::init()
 541 {
 542    memset(this, 0, sizeof(*this));
 543    stride = 1;
 544 }
 545
 546 /** Generic unset register constructor. */
 547 fs_reg::fs_reg()
 548 {
 549    init();
 550    this->file = BAD_FILE;
 551 }
 552
 553 /** Immediate value constructor. */
 554 fs_reg::fs_reg(float f)
 555 {
 556    init();
 557    this->file = IMM;
 558    this->type = BRW_REGISTER_TYPE_F;
 559    this->fixed_hw_reg.dw1.f = f;
 560    this->width = 1;
 561 }
 562
 563 /** Immediate value constructor. */
 564 fs_reg::fs_reg(int32_t i)
 565 {
 566    init();
 567    this->file = IMM;
 568    this->type = BRW_REGISTER_TYPE_D;
 569    this->fixed_hw_reg.dw1.d = i;
 570    this->width = 1;
 571 }
 572
 573 /** Immediate value constructor. */
 574 fs_reg::fs_reg(uint32_t u)
 575 {
 576    init();
 577    this->file = IMM;
 578    this->type = BRW_REGISTER_TYPE_UD;
 579    this->fixed_hw_reg.dw1.ud = u;
 580    this->width = 1;
 581 }
 582
 583 /** Vector float immediate value constructor. */
 584 fs_reg::fs_reg(uint8_t vf[4])
 585 {
 586    init();
 587    this->file = IMM;
 588    this->type = BRW_REGISTER_TYPE_VF;
 589    memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
 590 }
 591
 592 /** Vector float immediate value constructor. */
 593 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
 594 {
 595    init();
 596    this->file = IMM;
 597    this->type = BRW_REGISTER_TYPE_VF;
 598    this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
 599                                (vf1 <<  8) |
 600                                (vf2 << 16) |
 601                                (vf3 << 24);
 602 }
 603
 604 /** Fixed brw_reg. */
 605 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 606 {
 607    init();
 608    this->file = HW_REG;
 609    this->fixed_hw_reg = fixed_hw_reg;
 610    this->type = fixed_hw_reg.type;
 611    this->width = 1 << fixed_hw_reg.width;
 612 }
 613
 614 bool
 615 fs_reg::equals(const fs_reg &r) const
 616 {
 617    return (file == r.file &&
 618            reg == r.reg &&
 619            reg_offset == r.reg_offset &&
 620            subreg_offset == r.subreg_offset &&
 621            type == r.type &&
 622            negate == r.negate &&
 623            abs == r.abs &&
 624            !reladdr && !r.reladdr &&
 625            memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
 626            width == r.width &&
 627            stride == r.stride);
 628 }
 629
 630 fs_reg &
 631 fs_reg::set_smear(unsigned subreg)
 632 {
 633    assert(file != HW_REG && file != IMM);
 634    subreg_offset = subreg * type_sz(type);
 635    stride = 0;
 636    return *this;
 637 }
 638
 639 bool
 640 fs_reg::is_contiguous() const
 641 {
 642    return stride == 1;
 643 }
 644
 645 int
 646 fs_visitor::type_size(const struct glsl_type *type)
 647 {
 648    unsigned int size, i;
 649
 650    switch (type->base_type) {
 651    case GLSL_TYPE_UINT:
 652    case GLSL_TYPE_INT:
 653    case GLSL_TYPE_FLOAT:
 654    case GLSL_TYPE_BOOL:
 655       return type->components();
 656    case GLSL_TYPE_ARRAY:
 657       return type_size(type->fields.array) * type->length;
 658    case GLSL_TYPE_STRUCT:
 659       size = 0;
 660       for (i = 0; i < type->length; i++) {
 661          size += type_size(type->fields.structure[i].type);
 662       }
 663       return size;
 664    case GLSL_TYPE_SAMPLER:
 665       /* Samplers take up no register space, since they're baked in at
 666        * link time.
 667        */
 668       return 0;
 669    case GLSL_TYPE_ATOMIC_UINT:
 670       return 0;
 671    case GLSL_TYPE_IMAGE:
 672    case GLSL_TYPE_VOID:
 673    case GLSL_TYPE_ERROR:
 674    case GLSL_TYPE_INTERFACE:
 675       unreachable("not reached");
 676    }
 677
 678    return 0;
 679 }
 680
 681 fs_reg
 682 fs_visitor::get_timestamp()
 683 {
 684    assert(brw->gen >= 7);
 685
 686    fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 687                                           BRW_ARF_TIMESTAMP,
 688                                           0),
 689                              BRW_REGISTER_TYPE_UD));
 690
 691    fs_reg dst = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 4);
 692
 693    fs_inst *mov = emit(MOV(dst, ts));
 694    /* We want to read the 3 fields we care about even if it's not enabled in
 695     * the dispatch.
 696     */
 697    mov->force_writemask_all = true;
 698
 699    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 700     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 701     * which is plenty of time for our purposes.  It is identical across the
 702     * EUs, but since it's tracking GPU core speed it will increment at a
 703     * varying rate as render P-states change.
 704     *
 705     * The caller could also check if render P-states have changed (or anything
 706     * else that might disrupt timing) by setting smear to 2 and checking if
 707     * that field is != 0.
 708     */
 709    dst.set_smear(0);
 710
 711    return dst;
 712 }
 713
 714 void
 715 fs_visitor::emit_shader_time_begin()
 716 {
 717    current_annotation = "shader time start";
 718    shader_start_time = get_timestamp();
 719 }
 720
 721 void
 722 fs_visitor::emit_shader_time_end()
 723 {
 724    current_annotation = "shader time end";
 725
 726    enum shader_time_shader_type type, written_type, reset_type;
 727    switch (stage) {
 728    case MESA_SHADER_VERTEX:
 729       type = ST_VS;
 730       written_type = ST_VS_WRITTEN;
 731       reset_type = ST_VS_RESET;
 732       break;
 733    case MESA_SHADER_GEOMETRY:
 734       type = ST_GS;
 735       written_type = ST_GS_WRITTEN;
 736       reset_type = ST_GS_RESET;
 737       break;
 738    case MESA_SHADER_FRAGMENT:
 739       if (dispatch_width == 8) {
 740          type = ST_FS8;
 741          written_type = ST_FS8_WRITTEN;
 742          reset_type = ST_FS8_RESET;
 743       } else {
 744          assert(dispatch_width == 16);
 745          type = ST_FS16;
 746          written_type = ST_FS16_WRITTEN;
 747          reset_type = ST_FS16_RESET;
 748       }
 749       break;
 750    default:
 751       unreachable("fs_visitor::emit_shader_time_end missing code");
 752    }
 753
 754    fs_reg shader_end_time = get_timestamp();
 755
 756    /* Check that there weren't any timestamp reset events (assuming these
 757     * were the only two timestamp reads that happened).
 758     */
 759    fs_reg reset = shader_end_time;
 760    reset.set_smear(2);
 761    fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
 762    test->conditional_mod = BRW_CONDITIONAL_Z;
 763    emit(IF(BRW_PREDICATE_NORMAL));
 764
 765    fs_reg start = shader_start_time;
 766    start.negate = true;
 767    fs_reg diff = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 1);
 768    emit(ADD(diff, start, shader_end_time));
 769
 770    /* If there were no instructions between the two timestamp gets, the diff
 771     * is 2 cycles.  Remove that overhead, so I can forget about that when
 772     * trying to determine the time taken for single instructions.
 773     */
 774    emit(ADD(diff, diff, fs_reg(-2u)));
 775
 776    emit_shader_time_write(type, diff);
 777    emit_shader_time_write(written_type, fs_reg(1u));
 778    emit(BRW_OPCODE_ELSE);
 779    emit_shader_time_write(reset_type, fs_reg(1u));
 780    emit(BRW_OPCODE_ENDIF);
 781 }
 782
 783 void
 784 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 785                                    fs_reg value)
 786 {
 787    int shader_time_index =
 788       brw_get_shader_time_index(brw, shader_prog, prog, type);
 789    fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
 790
 791    fs_reg payload;
 792    if (dispatch_width == 8)
 793       payload = vgrf(glsl_type::uvec2_type);
 794    else
 795       payload = vgrf(glsl_type::uint_type);
 796
 797    emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
 798                              fs_reg(), payload, offset, value));
 799 }
 800
 801 void
 802 fs_visitor::vfail(const char *format, va_list va)
 803 {
 804    char *msg;
 805
 806    if (failed)
 807       return;
 808
 809    failed = true;
 810
 811    msg = ralloc_vasprintf(mem_ctx, format, va);
 812    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
 813
 814    this->fail_msg = msg;
 815
 816    if (INTEL_DEBUG & DEBUG_WM) {
 817       fprintf(stderr, "%s",  msg);
 818    }
 819 }
 820
 821 void
 822 fs_visitor::fail(const char *format, ...)
 823 {
 824    va_list va;
 825
 826    va_start(va, format);
 827    vfail(format, va);
 828    va_end(va);
 829 }
 830
 831 /**
 832  * Mark this program as impossible to compile in SIMD16 mode.
 833  *
 834  * During the SIMD8 compile (which happens first), we can detect and flag
 835  * things that are unsupported in SIMD16 mode, so the compiler can skip
 836  * the SIMD16 compile altogether.
 837  *
 838  * During a SIMD16 compile (if one happens anyway), this just calls fail().
 839  */
 840 void
 841 fs_visitor::no16(const char *format, ...)
 842 {
 843    va_list va;
 844
 845    va_start(va, format);
 846
 847    if (dispatch_width == 16) {
 848       vfail(format, va);
 849    } else {
 850       simd16_unsupported = true;
 851
 852       if (brw->perf_debug) {
 853          if (no16_msg)
 854             ralloc_vasprintf_append(&no16_msg, format, va);
 855          else
 856             no16_msg = ralloc_vasprintf(mem_ctx, format, va);
 857       }
 858    }
 859
 860    va_end(va);
 861 }
 862
 863 fs_inst *
 864 fs_visitor::emit(enum opcode opcode)
 865 {
 866    return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
 867 }
 868
 869 fs_inst *
 870 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
 871 {
 872    return emit(new(mem_ctx) fs_inst(opcode, dst));
 873 }
 874
 875 fs_inst *
 876 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
 877 {
 878    return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
 879 }
 880
 881 fs_inst *
 882 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 883                  const fs_reg &src1)
 884 {
 885    return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
 886 }
 887
 888 fs_inst *
 889 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 890                  const fs_reg &src1, const fs_reg &src2)
 891 {
 892    return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
 893 }
 894
 895 fs_inst *
 896 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
 897                  fs_reg src[], int sources)
 898 {
 899    return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
 900 }
 901
 902 /**
 903  * Returns true if the instruction has a flag that means it won't
 904  * update an entire destination register.
 905  *
 906  * For example, dead code elimination and live variable analysis want to know
 907  * when a write to a variable screens off any preceding values that were in
 908  * it.
 909  */
 910 bool
 911 fs_inst::is_partial_write() const
 912 {
 913    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 914            (this->dst.width * type_sz(this->dst.type)) < 32 ||
 915            !this->dst.is_contiguous());
 916 }
 917
 918 int
 919 fs_inst::regs_read(fs_visitor *v, int arg) const
 920 {
 921    if (is_tex() && arg == 0 && src[0].file == GRF) {
 922       return mlen;
 923    } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
 924       return mlen;
 925    } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
 926       return mlen;
 927    } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
 928       return mlen;
 929    } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
 930       return mlen;
 931    } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
 932       return mlen;
 933    }
 934
 935    switch (src[arg].file) {
 936    case BAD_FILE:
 937    case UNIFORM:
 938    case IMM:
 939       return 1;
 940    case GRF:
 941    case HW_REG:
 942       if (src[arg].stride == 0) {
 943          return 1;
 944       } else {
 945          int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
 946          return (size + 31) / 32;
 947       }
 948    case MRF:
 949       unreachable("MRF registers are not allowed as sources");
 950    default:
 951       unreachable("Invalid register file");
 952    }
 953 }
 954
 955 bool
 956 fs_inst::reads_flag() const
 957 {
 958    return predicate;
 959 }
 960
 961 bool
 962 fs_inst::writes_flag() const
 963 {
 964    return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
 965                                opcode != BRW_OPCODE_IF &&
 966                                opcode != BRW_OPCODE_WHILE)) ||
 967           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 968 }
 969
 970 /**
 971  * Returns how many MRFs an FS opcode will write over.
 972  *
 973  * Note that this is not the 0 or 1 implied writes in an actual gen
 974  * instruction -- the FS opcodes often generate MOVs in addition.
 975  */
 976 int
 977 fs_visitor::implied_mrf_writes(fs_inst *inst)
 978 {
 979    if (inst->mlen == 0)
 980       return 0;
 981
 982    if (inst->base_mrf == -1)
 983       return 0;
 984
 985    switch (inst->opcode) {
 986    case SHADER_OPCODE_RCP:
 987    case SHADER_OPCODE_RSQ:
 988    case SHADER_OPCODE_SQRT:
 989    case SHADER_OPCODE_EXP2:
 990    case SHADER_OPCODE_LOG2:
 991    case SHADER_OPCODE_SIN:
 992    case SHADER_OPCODE_COS:
 993       return 1 * dispatch_width / 8;
 994    case SHADER_OPCODE_POW:
 995    case SHADER_OPCODE_INT_QUOTIENT:
 996    case SHADER_OPCODE_INT_REMAINDER:
 997       return 2 * dispatch_width / 8;
 998    case SHADER_OPCODE_TEX:
 999    case FS_OPCODE_TXB:
1000    case SHADER_OPCODE_TXD:
1001    case SHADER_OPCODE_TXF:
1002    case SHADER_OPCODE_TXF_CMS:
1003    case SHADER_OPCODE_TXF_MCS:
1004    case SHADER_OPCODE_TG4:
1005    case SHADER_OPCODE_TG4_OFFSET:
1006    case SHADER_OPCODE_TXL:
1007    case SHADER_OPCODE_TXS:
1008    case SHADER_OPCODE_LOD:
1009       return 1;
1010    case FS_OPCODE_FB_WRITE:
1011       return 2;
1012    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1013    case SHADER_OPCODE_GEN4_SCRATCH_READ:
1014       return 1;
1015    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1016       return inst->mlen;
1017    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1018       return 2;
1019    case SHADER_OPCODE_UNTYPED_ATOMIC:
1020    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1021    case SHADER_OPCODE_URB_WRITE_SIMD8:
1022    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1023    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1024    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1025    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1026       return 0;
1027    default:
1028       unreachable("not reached");
1029    }
1030 }
1031
1032 int
1033 fs_visitor::virtual_grf_alloc(int size)
1034 {
1035    if (virtual_grf_array_size <= virtual_grf_count) {
1036       if (virtual_grf_array_size == 0)
1037          virtual_grf_array_size = 16;
1038       else
1039          virtual_grf_array_size *= 2;
1040       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
1041                                    virtual_grf_array_size);
1042    }
1043    virtual_grf_sizes[virtual_grf_count] = size;
1044    return virtual_grf_count++;
1045 }
1046
1047 fs_reg
1048 fs_visitor::vgrf(const glsl_type *const type)
1049 {
1050    int reg_width = dispatch_width / 8;
1051    return fs_reg(GRF, virtual_grf_alloc(type_size(type) * reg_width),
1052                  brw_type_for_base_type(type), dispatch_width);
1053 }
1054
1055 fs_reg
1056 fs_visitor::vgrf(int num_components)
1057 {
1058    int reg_width = dispatch_width / 8;
1059    return fs_reg(GRF, virtual_grf_alloc(num_components * reg_width),
1060                  BRW_REGISTER_TYPE_F, dispatch_width);
1061 }
1062
1063 /** Fixed HW reg constructor. */
1064 fs_reg::fs_reg(enum register_file file, int reg)
1065 {
1066    init();
1067    this->file = file;
1068    this->reg = reg;
1069    this->type = BRW_REGISTER_TYPE_F;
1070
1071    switch (file) {
1072    case UNIFORM:
1073       this->width = 1;
1074       break;
1075    default:
1076       this->width = 8;
1077    }
1078 }
1079
1080 /** Fixed HW reg constructor. */
1081 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1082 {
1083    init();
1084    this->file = file;
1085    this->reg = reg;
1086    this->type = type;
1087
1088    switch (file) {
1089    case UNIFORM:
1090       this->width = 1;
1091       break;
1092    default:
1093       this->width = 8;
1094    }
1095 }
1096
1097 /** Fixed HW reg constructor. */
1098 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1099                uint8_t width)
1100 {
1101    init();
1102    this->file = file;
1103    this->reg = reg;
1104    this->type = type;
1105    this->width = width;
1106 }
1107
1108 fs_reg *
1109 fs_visitor::variable_storage(ir_variable *var)
1110 {
1111    return (fs_reg *)hash_table_find(this->variable_ht, var);
1112 }
1113
1114 void
1115 import_uniforms_callback(const void *key,
1116                          void *data,
1117                          void *closure)
1118 {
1119    struct hash_table *dst_ht = (struct hash_table *)closure;
1120    const fs_reg *reg = (const fs_reg *)data;
1121
1122    if (reg->file != UNIFORM)
1123       return;
1124
1125    hash_table_insert(dst_ht, data, key);
1126 }
1127
1128 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1129  * This brings in those uniform definitions
1130  */
1131 void
1132 fs_visitor::import_uniforms(fs_visitor *v)
1133 {
1134    hash_table_call_foreach(v->variable_ht,
1135                            import_uniforms_callback,
1136                            variable_ht);
1137    this->push_constant_loc = v->push_constant_loc;
1138    this->pull_constant_loc = v->pull_constant_loc;
1139    this->uniforms = v->uniforms;
1140    this->param_size = v->param_size;
1141 }
1142
1143 /* Our support for uniforms is piggy-backed on the struct
1144  * gl_fragment_program, because that's where the values actually
1145  * get stored, rather than in some global gl_shader_program uniform
1146  * store.
1147  */
1148 void
1149 fs_visitor::setup_uniform_values(ir_variable *ir)
1150 {
1151    int namelen = strlen(ir->name);
1152
1153    /* The data for our (non-builtin) uniforms is stored in a series of
1154     * gl_uniform_driver_storage structs for each subcomponent that
1155     * glGetUniformLocation() could name.  We know it's been set up in the same
1156     * order we'd walk the type, so walk the list of storage and find anything
1157     * with our name, or the prefix of a component that starts with our name.
1158     */
1159    unsigned params_before = uniforms;
1160    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1161       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1162
1163       if (strncmp(ir->name, storage->name, namelen) != 0 ||
1164           (storage->name[namelen] != 0 &&
1165            storage->name[namelen] != '.' &&
1166            storage->name[namelen] != '[')) {
1167          continue;
1168       }
1169
1170       unsigned slots = storage->type->component_slots();
1171       if (storage->array_elements)
1172          slots *= storage->array_elements;
1173
1174       for (unsigned i = 0; i < slots; i++) {
1175          stage_prog_data->param[uniforms++] = &storage->storage[i];
1176       }
1177    }
1178
1179    /* Make sure we actually initialized the right amount of stuff here. */
1180    assert(params_before + ir->type->component_slots() == uniforms);
1181    (void)params_before;
1182 }
1183
1184
1185 /* Our support for builtin uniforms is even scarier than non-builtin.
1186  * It sits on top of the PROG_STATE_VAR parameters that are
1187  * automatically updated from GL context state.
1188  */
1189 void
1190 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1191 {
1192    const ir_state_slot *const slots = ir->get_state_slots();
1193    assert(slots != NULL);
1194
1195    for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1196       /* This state reference has already been setup by ir_to_mesa, but we'll
1197        * get the same index back here.
1198        */
1199       int index = _mesa_add_state_reference(this->prog->Parameters,
1200                                             (gl_state_index *)slots[i].tokens);
1201
1202       /* Add each of the unique swizzles of the element as a parameter.
1203        * This'll end up matching the expected layout of the
1204        * array/matrix/structure we're trying to fill in.
1205        */
1206       int last_swiz = -1;
1207       for (unsigned int j = 0; j < 4; j++) {
1208          int swiz = GET_SWZ(slots[i].swizzle, j);
1209          if (swiz == last_swiz)
1210             break;
1211          last_swiz = swiz;
1212
1213          stage_prog_data->param[uniforms++] =
1214             &prog->Parameters->ParameterValues[index][swiz];
1215       }
1216    }
1217 }
1218
1219 fs_reg *
1220 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1221                                          bool origin_upper_left)
1222 {
1223    assert(stage == MESA_SHADER_FRAGMENT);
1224    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1225    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1226    fs_reg wpos = *reg;
1227    bool flip = !origin_upper_left ^ key->render_to_fbo;
1228
1229    /* gl_FragCoord.x */
1230    if (pixel_center_integer) {
1231       emit(MOV(wpos, this->pixel_x));
1232    } else {
1233       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1234    }
1235    wpos = offset(wpos, 1);
1236
1237    /* gl_FragCoord.y */
1238    if (!flip && pixel_center_integer) {
1239       emit(MOV(wpos, this->pixel_y));
1240    } else {
1241       fs_reg pixel_y = this->pixel_y;
1242       float offset = (pixel_center_integer ? 0.0 : 0.5);
1243
1244       if (flip) {
1245          pixel_y.negate = true;
1246          offset += key->drawable_height - 1.0;
1247       }
1248
1249       emit(ADD(wpos, pixel_y, fs_reg(offset)));
1250    }
1251    wpos = offset(wpos, 1);
1252
1253    /* gl_FragCoord.z */
1254    if (brw->gen >= 6) {
1255       emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1256    } else {
1257       emit(FS_OPCODE_LINTERP, wpos,
1258            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1259            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1260            interp_reg(VARYING_SLOT_POS, 2));
1261    }
1262    wpos = offset(wpos, 1);
1263
1264    /* gl_FragCoord.w: Already set up in emit_interpolation */
1265    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1266
1267    return reg;
1268 }
1269
1270 fs_inst *
1271 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1272                          glsl_interp_qualifier interpolation_mode,
1273                          bool is_centroid, bool is_sample)
1274 {
1275    brw_wm_barycentric_interp_mode barycoord_mode;
1276    if (brw->gen >= 6) {
1277       if (is_centroid) {
1278          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1279             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1280          else
1281             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1282       } else if (is_sample) {
1283           if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1284             barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1285          else
1286             barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1287       } else {
1288          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1289             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1290          else
1291             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1292       }
1293    } else {
1294       /* On Ironlake and below, there is only one interpolation mode.
1295        * Centroid interpolation doesn't mean anything on this hardware --
1296        * there is no multisampling.
1297        */
1298       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1299    }
1300    return emit(FS_OPCODE_LINTERP, attr,
1301                this->delta_x[barycoord_mode],
1302                this->delta_y[barycoord_mode], interp);
1303 }
1304
1305 void
1306 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1307                                        const glsl_type *type,
1308                                        glsl_interp_qualifier interpolation_mode,
1309                                        int location, bool mod_centroid,
1310                                        bool mod_sample)
1311 {
1312    attr.type = brw_type_for_base_type(type->get_scalar_type());
1313
1314    assert(stage == MESA_SHADER_FRAGMENT);
1315    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1316    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1317
1318    unsigned int array_elements;
1319
1320    if (type->is_array()) {
1321       array_elements = type->length;
1322       if (array_elements == 0) {
1323          fail("dereferenced array '%s' has length 0\n", name);
1324       }
1325       type = type->fields.array;
1326    } else {
1327       array_elements = 1;
1328    }
1329
1330    if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1331       bool is_gl_Color =
1332          location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1333       if (key->flat_shade && is_gl_Color) {
1334          interpolation_mode = INTERP_QUALIFIER_FLAT;
1335       } else {
1336          interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1337       }
1338    }
1339
1340    for (unsigned int i = 0; i < array_elements; i++) {
1341       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1342          if (prog_data->urb_setup[location] == -1) {
1343             /* If there's no incoming setup data for this slot, don't
1344              * emit interpolation for it.
1345              */
1346             attr = offset(attr, type->vector_elements);
1347             location++;
1348             continue;
1349          }
1350
1351          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1352             /* Constant interpolation (flat shading) case. The SF has
1353              * handed us defined values in only the constant offset
1354              * field of the setup reg.
1355              */
1356             for (unsigned int k = 0; k < type->vector_elements; k++) {
1357                struct brw_reg interp = interp_reg(location, k);
1358                interp = suboffset(interp, 3);
1359                interp.type = attr.type;
1360                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1361                attr = offset(attr, 1);
1362             }
1363          } else {
1364             /* Smooth/noperspective interpolation case. */
1365             for (unsigned int k = 0; k < type->vector_elements; k++) {
1366                struct brw_reg interp = interp_reg(location, k);
1367                if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1368                   /* Get the pixel/sample mask into f0 so that we know
1369                    * which pixels are lit.  Then, for each channel that is
1370                    * unlit, replace the centroid data with non-centroid
1371                    * data.
1372                    */
1373                   emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1374
1375                   fs_inst *inst;
1376                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1377                                       false, false);
1378                   inst->predicate = BRW_PREDICATE_NORMAL;
1379                   inst->predicate_inverse = true;
1380                   if (brw->has_pln)
1381                      inst->no_dd_clear = true;
1382
1383                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1384                                       mod_centroid && !key->persample_shading,
1385                                       mod_sample || key->persample_shading);
1386                   inst->predicate = BRW_PREDICATE_NORMAL;
1387                   inst->predicate_inverse = false;
1388                   if (brw->has_pln)
1389                      inst->no_dd_check = true;
1390
1391                } else {
1392                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
1393                                mod_centroid && !key->persample_shading,
1394                                mod_sample || key->persample_shading);
1395                }
1396                if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1397                   emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1398                }
1399                attr = offset(attr, 1);
1400             }
1401
1402          }
1403          location++;
1404       }
1405    }
1406 }
1407
1408 fs_reg *
1409 fs_visitor::emit_frontfacing_interpolation()
1410 {
1411    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1412
1413    if (brw->gen >= 6) {
1414       /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1415        * a boolean result from this (~0/true or 0/false).
1416        *
1417        * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1418        * this task in only one instruction:
1419        *    - a negation source modifier will flip the bit; and
1420        *    - a W -> D type conversion will sign extend the bit into the high
1421        *      word of the destination.
1422        *
1423        * An ASR 15 fills the low word of the destination.
1424        */
1425       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1426       g0.negate = true;
1427
1428       emit(ASR(*reg, g0, fs_reg(15)));
1429    } else {
1430       /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1431        * a boolean result from this (1/true or 0/false).
1432        *
1433        * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1434        * the negation source modifier to flip it. Unfortunately the SHR
1435        * instruction only operates on UD (or D with an abs source modifier)
1436        * sources without negation.
1437        *
1438        * Instead, use ASR (which will give ~0/true or 0/false).
1439        */
1440       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1441       g1_6.negate = true;
1442
1443       emit(ASR(*reg, g1_6, fs_reg(31)));
1444    }
1445
1446    return reg;
1447 }
1448
1449 void
1450 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1451 {
1452    assert(stage == MESA_SHADER_FRAGMENT);
1453    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1454    assert(dst.type == BRW_REGISTER_TYPE_F);
1455
1456    if (key->compute_pos_offset) {
1457       /* Convert int_sample_pos to floating point */
1458       emit(MOV(dst, int_sample_pos));
1459       /* Scale to the range [0, 1] */
1460       emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1461    }
1462    else {
1463       /* From ARB_sample_shading specification:
1464        * "When rendering to a non-multisample buffer, or if multisample
1465        *  rasterization is disabled, gl_SamplePosition will always be
1466        *  (0.5, 0.5).
1467        */
1468       emit(MOV(dst, fs_reg(0.5f)));
1469    }
1470 }
1471
1472 fs_reg *
1473 fs_visitor::emit_samplepos_setup()
1474 {
1475    assert(brw->gen >= 6);
1476
1477    this->current_annotation = "compute sample position";
1478    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1479    fs_reg pos = *reg;
1480    fs_reg int_sample_x = vgrf(glsl_type::int_type);
1481    fs_reg int_sample_y = vgrf(glsl_type::int_type);
1482
1483    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1484     * mode will be enabled.
1485     *
1486     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1487     * R31.1:0         Position Offset X/Y for Slot[3:0]
1488     * R31.3:2         Position Offset X/Y for Slot[7:4]
1489     * .....
1490     *
1491     * The X, Y sample positions come in as bytes in  thread payload. So, read
1492     * the positions using vstride=16, width=8, hstride=2.
1493     */
1494    struct brw_reg sample_pos_reg =
1495       stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1496                     BRW_REGISTER_TYPE_B), 16, 8, 2);
1497
1498    if (dispatch_width == 8) {
1499       emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1500    } else {
1501       emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1502       emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1503          ->force_sechalf = true;
1504    }
1505    /* Compute gl_SamplePosition.x */
1506    compute_sample_position(pos, int_sample_x);
1507    pos = offset(pos, 1);
1508    if (dispatch_width == 8) {
1509       emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1510    } else {
1511       emit(MOV(half(int_sample_y, 0),
1512                fs_reg(suboffset(sample_pos_reg, 1))));
1513       emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1514          ->force_sechalf = true;
1515    }
1516    /* Compute gl_SamplePosition.y */
1517    compute_sample_position(pos, int_sample_y);
1518    return reg;
1519 }
1520
1521 fs_reg *
1522 fs_visitor::emit_sampleid_setup()
1523 {
1524    assert(stage == MESA_SHADER_FRAGMENT);
1525    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1526    assert(brw->gen >= 6);
1527
1528    this->current_annotation = "compute sample id";
1529    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1530
1531    if (key->compute_sample_id) {
1532       fs_reg t1 = vgrf(glsl_type::int_type);
1533       fs_reg t2 = vgrf(glsl_type::int_type);
1534       t2.type = BRW_REGISTER_TYPE_UW;
1535
1536       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1537        * 8x multisampling, subspan 0 will represent sample N (where N
1538        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1539        * 7. We can find the value of N by looking at R0.0 bits 7:6
1540        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1541        * (since samples are always delivered in pairs). That is, we
1542        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1543        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1544        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1545        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1546        * populating a temporary variable with the sequence (0, 1, 2, 3),
1547        * and then reading from it using vstride=1, width=4, hstride=0.
1548        * These computations hold good for 4x multisampling as well.
1549        *
1550        * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1551        * the first four slots are sample 0 of subspan 0; the next four
1552        * are sample 1 of subspan 0; the third group is sample 0 of
1553        * subspan 1, and finally sample 1 of subspan 1.
1554        */
1555       fs_inst *inst;
1556       inst = emit(BRW_OPCODE_AND, t1,
1557                   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1558                   fs_reg(0xc0));
1559       inst->force_writemask_all = true;
1560       inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1561       inst->force_writemask_all = true;
1562       /* This works for both SIMD8 and SIMD16 */
1563       inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1564       inst->force_writemask_all = true;
1565       /* This special instruction takes care of setting vstride=1,
1566        * width=4, hstride=0 of t2 during an ADD instruction.
1567        */
1568       emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1569    } else {
1570       /* As per GL_ARB_sample_shading specification:
1571        * "When rendering to a non-multisample buffer, or if multisample
1572        *  rasterization is disabled, gl_SampleID will always be zero."
1573        */
1574       emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1575    }
1576
1577    return reg;
1578 }
1579
1580 fs_reg
1581 fs_visitor::fix_math_operand(fs_reg src)
1582 {
1583    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1584     * might be able to do better by doing execsize = 1 math and then
1585     * expanding that result out, but we would need to be careful with
1586     * masking.
1587     *
1588     * The hardware ignores source modifiers (negate and abs) on math
1589     * instructions, so we also move to a temp to set those up.
1590     */
1591    if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1592        !src.abs && !src.negate)
1593       return src;
1594
1595    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1596     * operands to math
1597     */
1598    if (brw->gen >= 7 && src.file != IMM)
1599       return src;
1600
1601    fs_reg expanded = vgrf(glsl_type::float_type);
1602    expanded.type = src.type;
1603    emit(BRW_OPCODE_MOV, expanded, src);
1604    return expanded;
1605 }
1606
1607 fs_inst *
1608 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1609 {
1610    switch (opcode) {
1611    case SHADER_OPCODE_RCP:
1612    case SHADER_OPCODE_RSQ:
1613    case SHADER_OPCODE_SQRT:
1614    case SHADER_OPCODE_EXP2:
1615    case SHADER_OPCODE_LOG2:
1616    case SHADER_OPCODE_SIN:
1617    case SHADER_OPCODE_COS:
1618       break;
1619    default:
1620       unreachable("not reached: bad math opcode");
1621    }
1622
1623    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
1624     * might be able to do better by doing execsize = 1 math and then
1625     * expanding that result out, but we would need to be careful with
1626     * masking.
1627     *
1628     * Gen 6 hardware ignores source modifiers (negate and abs) on math
1629     * instructions, so we also move to a temp to set those up.
1630     */
1631    if (brw->gen == 6 || brw->gen == 7)
1632       src = fix_math_operand(src);
1633
1634    fs_inst *inst = emit(opcode, dst, src);
1635
1636    if (brw->gen < 6) {
1637       inst->base_mrf = 2;
1638       inst->mlen = dispatch_width / 8;
1639    }
1640
1641    return inst;
1642 }
1643
1644 fs_inst *
1645 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1646 {
1647    int base_mrf = 2;
1648    fs_inst *inst;
1649
1650    if (brw->gen >= 8) {
1651       inst = emit(opcode, dst, src0, src1);
1652    } else if (brw->gen >= 6) {
1653       src0 = fix_math_operand(src0);
1654       src1 = fix_math_operand(src1);
1655
1656       inst = emit(opcode, dst, src0, src1);
1657    } else {
1658       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1659        * "Message Payload":
1660        *
1661        * "Operand0[7].  For the INT DIV functions, this operand is the
1662        *  denominator."
1663        *  ...
1664        * "Operand1[7].  For the INT DIV functions, this operand is the
1665        *  numerator."
1666        */
1667       bool is_int_div = opcode != SHADER_OPCODE_POW;
1668       fs_reg &op0 = is_int_div ? src1 : src0;
1669       fs_reg &op1 = is_int_div ? src0 : src1;
1670
1671       emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1672       inst = emit(opcode, dst, op0, reg_null_f);
1673
1674       inst->base_mrf = base_mrf;
1675       inst->mlen = 2 * dispatch_width / 8;
1676    }
1677    return inst;
1678 }
1679
1680 void
1681 fs_visitor::assign_curb_setup()
1682 {
1683    if (dispatch_width == 8) {
1684       prog_data->dispatch_grf_start_reg = payload.num_regs;
1685    } else {
1686       assert(stage == MESA_SHADER_FRAGMENT);
1687       brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1688       prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1689    }
1690
1691    prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1692
1693    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1694    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1695       for (unsigned int i = 0; i < inst->sources; i++) {
1696          if (inst->src[i].file == UNIFORM) {
1697             int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1698             int constant_nr;
1699             if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1700                constant_nr = push_constant_loc[uniform_nr];
1701             } else {
1702                /* Section 5.11 of the OpenGL 4.1 spec says:
1703                 * "Out-of-bounds reads return undefined values, which include
1704                 *  values from other variables of the active program or zero."
1705                 * Just return the first push constant.
1706                 */
1707                constant_nr = 0;
1708             }
1709
1710             struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1711                                                   constant_nr / 8,
1712                                                   constant_nr % 8);
1713
1714             inst->src[i].file = HW_REG;
1715             inst->src[i].fixed_hw_reg = byte_offset(
1716                retype(brw_reg, inst->src[i].type),
1717                inst->src[i].subreg_offset);
1718          }
1719       }
1720    }
1721 }
1722
1723 void
1724 fs_visitor::calculate_urb_setup()
1725 {
1726    assert(stage == MESA_SHADER_FRAGMENT);
1727    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1728    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1729
1730    memset(prog_data->urb_setup, -1,
1731           sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1732
1733    int urb_next = 0;
1734    /* Figure out where each of the incoming setup attributes lands. */
1735    if (brw->gen >= 6) {
1736       if (_mesa_bitcount_64(prog->InputsRead &
1737                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1738          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1739           * first 16 varying inputs, so we can put them wherever we want.
1740           * Just put them in order.
1741           *
1742           * This is useful because it means that (a) inputs not used by the
1743           * fragment shader won't take up valuable register space, and (b) we
1744           * won't have to recompile the fragment shader if it gets paired with
1745           * a different vertex (or geometry) shader.
1746           */
1747          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1748             if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1749                 BITFIELD64_BIT(i)) {
1750                prog_data->urb_setup[i] = urb_next++;
1751             }
1752          }
1753       } else {
1754          /* We have enough input varyings that the SF/SBE pipeline stage can't
1755           * arbitrarily rearrange them to suit our whim; we have to put them
1756           * in an order that matches the output of the previous pipeline stage
1757           * (geometry or vertex shader).
1758           */
1759          struct brw_vue_map prev_stage_vue_map;
1760          brw_compute_vue_map(brw, &prev_stage_vue_map,
1761                              key->input_slots_valid);
1762          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1763          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1764          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1765               slot++) {
1766             int varying = prev_stage_vue_map.slot_to_varying[slot];
1767             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1768              * unused.
1769              */
1770             if (varying != BRW_VARYING_SLOT_COUNT &&
1771                 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1772                  BITFIELD64_BIT(varying))) {
1773                prog_data->urb_setup[varying] = slot - first_slot;
1774             }
1775          }
1776          urb_next = prev_stage_vue_map.num_slots - first_slot;
1777       }
1778    } else {
1779       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1780       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1781          /* Point size is packed into the header, not as a general attribute */
1782          if (i == VARYING_SLOT_PSIZ)
1783             continue;
1784
1785          if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1786             /* The back color slot is skipped when the front color is
1787              * also written to.  In addition, some slots can be
1788              * written in the vertex shader and not read in the
1789              * fragment shader.  So the register number must always be
1790              * incremented, mapped or not.
1791              */
1792             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1793                prog_data->urb_setup[i] = urb_next;
1794             urb_next++;
1795          }
1796       }
1797
1798       /*
1799        * It's a FS only attribute, and we did interpolation for this attribute
1800        * in SF thread. So, count it here, too.
1801        *
1802        * See compile_sf_prog() for more info.
1803        */
1804       if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1805          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1806    }
1807
1808    prog_data->num_varying_inputs = urb_next;
1809 }
1810
1811 /**
1812  * Rebase the fragment shader's setup-register references to their final
1813  * GRFs and record the first GRF past the payload.
1814  */
1815 void
1816 fs_visitor::assign_urb_setup()
1817 {
1818    assert(stage == MESA_SHADER_FRAGMENT);
1819    brw_wm_prog_data *wm_data = (brw_wm_prog_data *) this->prog_data;
1820
1821    /* Setup data follows the thread payload and the push constants. */
1822    const int setup_base = payload.num_regs + wm_data->base.curb_read_length;
1823
1824    /* LINTERP/CINTERP carry register numbers relative to the setup block. */
1825    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1826       if (inst->opcode == FS_OPCODE_LINTERP) {
1827          assert(inst->src[2].file == HW_REG);
1828          inst->src[2].fixed_hw_reg.nr += setup_base;
1829       } else if (inst->opcode == FS_OPCODE_CINTERP) {
1830          assert(inst->src[0].file == HW_REG);
1831          inst->src[0].fixed_hw_reg.nr += setup_base;
1832       }
1833    }
1834
1835    /* Four setup channels per attribute, half a register each: 2 GRFs. */
1836    this->first_non_payload_grf = setup_base + wm_data->num_varying_inputs * 2;
1837 }
1838
1839 /**
1840  * Lay out the vertex shader's incoming attributes in the register file and
1841  * rewrite every ATTR-file source to the hardware GRF it lands in.
1842  */
1843 void
1844 fs_visitor::assign_vs_urb_setup()
1845 {
1846    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1847    assert(stage == MESA_SHADER_VERTEX);
1848
1849    /* One slot per input that is read, plus one extra slot when the shader
1850     * uses the vertex ID or the instance ID.
1851     */
1852    int num_attrs = _mesa_bitcount_64(vs_prog_data->inputs_read);
1853    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1854       num_attrs++;
1855
1856    /* Attributes start right after the payload and the push constants,
1857     * and each one occupies four registers.
1858     */
1859    const int attr_base = payload.num_regs + prog_data->curb_read_length;
1860    this->first_non_payload_grf = attr_base + num_attrs * 4;
1861
1862    unsigned vue_entries =
1863       MAX2(num_attrs, vs_prog_data->base.vue_map.num_slots);
1864    vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1865    vs_prog_data->base.urb_read_length = (num_attrs + 1) / 2;
1866    assert(vs_prog_data->base.urb_read_length <= 15);
1867
1868    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1869       for (int s = 0; s < inst->sources; s++) {
1870          fs_reg &src = inst->src[s];
1871          if (src.file != ATTR)
1872             continue;
1873
1874          int slot;
1875          if (src.reg == VERT_ATTRIB_MAX) {
1876             /* This pseudo-attribute takes the last slot. */
1877             slot = num_attrs - 1;
1878          } else {
1879             /* Attributes arrive as one contiguous block ordered by their
1880              * gl_vert_attrib value, so the slot is the number of enabled
1881              * attributes that come before this one.
1882              */
1883             const int attr = src.reg + src.reg_offset / 4;
1884             slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1885                                      BITFIELD64_MASK(attr));
1886          }
1887          const int grf = attr_base + slot * 4 + (src.reg_offset & 3);
1888          src.file = HW_REG;
1889          src.fixed_hw_reg = retype(brw_vec8_grf(grf, 0), src.type);
1890       }
1891    }
1892 }
1893
1894 /**
1895  * Split large virtual GRFs into separate components if we can.
1896  *
1897  * This is mostly duplicated with what brw_fs_vector_splitting does,
1898  * but that's really conservative because it's afraid of doing
1899  * splitting that doesn't result in real progress after the rest of
1900  * the optimization phases, which would cause infinite looping in
1901  * optimization.  We can do it once here, safely.  This also has the
1902  * opportunity to split interpolated values, or maybe even uniforms,
1903  * which we don't have at the IR level.
1904  *
1905  * We want to split, because virtual GRFs are what we register
1906  * allocate and spill (due to contiguousness requirements for some
1907  * instructions), and they're what we naturally generate in the
1908  * codegen process, but most virtual GRFs don't actually need to be
1909  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1910  * live intervals and better dead code elimination and coalescing.
      *
      * The pass numbers the registers of all virtual GRFs in one flat
      * sequence, works out at which positions a virtual GRF may be cut,
      * allocates a new virtual GRF for every resulting piece except the
      * last one (which keeps the original number), and finally rewrites
      * the register number and offset of every GRF reference.  Live
      * intervals are invalidated at the end.
1911  */
1912 void
1913 fs_visitor::split_virtual_grfs()
1914 {
1915    int num_vars = this->virtual_grf_count;
1916
1917    /* Count the total number of registers */
        /* vgrf_to_reg[i] is the flat index of the first register of virtual
         * GRF i; the registers of a virtual GRF are consecutive in the flat
         * numbering.
         */
1918    int reg_count = 0;
1919    int vgrf_to_reg[num_vars];
1920    for (int i = 0; i < num_vars; i++) {
1921       vgrf_to_reg[i] = reg_count;
1922       reg_count += virtual_grf_sizes[i];
1923    }
1924
1925    /* An array of "split points".  For each register slot, this indicates
1926     * if this slot can be separated from the previous slot.  Every time an
1927     * instruction uses multiple elements of a register (as a source or
1928     * destination), we mark the used slots as inseparable.  Then we go
1929     * through and split the registers into the smallest pieces we can.
1930     */
1931    bool split_points[reg_count];
1932    memset(split_points, 0, sizeof(split_points));
1933
1934    /* Mark all used registers as fully splittable */
        /* Only slots 1..size-1 of a referenced virtual GRF are marked; slot 0
         * of every virtual GRF stays false, which the sanity assert in the
         * allocation loop below relies on.
         */
1935    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1936       if (inst->dst.file == GRF) {
1937          int reg = vgrf_to_reg[inst->dst.reg];
1938          for (int j = 1; j < this->virtual_grf_sizes[inst->dst.reg]; j++)
1939             split_points[reg + j] = true;
1940       }
1941
1942       for (int i = 0; i < inst->sources; i++) {
1943          if (inst->src[i].file == GRF) {
1944             int reg = vgrf_to_reg[inst->src[i].reg];
1945             for (int j = 1; j < this->virtual_grf_sizes[inst->src[i].reg]; j++)
1946                split_points[reg + j] = true;
1947          }
1948       }
1949    }
1950
1951    if (brw->has_pln &&
1952        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1953       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1954        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1955        * Gen6, that was the only supported interpolation mode, and since Gen6,
1956        * delta_x and delta_y are in fixed hardware registers.
1957        */
1958       int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1959       split_points[vgrf_to_reg[vgrf] + 1] = false;
1960    }
1961
        /* Remove the split points inside every span of registers that a
         * single instruction writes (regs_written) or reads (regs_read) as a
         * unit, starting at the referenced register offset.
         */
1962    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1963       if (inst->dst.file == GRF) {
1964          int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1965          for (int j = 1; j < inst->regs_written; j++)
1966             split_points[reg + j] = false;
1967       }
1968       for (int i = 0; i < inst->sources; i++) {
1969          if (inst->src[i].file == GRF) {
1970             int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1971             for (int j = 1; j < inst->regs_read(this, i); j++)
1972                split_points[reg + j] = false;
1973          }
1974       }
1975    }
1976
        /* For each flat register index: the virtual GRF it belongs to after
         * splitting, and its offset inside that virtual GRF.
         */
1977    int new_virtual_grf[reg_count];
1978    int new_reg_offset[reg_count];
1979
        /* Walk the flat numbering one original virtual GRF at a time.
         * "offset" is the number of registers collected so far for the piece
         * that is currently being built.
         */
1980    int reg = 0;
1981    for (int i = 0; i < num_vars; i++) {
1982       /* The first one should always be 0 as a quick sanity check. */
1983       assert(split_points[reg] == false);
1984
1985       /* j = 0 case */
1986       new_reg_offset[reg] = 0;
1987       reg++;
1988       int offset = 1;
1989
1990       /* j > 0 case */
1991       for (int j = 1; j < virtual_grf_sizes[i]; j++) {
1992          /* If this is a split point, reset the offset to 0 and allocate a
1993           * new virtual GRF for the previous offset many registers
1994           */
1995          if (split_points[reg]) {
1996             assert(offset <= MAX_VGRF_SIZE);
1997             int grf = virtual_grf_alloc(offset);
1998             for (int k = reg - offset; k < reg; k++)
1999                new_virtual_grf[k] = grf;
2000             offset = 0;
2001          }
2002          new_reg_offset[reg] = offset;
2003          offset++;
2004          reg++;
2005       }
2006
2007       /* The last one gets the original register number */
2008       assert(offset <= MAX_VGRF_SIZE);
2009       virtual_grf_sizes[i] = offset;
2010       for (int k = reg - offset; k < reg; k++)
2011          new_virtual_grf[k] = i;
2012    }
2013    assert(reg == reg_count);
2014
        /* Rewrite every GRF reference.  vgrf_to_reg[] still describes the
         * layout from before the split, so it translates the old register
         * number and offset into a flat index for the two lookup tables.
         */
2015    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2016       if (inst->dst.file == GRF) {
2017          reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2018          inst->dst.reg = new_virtual_grf[reg];
2019          inst->dst.reg_offset = new_reg_offset[reg];
2020          assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
2021       }
2022       for (int i = 0; i < inst->sources; i++) {
2023          if (inst->src[i].file == GRF) {
2024             reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2025             inst->src[i].reg = new_virtual_grf[reg];
2026             inst->src[i].reg_offset = new_reg_offset[reg];
2027             assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
2028          }
2029       }
2030    }
2031    invalidate_live_intervals();
2032 }
2033
2034 /**
2035  * Drop virtual GRFs that no instruction references and renumber the
2036  * survivors densely.
2037  *
2038  * Code generation leaves behind many temporaries that are dead almost
2039  * immediately, while later passes such as compute_live_intervals have to
2040  * walk every virtual GRF, so a compact numbering saves a lot of overhead.
2041  *
2042  * Returns true if at least one unused virtual GRF was found.
2043  */
2044 bool
2045 fs_visitor::compact_virtual_grfs()
2046 {
2047    /* new_number[old] is -1 while "old" is unreferenced, 0 once it has been
2048     * seen in an instruction, and finally its compacted index.
2049     */
2050    int new_number[this->virtual_grf_count];
2051    memset(new_number, -1, sizeof(new_number));
2052
2053    /* Pass 1: flag every VGRF used as a destination or as a source. */
2054    foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2055       if (inst->dst.file == GRF)
2056          new_number[inst->dst.reg] = 0;
2057
2058       for (int s = 0; s < inst->sources; s++) {
2059          if (inst->src[s].file == GRF)
2060             new_number[inst->src[s].reg] = 0;
2061       }
2062    }
2063
2064    /* Pass 2: slide the sizes of the live VGRFs down over the holes. */
2065    bool found_unused = false;
2066    int live_count = 0;
2067    for (int old = 0; old < this->virtual_grf_count; old++) {
2068       if (new_number[old] == -1) {
2069          found_unused = true;
2070          continue;
2071       }
2072       new_number[old] = live_count;
2073       virtual_grf_sizes[live_count] = virtual_grf_sizes[old];
2074       invalidate_live_intervals();
2075       live_count++;
2076    }
2077    this->virtual_grf_count = live_count;
2078
2079    /* Pass 3: renumber every GRF reference in the program. */
2080    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2081       if (inst->dst.file == GRF)
2082          inst->dst.reg = new_number[inst->dst.reg];
2083
2084       for (int s = 0; s < inst->sources; s++) {
2085          if (inst->src[s].file == GRF)
2086             inst->src[s].reg = new_number[inst->src[s].reg];
2087       }
2088    }
2089
2090    /* delta_x/delta_y are consulted by register allocation, so they have to
2091     * follow the renumbering too.  One that no longer names a live VGRF is
2092     * switched to BAD_FILE so that it cannot alias some unrelated register.
2093     */
2094    for (unsigned m = 0; m < ARRAY_SIZE(delta_x); m++) {
2095       if (delta_x[m].file != GRF)
2096          continue;
2097       const int renumbered = new_number[delta_x[m].reg];
2098       if (renumbered == -1)
2099          delta_x[m].file = BAD_FILE;
2100       else
2101          delta_x[m].reg = renumbered;
2102    }
2103    for (unsigned m = 0; m < ARRAY_SIZE(delta_y); m++) {
2104       if (delta_y[m].file != GRF)
2105          continue;
2106       const int renumbered = new_number[delta_y[m].reg];
2107       if (renumbered == -1)
2108          delta_y[m].file = BAD_FILE;
2109       else
2110          delta_y[m].reg = renumbered;
2111    }
2112
2113    return found_unused;
2114 }
2115
2116 /*
2117  * Give every variably-indexed uniform array a copy in the pull constant
2118  * buffer.
2119  *
2120  * Unlike temporary GRF array access (which is not supported, because
2121  * relative addressing of instruction destinations is difficult), array
2122  * access of uniforms that were loaded in GRF space as push constants
2123  * would be possible.  In the real-world usage seen so far, though, the
2124  * arrays involved are always larger than what can be loaded as push
2125  * constants, so all uniform array access is moved out to a pull constant
2126  * buffer.
2127  *
2128  * This function only records the pull buffer locations in
2129  * pull_constant_loc[] and pull_param[]; it does not emit any loads.  It is
2130  * a no-op unless the dispatch width is 8.
2131  */
2132 void
2133 fs_visitor::move_uniform_array_access_to_pull_constants()
2134 {
2135    if (dispatch_width != 8)
2136       return;
2137
2138    /* -1 means "not in the pull constant buffer". */
2139    pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2140    memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2141
2142    /* Constant-indexed accesses to arrays are deliberately left alone; the
2143     * performance impact of that choice has not been tested.
2144     */
2145    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2146       for (int s = 0; s < inst->sources; s++) {
2147          const fs_reg &src = inst->src[s];
2148          if (src.file != UNIFORM || !src.reladdr)
2149             continue;
2150
2151          /* Nothing to do if this array was copied already. */
2152          const int uniform = src.reg;
2153          if (pull_constant_loc[uniform] != -1)
2154             continue;
2155
2156          assert(param_size[uniform]);
2157
2158          /* Append all param_size[uniform] elements of the array. */
2159          const gl_constant_value **values = &stage_prog_data->param[uniform];
2160          for (int e = 0; e < param_size[uniform]; e++) {
2161             const int pull_index = stage_prog_data->nr_pull_params++;
2162             pull_constant_loc[uniform + e] = pull_index;
2163             stage_prog_data->pull_param[pull_index] = values[e];
2164          }
2165       }
2166    }
2167 }
2168
2169 /**
2170  * Decide for each UNIFORM register whether it is uploaded as a push
2171  * constant or fetched from the pull constant buffer.
2172  *
2173  * A fragment shader is allowed to have more than the specified minimum
2174  * maximum number of fragment shader uniform components (64).  Too many of
2175  * them would fill up all of register space, so the excess is pushed out to
2176  * the pull constant buffer.
2177  *
2178  * Only the SIMD8 compile makes the decision; for any other dispatch width
2179  * the function returns without doing anything.
2180  */
2181 void
2182 fs_visitor::assign_constant_locations()
2183 {
2184    /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2185    if (dispatch_width != 8)
2186       return;
2187
2188    /* Work out which UNIFORM registers some instruction still reads. */
2189    bool in_use[uniforms];
2190    memset(in_use, 0, sizeof(in_use));
2191
2192    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2193       for (int s = 0; s < inst->sources; s++) {
2194          if (inst->src[s].file == UNIFORM) {
2195             /* Register numbers outside [0, uniforms) are ignored. */
2196             const int nr = inst->src[s].reg + inst->src[s].reg_offset;
2197             if (nr >= 0 && nr < (int) uniforms)
2198                in_use[nr] = true;
2199          }
2200       }
2201    }
2202
2203    /* Only allow 16 registers (128 uniform components) as push constants.
2204     *
2205     * The uniforms at the end of the list are the ones that get demoted.  It
2206     * would probably be better to demote the ones that are rarely used in
2207     * the program first.
2208     *
2209     * If changing this value, note the limitation about total_regs in
2210     * brw_curbe.c.
2211     */
2212    const unsigned int max_push_components = 16 * 8;
2213    unsigned int num_pushed = 0;
2214
2215    /* A location of -1 means "not a push constant". */
2216    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2217
2218    for (unsigned int u = 0; u < uniforms; u++) {
2219       if (!in_use[u] || pull_constant_loc[u] != -1) {
2220          /* Either dead or already demoted to a pull constant, so it no
2221           * longer lives in the param[] array.
2222           */
2223          push_constant_loc[u] = -1;
2224       } else if (num_pushed < max_push_components) {
2225          /* Retain as a push constant and record its index in the
2226           * condensed param[] array.
2227           */
2228          push_constant_loc[u] = num_pushed++;
2229       } else {
2230          /* No push space left: append it to the pull parameters. */
2231          const int pull_index = stage_prog_data->nr_pull_params++;
2232
2233          push_constant_loc[u] = -1;
2234          stage_prog_data->pull_param[pull_index] = stage_prog_data->param[u];
2235          pull_constant_loc[u] = pull_index;
2236       }
2237    }
2238
2239    stage_prog_data->nr_params = num_pushed;
2240
2241    /* Until now param[] was indexed by reg + reg_offset of the UNIFORM
2242     * registers.  Squeeze it down to just the uniforms that are uploaded as
2243     * push constants.  An entry never moves to a higher index, so copying
2244     * in ascending order does not overwrite an entry before it is read.
2245     */
2246    for (unsigned int i = 0; i < uniforms; i++) {
2247       const int remapped = push_constant_loc[i];
2248
2249       if (remapped != -1) {
2250          assert(remapped <= (int)i);
2251          stage_prog_data->param[remapped] = stage_prog_data->param[i];
2252       }
2253    }
2254 }
2255
2256 /**
2257  * Replace each read of a UNIFORM placed in the pull constant buffer with a
2258  * read of a VGRF filled by a UNIFORM_ or VARYING_PULL_CONSTANT_LOAD.
2259  */
2260 void
2261 fs_visitor::demote_pull_constants()
2262 {
2263    foreach_block_and_inst (block, fs_inst, inst, cfg) {
2264       for (int s = 0; s < inst->sources; s++) {
2265          fs_reg &src = inst->src[s];
2266          if (src.file != UNIFORM)
2267             continue;
2268
2269          const int pull_index = pull_constant_loc[src.reg + src.reg_offset];
2270          if (pull_index == -1)
2271             continue;
2272
2273          /* Attribute the generated instructions to the original one. */
2274          base_ir = inst->ir;
2275          current_annotation = inst->annotation;
2276
2277          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2278          fs_reg loaded = vgrf(glsl_type::float_type);
2279
2280          if (src.reladdr == NULL) {
2281             /* Fixed location: load its 16-byte block, pick the component. */
2282             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2283             fs_inst *pull =
2284                new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2285                                     loaded, surf_index, offset);
2286             inst->insert_before(block, pull);
2287             src.set_smear(pull_index & 3);
2288          } else {
2289             /* Indirectly addressed: the helper builds the load sequence. */
2290             exec_list list = VARYING_PULL_CONSTANT_LOAD(loaded, surf_index,
2291                                                         *src.reladdr,
2292                                                         pull_index);
2293             inst->insert_before(block, &list);
2294             src.reladdr = NULL;
2295          }
2296
2297          /* Point the source at the temporary VGRF. */
2298          src.file = GRF;
2299          src.reg = loaded.reg;
2300          src.reg_offset = 0;
2301          src.width = dispatch_width;
2302       }
2303    }
2304    invalidate_live_intervals();
2305 }
2306
     /**
      * Peephole simplifications that look at one instruction at a time.
      *
      * Patterns handled:
      *  - MOV.sat of an immediate: the saturate is folded into the immediate
      *    when brw_saturate_immediate() reports that it did so.
      *  - MUL a, 1 -> MOV a;  MUL a, 0 -> MOV 0 (immediate second source).
      *  - ADD a, 0 -> MOV a (immediate second source).
      *  - OR a, a -> MOV a.
      *  - LRP x, a, a -> MOV a.
      *  - CMP.ge -|a|, 0 -> CMP.z a, 0.
      *  - SEL a, a -> unpredicated MOV a.
      *  - SEL.sat with a float immediate second source: with modifier L/LE
      *    and an immediate >= 1.0, or with modifier G/GE and an immediate
      *    <= 0.0, the SEL becomes a MOV (still saturating) with the
      *    conditional modifier cleared.
      *  - RCP of the destination of the immediately preceding SQRT -> RSQ
      *    of the SQRT's source.
      *
      * Returns true if any instruction was changed.
      */
2307 bool
2308 fs_visitor::opt_algebraic()
2309 {
2310    bool progress = false;
2311
2312    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2313       switch (inst->opcode) {
2314       case BRW_OPCODE_MOV:
2315          if (inst->src[0].file != IMM)
2316             break;
2317
2318          if (inst->saturate) {
2319             if (inst->dst.type != inst->src[0].type)
2320                assert(!"unimplemented: saturate mixed types");
2321
2322             if (brw_saturate_immediate(inst->dst.type,
2323                                        &inst->src[0].fixed_hw_reg)) {
2324                inst->saturate = false;
2325                progress = true;
2326             }
2327          }
2328          break;
2329
2330       case BRW_OPCODE_MUL:
              /* "continue" moves on to the next instruction of the enclosing
               * loop, which has the same effect here as leaving the switch.
               */
2331          if (inst->src[1].file != IMM)
2332             continue;
2333
2334          /* a * 1.0 = a */
2335          if (inst->src[1].is_one()) {
2336             inst->opcode = BRW_OPCODE_MOV;
2337             inst->src[1] = reg_undef;
2338             progress = true;
2339             break;
2340          }
2341
2342          /* a * 0.0 = 0.0 */
2343          if (inst->src[1].is_zero()) {
2344             inst->opcode = BRW_OPCODE_MOV;
2345             inst->src[0] = inst->src[1];
2346             inst->src[1] = reg_undef;
2347             progress = true;
2348             break;
2349          }
2350
2351          break;
2352       case BRW_OPCODE_ADD:
2353          if (inst->src[1].file != IMM)
2354             continue;
2355
2356          /* a + 0.0 = a */
2357          if (inst->src[1].is_zero()) {
2358             inst->opcode = BRW_OPCODE_MOV;
2359             inst->src[1] = reg_undef;
2360             progress = true;
2361             break;
2362          }
2363          break;
2364       case BRW_OPCODE_OR:
              /* a | a = a */
2365          if (inst->src[0].equals(inst->src[1])) {
2366             inst->opcode = BRW_OPCODE_MOV;
2367             inst->src[1] = reg_undef;
2368             progress = true;
2369             break;
2370          }
2371          break;
2372       case BRW_OPCODE_LRP:
              /* Interpolating between two identical values yields that value. */
2373          if (inst->src[1].equals(inst->src[2])) {
2374             inst->opcode = BRW_OPCODE_MOV;
2375             inst->src[0] = inst->src[1];
2376             inst->src[1] = reg_undef;
2377             inst->src[2] = reg_undef;
2378             progress = true;
2379             break;
2380          }
2381          break;
2382       case BRW_OPCODE_CMP:
              /* -|a| >= 0 can only hold when a == 0. */
2383          if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2384              inst->src[0].abs &&
2385              inst->src[0].negate &&
2386              inst->src[1].is_zero()) {
2387             inst->src[0].abs = false;
2388             inst->src[0].negate = false;
2389             inst->conditional_mod = BRW_CONDITIONAL_Z;
2390             progress = true;
2391             break;
2392          }
2393          break;
2394       case BRW_OPCODE_SEL:
2395          if (inst->src[0].equals(inst->src[1])) {
                 /* Both choices are the same value, so the predicate no
                  * longer matters.
                  */
2396             inst->opcode = BRW_OPCODE_MOV;
2397             inst->src[1] = reg_undef;
2398             inst->predicate = BRW_PREDICATE_NONE;
2399             inst->predicate_inverse = false;
2400             progress = true;
2401          } else if (inst->saturate && inst->src[1].file == IMM) {
2402             switch (inst->conditional_mod) {
2403             case BRW_CONDITIONAL_LE:
2404             case BRW_CONDITIONAL_L:
2405                switch (inst->src[1].type) {
2406                case BRW_REGISTER_TYPE_F:
2407                   if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2408                      inst->opcode = BRW_OPCODE_MOV;
2409                      inst->src[1] = reg_undef;
                          /* Drop the SEL's conditional modifier, exactly as
                           * the G/GE case below does; it must not be carried
                           * over to the MOV.
                           */
                          inst->conditional_mod = BRW_CONDITIONAL_NONE;
2410                      progress = true;
2411                   }
2412                   break;
2413                default:
2414                   break;
2415                }
2416                break;
2417             case BRW_CONDITIONAL_GE:
2418             case BRW_CONDITIONAL_G:
2419                switch (inst->src[1].type) {
2420                case BRW_REGISTER_TYPE_F:
2421                   if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2422                      inst->opcode = BRW_OPCODE_MOV;
2423                      inst->src[1] = reg_undef;
2424                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2425                      progress = true;
2426                   }
2427                   break;
2428                default:
2429                   break;
2430                }
                    /* Explicit break instead of falling into "default". */
                    break;
2431             default:
2432                break;
2433             }
2434          }
2435          break;
2436       case SHADER_OPCODE_RCP: {
              /* rcp(sqrt(x)) = rsq(x), but only when the SQRT is the
               * instruction directly in front of the RCP.
               *
               * NOTE(review): for the first instruction of a block,
               * inst->prev presumably is the list's head sentinel rather
               * than a real fs_inst -- confirm that reading prev->opcode is
               * safe in that case.
               */
2437          fs_inst *prev = (fs_inst *)inst->prev;
2438          if (prev->opcode == SHADER_OPCODE_SQRT) {
2439             if (inst->src[0].equals(prev->dst)) {
2440                inst->opcode = SHADER_OPCODE_RSQ;
2441                inst->src[0] = prev->src[0];
2442                progress = true;
2443             }
2444          }
2445          break;
2446       }
2447       default:
2448          break;
2449       }
2450    }
2451
2452    return progress;
2453 }
2454
     /**
      * Give complete redefinitions of a virtual GRF a register of their own.
      *
      * Outside of IF/DO nesting (depth == 0), the first write that covers a
      * whole virtual GRF and is not a partial write keeps its register.
      * Every later such write of the same register allocates a fresh virtual
      * GRF, and GRF sources of the instructions that follow are redirected
      * to the most recently assigned one.  Any other GRF write just follows
      * the current mapping of its register.
      *
      * Returns true if any register reference was rewritten.
      */
2455 bool
2456 fs_visitor::opt_register_renaming()
2457 {
2458    bool progress = false;
        /* Number of IF/DO constructs that enclose the current instruction. */
2459    int depth = 0;
2460
        /* remap[r] is -1 until register r has been completely written at
         * depth 0; after that it is the virtual GRF currently standing in for
         * r (r itself after the first such write).  The table is sized for
         * the registers that exist on entry; registers allocated below are
         * never used as an index, because lookups use the register numbers
         * found in not-yet-rewritten instructions.
         */
2461    int remap[virtual_grf_count];
2462    memset(remap, -1, sizeof(int) * virtual_grf_count);
2463
2464    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2465       if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2466          depth++;
2467       } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2468                  inst->opcode == BRW_OPCODE_WHILE) {
2469          depth--;
2470       }
2471
2472       /* Rewrite instruction sources. */
           /* This happens before the destination is handled, so a source
            * that names the register being redefined still reads the
            * previous mapping.
            */
2473       for (int i = 0; i < inst->sources; i++) {
2474          if (inst->src[i].file == GRF &&
2475              remap[inst->src[i].reg] != -1 &&
2476              remap[inst->src[i].reg] != inst->src[i].reg) {
2477             inst->src[i].reg = remap[inst->src[i].reg];
2478             progress = true;
2479          }
2480       }
2481
           /* Original destination register.  It is only used as an index
            * into remap[] after the destination file has been checked to be
            * GRF.
            */
2482       const int dst = inst->dst.reg;
2483
2484       if (depth == 0 &&
2485           inst->dst.file == GRF &&
2486           virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
2487           !inst->is_partial_write()) {
2488          if (remap[dst] == -1) {
                 /* First complete write: the register keeps its number. */
2489             remap[dst] = dst;
2490          } else {
                 /* Complete redefinition: move it to a new virtual GRF. */
2491             remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
2492             inst->dst.reg = remap[dst];
2493             progress = true;
2494          }
2495       } else if (inst->dst.file == GRF &&
2496                  remap[dst] != -1 &&
2497                  remap[dst] != dst) {
              /* Any other write goes to whatever currently stands in for
               * the register.
               */
2498          inst->dst.reg = remap[dst];
2499          progress = true;
2500       }
2501    }
2502
2503    if (progress) {
2504       invalidate_live_intervals();
2505
           /* Keep delta_x/delta_y pointing at the register that holds their
            * value at the end of the program.
            */
2506       for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2507          if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2508             delta_x[i].reg = remap[delta_x[i].reg];
2509          }
2510       }
2511       for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2512          if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2513             delta_y[i].reg = remap[delta_y[i].reg];
2514          }
2515       }
2516    }
2517
2518    return progress;
2519 }
2520
     /**
      * Remove MOVs from a GRF to an MRF by making the instruction that
      * produced the GRF value write the MRF directly.
      *
      * A MOV qualifies only if it is a complete write that copies a
      * contiguous GRF source of the same type without abs/negate or
      * sub-register offset, and if the GRF's live interval ends at the MOV.
      * The producer is then searched for backwards inside the MOV's block;
      * the search is abandoned as soon as something is found that makes the
      * rewrite unsafe (see the comments in the scan loop).
      *
      * Does nothing on Gen7+.  Returns true if any MOV was removed; live
      * intervals are invalidated in that case.
      */
2521 bool
2522 fs_visitor::compute_to_mrf()
2523 {
2524    bool progress = false;
        /* Running instruction index, compared with virtual_grf_end[] below. */
2525    int next_ip = 0;
2526
2527    /* No MRFs on Gen >= 7. */
2528    if (brw->gen >= 7)
2529       return false;
2530
2531    calculate_live_intervals();
2532
2533    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2534       int ip = next_ip;
2535       next_ip++;
2536
           /* Skip everything that is not a plain, whole GRF -> MRF copy. */
2537       if (inst->opcode != BRW_OPCODE_MOV ||
2538           inst->is_partial_write() ||
2539           inst->dst.file != MRF || inst->src[0].file != GRF ||
2540           inst->dst.type != inst->src[0].type ||
2541           inst->src[0].abs || inst->src[0].negate ||
2542           !inst->src[0].is_contiguous() ||
2543           inst->src[0].subreg_offset)
2544          continue;
2545
2546       /* Work out which hardware MRF registers are written by this
2547        * instruction.
2548        */
           /* With the COMPR4 bit set the second register is mrf_low + 4; a
            * 16-wide write without it uses mrf_low and mrf_low + 1; anything
            * else writes a single MRF.
            */
2549       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2550       int mrf_high;
2551       if (inst->dst.reg & BRW_MRF_COMPR4) {
2552          mrf_high = mrf_low + 4;
2553       } else if (inst->exec_size == 16) {
2554          mrf_high = mrf_low + 1;
2555       } else {
2556          mrf_high = mrf_low;
2557       }
2558
2559       /* Can't compute-to-MRF this GRF if someone else was going to
2560        * read it later.
2561        */
2562       if (this->virtual_grf_end[inst->src[0].reg] > ip)
2563          continue;
2564
2565       /* Found a move of a GRF to a MRF.  Let's see if we can go
2566        * rewrite the thing that made this GRF to write into the MRF.
2567        */
2568       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2569          if (scan_inst->dst.file == GRF &&
2570              scan_inst->dst.reg == inst->src[0].reg) {
2571             /* Found the last thing to write our reg we want to turn
2572              * into a compute-to-MRF.
2573              */
                 /* Whatever is decided below, the scan ends here: every path
                  * through this branch leaves the loop.
                  */
2574
2575             /* If this one instruction didn't populate all the
2576              * channels, bail.  We might be able to rewrite everything
2577              * that writes that reg, but it would require smarter
2578              * tracking to delay the rewriting until complete success.
2579              */
2580             if (scan_inst->is_partial_write())
2581                break;
2582
2583             /* Things returning more than one register would need us to
2584              * understand coalescing out more than one MOV at a time.
2585              */
2586             if (scan_inst->regs_written > scan_inst->dst.width / 8)
2587                break;
2588
2589             /* SEND instructions can't have MRF as a destination. */
2590             if (scan_inst->mlen)
2591                break;
2592
2593             if (brw->gen == 6) {
2594                /* gen6 math instructions must have the destination be
2595                 * GRF, so no compute-to-MRF for them.
2596                 */
2597                if (scan_inst->is_math()) {
2598                   break;
2599                }
2600             }
2601
2602             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2603                /* Found the creator of our MRF's source value. */
                    /* Retarget the producer at the MRF, carry over the
                     * MOV's saturate flag and drop the MOV.
                     */
2604                scan_inst->dst.file = MRF;
2605                scan_inst->dst.reg = inst->dst.reg;
2606                scan_inst->saturate |= inst->saturate;
2607                inst->remove(block);
2608                progress = true;
2609             }
2610             break;
2611          }
2612
2613          /* We don't handle control flow here.  Most computation of
2614           * values that end up in MRFs are shortly before the MRF
2615           * write anyway.
2616           */
2617          if (block->start() == scan_inst)
2618             break;
2619
2620          /* You can't read from an MRF, so if someone else reads our
2621           * MRF's source GRF that we wanted to rewrite, that stops us.
2622           */
2623          bool interfered = false;
2624          for (int i = 0; i < scan_inst->sources; i++) {
2625             if (scan_inst->src[i].file == GRF &&
2626                 scan_inst->src[i].reg == inst->src[0].reg &&
2627                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2628                interfered = true;
2629             }
2630          }
2631          if (interfered)
2632             break;
2633
2634          if (scan_inst->dst.file == MRF) {
2635             /* If somebody else writes our MRF here, we can't
2636              * compute-to-MRF before that.
2637              */
                 /* The MRFs written by scan_inst are derived the same way as
                  * mrf_low/mrf_high above.
                  */
2638             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2639             int scan_mrf_high;
2640
2641             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2642                scan_mrf_high = scan_mrf_low + 4;
2643             } else if (scan_inst->exec_size == 16) {
2644                scan_mrf_high = scan_mrf_low + 1;
2645             } else {
2646                scan_mrf_high = scan_mrf_low;
2647             }
2648
2649             if (mrf_low == scan_mrf_low ||
2650                 mrf_low == scan_mrf_high ||
2651                 mrf_high == scan_mrf_low ||
2652                 mrf_high == scan_mrf_high) {
2653                break;
2654             }
2655          }
2656
2657          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2658             /* Found a SEND instruction, which means that there are
2659              * live values in MRFs from base_mrf to base_mrf +
2660              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2661              * above it.
2662              */
2663             if (mrf_low >= scan_inst->base_mrf &&
2664                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2665                break;
2666             }
2667             if (mrf_high >= scan_inst->base_mrf &&
2668                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2669                break;
2670             }
2671          }
2672       }
2673    }
2674
2675    if (progress)
2676       invalidate_live_intervals();
2677
2678    return progress;
2679 }
2680
2681 /**
2682  * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2683  * instructions to FS_OPCODE_REP_FB_WRITE.
2684  */
2685 void
2686 fs_visitor::emit_repclear_shader()
2687 {
2688    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2689    int base_mrf = 1;
2690    int color_mrf = base_mrf + 2;
2691
2692    fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2693                            fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2694    mov->force_writemask_all = true;
2695
2696    fs_inst *write;
2697    if (key->nr_color_regions == 1) {
2698       write = emit(FS_OPCODE_REP_FB_WRITE);
2699       write->saturate = key->clamp_fragment_color;
2700       write->base_mrf = color_mrf;
2701       write->target = 0;
2702       write->header_present = false;
2703       write->mlen = 1;
2704    } else {
2705       assume(key->nr_color_regions > 0);
2706       for (int i = 0; i < key->nr_color_regions; ++i) {
2707          write = emit(FS_OPCODE_REP_FB_WRITE);
2708          write->saturate = key->clamp_fragment_color;
2709          write->base_mrf = base_mrf;
2710          write->target = i;
2711          write->header_present = true;
2712          write->mlen = 3;
2713       }
2714    }
2715    write->eot = true;
2716
2717    calculate_cfg();
2718
2719    assign_constant_locations();
2720    assign_curb_setup();
2721
2722    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2723    assert(mov->src[0].file == HW_REG);
2724    mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2725 }
2726
2727 /**
2728  * Walks through basic blocks, looking for repeated MRF writes and
2729  * removing the later ones.
2730  */
2731 bool
2732 fs_visitor::remove_duplicate_mrf_writes()
2733 {
2734    fs_inst *last_mrf_move[16];
2735    bool progress = false;
2736
2737    /* Need to update the MRF tracking for compressed instructions. */
2738    if (dispatch_width == 16)
2739       return false;
2740
2741    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2742
2743    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2744       if (inst->is_control_flow()) {
2745          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2746       }
2747
2748       if (inst->opcode == BRW_OPCODE_MOV &&
2749           inst->dst.file == MRF) {
2750          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2751          if (prev_inst && inst->equals(prev_inst)) {
2752             inst->remove(block);
2753             progress = true;
2754             continue;
2755          }
2756       }
2757
2758       /* Clear out the last-write records for MRFs that were overwritten. */
2759       if (inst->dst.file == MRF) {
2760          last_mrf_move[inst->dst.reg] = NULL;
2761       }
2762
2763       if (inst->mlen > 0 && inst->base_mrf != -1) {
2764          /* Found a SEND instruction, which will include two or fewer
2765           * implied MRF writes.  We could do better here.
2766           */
2767          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2768             last_mrf_move[inst->base_mrf + i] = NULL;
2769          }
2770       }
2771
2772       /* Clear out any MRF move records whose sources got overwritten. */
2773       if (inst->dst.file == GRF) {
2774          for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2775             if (last_mrf_move[i] &&
2776                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2777                last_mrf_move[i] = NULL;
2778             }
2779          }
2780       }
2781
2782       if (inst->opcode == BRW_OPCODE_MOV &&
2783           inst->dst.file == MRF &&
2784           inst->src[0].file == GRF &&
2785           !inst->is_partial_write()) {
2786          last_mrf_move[inst->dst.reg] = inst;
2787       }
2788    }
2789
2790    if (progress)
2791       invalidate_live_intervals();
2792
2793    return progress;
2794 }
2795
2796 static void
2797 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2798                         int first_grf, int grf_len)
2799 {
2800    /* Clear the flag for registers that actually got read (as expected). */
2801    for (int i = 0; i < inst->sources; i++) {
2802       int grf;
2803       if (inst->src[i].file == GRF) {
2804          grf = inst->src[i].reg;
2805       } else if (inst->src[i].file == HW_REG &&
2806                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2807          grf = inst->src[i].fixed_hw_reg.nr;
2808       } else {
2809          continue;
2810       }
2811
2812       if (grf >= first_grf &&
2813           grf < first_grf + grf_len) {
2814          deps[grf - first_grf] = false;
2815          if (inst->exec_size == 16)
2816             deps[grf - first_grf + 1] = false;
2817       }
2818    }
2819 }
2820
2821 /**
2822  * Implements this workaround for the original 965:
2823  *
2824  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2825  *      check for post destination dependencies on this instruction, software
2826  *      must ensure that there is no destination hazard for the case of ‘write
2827  *      followed by a posted write’ shown in the following example.
2828  *
2829  *      1. mov r3 0
2830  *      2. send r3.xy <rest of send instruction>
2831  *      3. mov r2 r3
2832  *
2833  *      Due to no post-destination dependency check on the ‘send’, the above
2834  *      code sequence could have two instructions (1 and 2) in flight at the
2835  *      same time that both consider ‘r3’ as the target of their final writes.
2836  */
2837 void
2838 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2839                                                         fs_inst *inst)
2840 {
2841    int write_len = inst->regs_written;
2842    int first_write_grf = inst->dst.reg;
2843    bool needs_dep[BRW_MAX_MRF];
2844    assert(write_len < (int)sizeof(needs_dep) - 1);
2845
2846    memset(needs_dep, false, sizeof(needs_dep));
2847    memset(needs_dep, true, write_len);
2848
2849    clear_deps_for_inst_src(inst, dispatch_width,
2850                            needs_dep, first_write_grf, write_len);
2851
2852    /* Walk backwards looking for writes to registers we're writing which
2853     * aren't read since being written.  If we hit the start of the program,
2854     * we assume that there are no outstanding dependencies on entry to the
2855     * program.
2856     */
2857    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2858       /* If we hit control flow, assume that there *are* outstanding
2859        * dependencies, and force their cleanup before our instruction.
2860        */
2861       if (block->start() == scan_inst) {
2862          for (int i = 0; i < write_len; i++) {
2863             if (needs_dep[i]) {
2864                inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2865             }
2866          }
2867          return;
2868       }
2869
2870       /* We insert our reads as late as possible on the assumption that any
2871        * instruction but a MOV that might have left us an outstanding
2872        * dependency has more latency than a MOV.
2873        */
2874       if (scan_inst->dst.file == GRF) {
2875          for (int i = 0; i < scan_inst->regs_written; i++) {
2876             int reg = scan_inst->dst.reg + i;
2877
2878             if (reg >= first_write_grf &&
2879                 reg < first_write_grf + write_len &&
2880                 needs_dep[reg - first_write_grf]) {
2881                inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2882                needs_dep[reg - first_write_grf] = false;
2883                if (scan_inst->exec_size == 16)
2884                   needs_dep[reg - first_write_grf + 1] = false;
2885             }
2886          }
2887       }
2888
2889       /* Clear the flag for registers that actually got read (as expected). */
2890       clear_deps_for_inst_src(scan_inst, dispatch_width,
2891                               needs_dep, first_write_grf, write_len);
2892
2893       /* Continue the loop only if we haven't resolved all the dependencies */
2894       int i;
2895       for (i = 0; i < write_len; i++) {
2896          if (needs_dep[i])
2897             break;
2898       }
2899       if (i == write_len)
2900          return;
2901    }
2902 }
2903
2904 /**
2905  * Implements this workaround for the original 965:
2906  *
2907  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2908  *      used as a destination register until after it has been sourced by an
2909  *      instruction with a different destination register.
2910  */
2911 void
2912 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2913 {
2914    int write_len = inst->regs_written;
2915    int first_write_grf = inst->dst.reg;
2916    bool needs_dep[BRW_MAX_MRF];
2917    assert(write_len < (int)sizeof(needs_dep) - 1);
2918
2919    memset(needs_dep, false, sizeof(needs_dep));
2920    memset(needs_dep, true, write_len);
2921    /* Walk forwards looking for writes to registers we're writing which aren't
2922     * read before being written.
2923     */
2924    foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2925       /* If we hit control flow, force resolve all remaining dependencies. */
2926       if (block->end() == scan_inst) {
2927          for (int i = 0; i < write_len; i++) {
2928             if (needs_dep[i])
2929                scan_inst->insert_before(block,
2930                                         DEP_RESOLVE_MOV(first_write_grf + i));
2931          }
2932          return;
2933       }
2934
2935       /* Clear the flag for registers that actually got read (as expected). */
2936       clear_deps_for_inst_src(scan_inst, dispatch_width,
2937                               needs_dep, first_write_grf, write_len);
2938
2939       /* We insert our reads as late as possible since they're reading the
2940        * result of a SEND, which has massive latency.
2941        */
2942       if (scan_inst->dst.file == GRF &&
2943           scan_inst->dst.reg >= first_write_grf &&
2944           scan_inst->dst.reg < first_write_grf + write_len &&
2945           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2946          scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2947          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2948       }
2949
2950       /* Continue the loop only if we haven't resolved all the dependencies */
2951       int i;
2952       for (i = 0; i < write_len; i++) {
2953          if (needs_dep[i])
2954             break;
2955       }
2956       if (i == write_len)
2957          return;
2958    }
2959
2960    /* If we hit the end of the program, resolve all remaining dependencies out
2961     * of paranoia.
2962     */
2963    fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2964    assert(last_inst->eot);
2965    for (int i = 0; i < write_len; i++) {
2966       if (needs_dep[i])
2967          last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2968    }
2969 }
2970
2971 void
2972 fs_visitor::insert_gen4_send_dependency_workarounds()
2973 {
2974    if (brw->gen != 4 || brw->is_g4x)
2975       return;
2976
2977    bool progress = false;
2978
2979    /* Note that we're done with register allocation, so GRF fs_regs always
2980     * have a .reg_offset of 0.
2981     */
2982
2983    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2984       if (inst->mlen != 0 && inst->dst.file == GRF) {
2985          insert_gen4_pre_send_dependency_workarounds(block, inst);
2986          insert_gen4_post_send_dependency_workarounds(block, inst);
2987          progress = true;
2988       }
2989    }
2990
2991    if (progress)
2992       invalidate_live_intervals();
2993 }
2994
2995 /**
2996  * Turns the generic expression-style uniform pull constant load instruction
2997  * into a hardware-specific series of instructions for loading a pull
2998  * constant.
2999  *
3000  * The expression style allows the CSE pass before this to optimize out
3001  * repeated loads from the same offset, and gives the pre-register-allocation
3002  * scheduling full flexibility, while the conversion to native instructions
3003  * allows the post-register-allocation scheduler the best information
3004  * possible.
3005  *
3006  * Note that execution masking for setting up pull constant loads is special:
3007  * the channels that need to be written are unrelated to the current execution
3008  * mask, since a later instruction will use one of the result channels as a
3009  * source operand for all 8 or 16 of its channels.
3010  */
3011 void
3012 fs_visitor::lower_uniform_pull_constant_loads()
3013 {
3014    foreach_block_and_inst (block, fs_inst, inst, cfg) {
3015       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3016          continue;
3017
3018       if (brw->gen >= 7) {
3019          /* The offset arg before was a vec4-aligned byte offset.  We need to
3020           * turn it into a dword offset.
3021           */
3022          fs_reg const_offset_reg = inst->src[1];
3023          assert(const_offset_reg.file == IMM &&
3024                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3025          const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3026          fs_reg payload = vgrf(glsl_type::uint_type);
3027
3028          /* We have to use a message header on Skylake to get SIMD4x2 mode.
3029           * Reserve space for the register.
3030           */
3031          if (brw->gen >= 9) {
3032             payload.reg_offset++;
3033             virtual_grf_sizes[payload.reg] = 2;
3034          }
3035
3036          /* This is actually going to be a MOV, but since only the first dword
3037           * is accessed, we have a special opcode to do just that one.  Note
3038           * that this needs to be an operation that will be considered a def
3039           * by live variable analysis, or register allocation will explode.
3040           */
3041          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3042                                                8, payload, const_offset_reg);
3043          setup->force_writemask_all = true;
3044
3045          setup->ir = inst->ir;
3046          setup->annotation = inst->annotation;
3047          inst->insert_before(block, setup);
3048
3049          /* Similarly, this will only populate the first 4 channels of the
3050           * result register (since we only use smear values from 0-3), but we
3051           * don't tell the optimizer.
3052           */
3053          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3054          inst->src[1] = payload;
3055
3056          invalidate_live_intervals();
3057       } else {
3058          /* Before register allocation, we didn't tell the scheduler about the
3059           * MRF we use.  We know it's safe to use this MRF because nothing
3060           * else does except for register spill/unspill, which generates and
3061           * uses its MRF within a single IR instruction.
3062           */
3063          inst->base_mrf = 14;
3064          inst->mlen = 1;
3065       }
3066    }
3067 }
3068
3069 bool
3070 fs_visitor::lower_load_payload()
3071 {
3072    bool progress = false;
3073
3074    int vgrf_to_reg[virtual_grf_count];
3075    int reg_count = 16; /* Leave room for MRF */
3076    for (int i = 0; i < virtual_grf_count; ++i) {
3077       vgrf_to_reg[i] = reg_count;
3078       reg_count += virtual_grf_sizes[i];
3079    }
3080
3081    struct {
3082       bool written:1; /* Whether this register has ever been written */
3083       bool force_writemask_all:1;
3084       bool force_sechalf:1;
3085    } metadata[reg_count];
3086    memset(metadata, 0, sizeof(metadata));
3087
3088    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3089       int dst_reg;
3090       if (inst->dst.file == GRF) {
3091          dst_reg = vgrf_to_reg[inst->dst.reg];
3092       } else {
3093          /* MRF */
3094          dst_reg = inst->dst.reg;
3095       }
3096
3097       if (inst->dst.file == MRF || inst->dst.file == GRF) {
3098          bool force_sechalf = inst->force_sechalf;
3099          bool toggle_sechalf = inst->dst.width == 16 &&
3100                                type_sz(inst->dst.type) == 4;
3101          for (int i = 0; i < inst->regs_written; ++i) {
3102             metadata[dst_reg + i].written = true;
3103             metadata[dst_reg + i].force_sechalf = force_sechalf;
3104             metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3105             force_sechalf = (toggle_sechalf != force_sechalf);
3106          }
3107       }
3108
3109       if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3110          assert(inst->dst.file == MRF || inst->dst.file == GRF);
3111          fs_reg dst = inst->dst;
3112
3113          for (int i = 0; i < inst->sources; i++) {
3114             dst.width = inst->src[i].effective_width;
3115             dst.type = inst->src[i].type;
3116
3117             if (inst->src[i].file == BAD_FILE) {
3118                /* Do nothing but otherwise increment as normal */
3119             } else if (dst.file == MRF &&
3120                        dst.width == 8 &&
3121                        brw->has_compr4 &&
3122                        i + 4 < inst->sources &&
3123                        inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3124                fs_reg compr4_dst = dst;
3125                compr4_dst.reg += BRW_MRF_COMPR4;
3126                compr4_dst.width = 16;
3127                fs_reg compr4_src = inst->src[i];
3128                compr4_src.width = 16;
3129                fs_inst *mov = MOV(compr4_dst, compr4_src);
3130                mov->force_writemask_all = true;
3131                inst->insert_before(block, mov);
3132                /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3133                inst->src[i + 4].file = BAD_FILE;
3134             } else {
3135                fs_inst *mov = MOV(dst, inst->src[i]);
3136                if (inst->src[i].file == GRF) {
3137                   int src_reg = vgrf_to_reg[inst->src[i].reg] +
3138                                 inst->src[i].reg_offset;
3139                   mov->force_sechalf = metadata[src_reg].force_sechalf;
3140                   mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3141                   metadata[dst_reg] = metadata[src_reg];
3142                   if (dst.width * type_sz(dst.type) > 32) {
3143                      assert((!metadata[src_reg].written ||
3144                              !metadata[src_reg].force_sechalf) &&
3145                             (!metadata[src_reg + 1].written ||
3146                              metadata[src_reg + 1].force_sechalf));
3147                      metadata[dst_reg + 1] = metadata[src_reg + 1];
3148                   }
3149                } else {
3150                   metadata[dst_reg].force_writemask_all = false;
3151                   metadata[dst_reg].force_sechalf = false;
3152                   if (dst.width == 16) {
3153                      metadata[dst_reg + 1].force_writemask_all = false;
3154                      metadata[dst_reg + 1].force_sechalf = true;
3155                   }
3156                }
3157                inst->insert_before(block, mov);
3158             }
3159
3160             dst = offset(dst, 1);
3161          }
3162
3163          inst->remove(block);
3164          progress = true;
3165       }
3166    }
3167
3168    if (progress)
3169       invalidate_live_intervals();
3170
3171    return progress;
3172 }
3173
3174 void
3175 fs_visitor::dump_instructions()
3176 {
3177    dump_instructions(NULL);
3178 }
3179
3180 void
3181 fs_visitor::dump_instructions(const char *name)
3182 {
3183    calculate_register_pressure();
3184    FILE *file = stderr;
3185    if (name && geteuid() != 0) {
3186       file = fopen(name, "w");
3187       if (!file)
3188          file = stderr;
3189    }
3190
3191    int ip = 0, max_pressure = 0;
3192    foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3193       max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3194       fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3195       dump_instruction(inst, file);
3196       ++ip;
3197    }
3198    fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3199
3200    if (file != stderr) {
3201       fclose(file);
3202    }
3203 }
3204
3205 void
3206 fs_visitor::dump_instruction(backend_instruction *be_inst)
3207 {
3208    dump_instruction(be_inst, stderr);
3209 }
3210
3211 void
3212 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3213 {
3214    fs_inst *inst = (fs_inst *)be_inst;
3215
3216    if (inst->predicate) {
3217       fprintf(file, "(%cf0.%d) ",
3218              inst->predicate_inverse ? '-' : '+',
3219              inst->flag_subreg);
3220    }
3221
3222    fprintf(file, "%s", brw_instruction_name(inst->opcode));
3223    if (inst->saturate)
3224       fprintf(file, ".sat");
3225    if (inst->conditional_mod) {
3226       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3227       if (!inst->predicate &&
3228           (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3229                               inst->opcode != BRW_OPCODE_IF &&
3230                               inst->opcode != BRW_OPCODE_WHILE))) {
3231          fprintf(file, ".f0.%d", inst->flag_subreg);
3232       }
3233    }
3234    fprintf(file, "(%d) ", inst->exec_size);
3235
3236
3237    switch (inst->dst.file) {
3238    case GRF:
3239       fprintf(file, "vgrf%d", inst->dst.reg);
3240       if (inst->dst.width != dispatch_width)
3241          fprintf(file, "@%d", inst->dst.width);
3242       if (virtual_grf_sizes[inst->dst.reg] != inst->dst.width / 8 ||
3243           inst->dst.subreg_offset)
3244          fprintf(file, "+%d.%d",
3245                  inst->dst.reg_offset, inst->dst.subreg_offset);
3246       break;
3247    case MRF:
3248       fprintf(file, "m%d", inst->dst.reg);
3249       break;
3250    case BAD_FILE:
3251       fprintf(file, "(null)");
3252       break;
3253    case UNIFORM:
3254       fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3255       break;
3256    case ATTR:
3257       fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3258       break;
3259    case HW_REG:
3260       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3261          switch (inst->dst.fixed_hw_reg.nr) {
3262          case BRW_ARF_NULL:
3263             fprintf(file, "null");
3264             break;
3265          case BRW_ARF_ADDRESS:
3266             fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3267             break;
3268          case BRW_ARF_ACCUMULATOR:
3269             fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3270             break;
3271          case BRW_ARF_FLAG:
3272             fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3273                              inst->dst.fixed_hw_reg.subnr);
3274             break;
3275          default:
3276             fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3277                                inst->dst.fixed_hw_reg.subnr);
3278             break;
3279          }
3280       } else {
3281          fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3282       }
3283       if (inst->dst.fixed_hw_reg.subnr)
3284          fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3285       break;
3286    default:
3287       fprintf(file, "???");
3288       break;
3289    }
3290    fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3291
3292    for (int i = 0; i < inst->sources; i++) {
3293       if (inst->src[i].negate)
3294          fprintf(file, "-");
3295       if (inst->src[i].abs)
3296          fprintf(file, "|");
3297       switch (inst->src[i].file) {
3298       case GRF:
3299          fprintf(file, "vgrf%d", inst->src[i].reg);
3300          if (inst->src[i].width != dispatch_width)
3301             fprintf(file, "@%d", inst->src[i].width);
3302          if (virtual_grf_sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3303              inst->src[i].subreg_offset)
3304             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3305                     inst->src[i].subreg_offset);
3306          break;
3307       case MRF:
3308          fprintf(file, "***m%d***", inst->src[i].reg);
3309          break;
3310       case ATTR:
3311          fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3312          break;
3313       case UNIFORM:
3314          fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3315          if (inst->src[i].reladdr) {
3316             fprintf(file, "+reladdr");
3317          } else if (inst->src[i].subreg_offset) {
3318             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3319                     inst->src[i].subreg_offset);
3320          }
3321          break;
3322       case BAD_FILE:
3323          fprintf(file, "(null)");
3324          break;
3325       case IMM:
3326          switch (inst->src[i].type) {
3327          case BRW_REGISTER_TYPE_F:
3328             fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3329             break;
3330          case BRW_REGISTER_TYPE_D:
3331             fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3332             break;
3333          case BRW_REGISTER_TYPE_UD:
3334             fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3335             break;
3336          case BRW_REGISTER_TYPE_VF:
3337             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3338                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
3339                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
3340                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3341                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3342             break;
3343          default:
3344             fprintf(file, "???");
3345             break;
3346          }
3347          break;
3348       case HW_REG:
3349          if (inst->src[i].fixed_hw_reg.negate)
3350             fprintf(file, "-");
3351          if (inst->src[i].fixed_hw_reg.abs)
3352             fprintf(file, "|");
3353          if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3354             switch (inst->src[i].fixed_hw_reg.nr) {
3355             case BRW_ARF_NULL:
3356                fprintf(file, "null");
3357                break;
3358             case BRW_ARF_ADDRESS:
3359                fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3360                break;
3361             case BRW_ARF_ACCUMULATOR:
3362                fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3363                break;
3364             case BRW_ARF_FLAG:
3365                fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3366                                 inst->src[i].fixed_hw_reg.subnr);
3367                break;
3368             default:
3369                fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3370                                   inst->src[i].fixed_hw_reg.subnr);
3371                break;
3372             }
3373          } else {
3374             fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3375          }
3376          if (inst->src[i].fixed_hw_reg.subnr)
3377             fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3378          if (inst->src[i].fixed_hw_reg.abs)
3379             fprintf(file, "|");
3380          break;
3381       default:
3382          fprintf(file, "???");
3383          break;
3384       }
3385       if (inst->src[i].abs)
3386          fprintf(file, "|");
3387
3388       if (inst->src[i].file != IMM) {
3389          fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3390       }
3391
3392       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3393          fprintf(file, ", ");
3394    }
3395
3396    fprintf(file, " ");
3397
3398    if (dispatch_width == 16 && inst->exec_size == 8) {
3399       if (inst->force_sechalf)
3400          fprintf(file, "2ndhalf ");
3401       else
3402          fprintf(file, "1sthalf ");
3403    }
3404
3405    fprintf(file, "\n");
3406 }
3407
3408 /**
3409  * Possibly returns an instruction that set up @param reg.
3410  *
3411  * Sometimes we want to take the result of some expression/variable
3412  * dereference tree and rewrite the instruction generating the result
3413  * of the tree.  When processing the tree, we know that the
3414  * instructions generated are all writing temporaries that are dead
3415  * outside of this tree.  So, if we have some instructions that write
3416  * a temporary, we're free to point that temp write somewhere else.
3417  *
3418  * Note that this doesn't guarantee that the instruction generated
3419  * only reg -- it might be the size=4 destination of a texture instruction.
3420  */
3421 fs_inst *
3422 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3423                                            fs_inst *end,
3424                                            const fs_reg &reg)
3425 {
3426    if (end == start ||
3427        end->is_partial_write() ||
3428        reg.reladdr ||
3429        !reg.equals(end->dst)) {
3430       return NULL;
3431    } else {
3432       return end;
3433    }
3434 }
3435
3436 void
3437 fs_visitor::setup_payload_gen6()
3438 {
3439    bool uses_depth =
3440       (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3441    unsigned barycentric_interp_modes =
3442       (stage == MESA_SHADER_FRAGMENT) ?
3443       ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3444
3445    assert(brw->gen >= 6);
3446
3447    /* R0-1: masks, pixel X/Y coordinates. */
3448    payload.num_regs = 2;
3449    /* R2: only for 32-pixel dispatch.*/
3450
3451    /* R3-26: barycentric interpolation coordinates.  These appear in the
3452     * same order that they appear in the brw_wm_barycentric_interp_mode
3453     * enum.  Each set of coordinates occupies 2 registers if dispatch width
3454     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
3455     * appear if they were enabled using the "Barycentric Interpolation
3456     * Mode" bits in WM_STATE.
3457     */
3458    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3459       if (barycentric_interp_modes & (1 << i)) {
3460          payload.barycentric_coord_reg[i] = payload.num_regs;
3461          payload.num_regs += 2;
3462          if (dispatch_width == 16) {
3463             payload.num_regs += 2;
3464          }
3465       }
3466    }
3467
3468    /* R27: interpolated depth if uses source depth */
3469    if (uses_depth) {
3470       payload.source_depth_reg = payload.num_regs;
3471       payload.num_regs++;
3472       if (dispatch_width == 16) {
3473          /* R28: interpolated depth if not SIMD8. */
3474          payload.num_regs++;
3475       }
3476    }
3477    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3478    if (uses_depth) {
3479       payload.source_w_reg = payload.num_regs;
3480       payload.num_regs++;
3481       if (dispatch_width == 16) {
3482          /* R30: interpolated W if not SIMD8. */
3483          payload.num_regs++;
3484       }
3485    }
3486
3487    if (stage == MESA_SHADER_FRAGMENT) {
3488       brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3489       brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3490       prog_data->uses_pos_offset = key->compute_pos_offset;
3491       /* R31: MSAA position offsets. */
3492       if (prog_data->uses_pos_offset) {
3493          payload.sample_pos_reg = payload.num_regs;
3494          payload.num_regs++;
3495       }
3496    }
3497
3498    /* R32: MSAA input coverage mask */
3499    if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3500       assert(brw->gen >= 7);
3501       payload.sample_mask_in_reg = payload.num_regs;
3502       payload.num_regs++;
3503       if (dispatch_width == 16) {
3504          /* R33: input coverage mask if not SIMD8. */
3505          payload.num_regs++;
3506       }
3507    }
3508
3509    /* R34-: bary for 32-pixel. */
3510    /* R58-59: interp W for 32-pixel. */
3511
3512    if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3513       source_depth_to_render_target = true;
3514    }
3515 }
3516
3517 void
3518 fs_visitor::setup_vs_payload()
3519 {
3520    /* R0: thread header, R1: urb handles */
3521    payload.num_regs = 2;
3522 }
3523
3524 void
3525 fs_visitor::assign_binding_table_offsets()
3526 {
3527    assert(stage == MESA_SHADER_FRAGMENT);
3528    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3529    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3530    uint32_t next_binding_table_offset = 0;
3531
3532    /* If there are no color regions, we still perform an FB write to a null
3533     * renderbuffer, which we place at surface index 0.
3534     */
3535    prog_data->binding_table.render_target_start = next_binding_table_offset;
3536    next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3537
3538    assign_common_binding_table_offsets(next_binding_table_offset);
3539 }
3540
3541 void
3542 fs_visitor::calculate_register_pressure()
3543 {
3544    invalidate_live_intervals();
3545    calculate_live_intervals();
3546
3547    unsigned num_instructions = 0;
3548    foreach_block(block, cfg)
3549       num_instructions += block->instructions.length();
3550
3551    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3552
3553    for (int reg = 0; reg < virtual_grf_count; reg++) {
3554       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3555          regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3556    }
3557 }
3558
3559 void
3560 fs_visitor::optimize()
3561 {
3562    const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3563
3564    calculate_cfg();
3565
3566    split_virtual_grfs();
3567
3568    move_uniform_array_access_to_pull_constants();
3569    assign_constant_locations();
3570    demote_pull_constants();
3571
3572 #define OPT(pass, args...) ({                                           \
3573       pass_num++;                                                       \
3574       bool this_progress = pass(args);                                  \
3575                                                                         \
3576       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
3577          char filename[64];                                             \
3578          snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,              \
3579                   stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3580                                                                         \
3581          backend_visitor::dump_instructions(filename);                  \
3582       }                                                                 \
3583                                                                         \
3584       progress = progress || this_progress;                             \
3585       this_progress;                                                    \
3586    })
3587
3588    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3589       char filename[64];
3590       snprintf(filename, 64, "%s%d-%04d-00-start",
3591                stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3592
3593       backend_visitor::dump_instructions(filename);
3594    }
3595
3596    bool progress;
3597    int iteration = 0;
3598    int pass_num = 0;
3599    do {
3600       progress = false;
3601       pass_num = 0;
3602       iteration++;
3603
3604       OPT(remove_duplicate_mrf_writes);
3605
3606       OPT(opt_algebraic);
3607       OPT(opt_cse);
3608       OPT(opt_copy_propagate);
3609       OPT(opt_peephole_predicated_break);
3610       OPT(opt_cmod_propagation);
3611       OPT(dead_code_eliminate);
3612       OPT(opt_peephole_sel);
3613       OPT(dead_control_flow_eliminate, this);
3614       OPT(opt_register_renaming);
3615       OPT(opt_saturate_propagation);
3616       OPT(register_coalesce);
3617       OPT(compute_to_mrf);
3618
3619       OPT(compact_virtual_grfs);
3620    } while (progress);
3621
3622    pass_num = 0;
3623
3624    if (OPT(lower_load_payload)) {
3625       split_virtual_grfs();
3626       OPT(register_coalesce);
3627       OPT(compute_to_mrf);
3628       OPT(dead_code_eliminate);
3629    }
3630
3631    lower_uniform_pull_constant_loads();
3632 }
3633
3634 /**
3635  * Three source instruction must have a GRF/MRF destination register.
3636  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
3637  */
3638 void
3639 fs_visitor::fixup_3src_null_dest()
3640 {
3641    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3642       if (inst->is_3src() && inst->dst.is_null()) {
3643          inst->dst = fs_reg(GRF, virtual_grf_alloc(dispatch_width / 8),
3644                             inst->dst.type);
3645       }
3646    }
3647 }
3648
3649 void
3650 fs_visitor::allocate_registers()
3651 {
3652    bool allocated_without_spills;
3653
3654    static const enum instruction_scheduler_mode pre_modes[] = {
3655       SCHEDULE_PRE,
3656       SCHEDULE_PRE_NON_LIFO,
3657       SCHEDULE_PRE_LIFO,
3658    };
3659
3660    /* Try each scheduling heuristic to see if it can successfully register
3661     * allocate without spilling.  They should be ordered by decreasing
3662     * performance but increasing likelihood of allocating.
3663     */
3664    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3665       schedule_instructions(pre_modes[i]);
3666
3667       if (0) {
3668          assign_regs_trivial();
3669          allocated_without_spills = true;
3670       } else {
3671          allocated_without_spills = assign_regs(false);
3672       }
3673       if (allocated_without_spills)
3674          break;
3675    }
3676
3677    if (!allocated_without_spills) {
3678       const char *stage_name = stage == MESA_SHADER_VERTEX ?
3679          "Vertex" : "Fragment";
3680
3681       /* We assume that any spilling is worse than just dropping back to
3682        * SIMD8.  There's probably actually some intermediate point where
3683        * SIMD16 with a couple of spills is still better.
3684        */
3685       if (dispatch_width == 16) {
3686          fail("Failure to register allocate.  Reduce number of "
3687               "live scalar values to avoid this.");
3688       } else {
3689          perf_debug("%s shader triggered register spilling.  "
3690                     "Try reducing the number of live scalar values to "
3691                     "improve performance.\n", stage_name);
3692       }
3693
3694       /* Since we're out of heuristics, just go spill registers until we
3695        * get an allocation.
3696        */
3697       while (!assign_regs(true)) {
3698          if (failed)
3699             break;
3700       }
3701    }
3702
3703    /* This must come after all optimization and register allocation, since
3704     * it inserts dead code that happens to have side effects, and it does
3705     * so based on the actual physical registers in use.
3706     */
3707    insert_gen4_send_dependency_workarounds();
3708
3709    if (failed)
3710       return;
3711
3712    if (!allocated_without_spills)
3713       schedule_instructions(SCHEDULE_POST);
3714
3715    if (last_scratch > 0)
3716       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3717 }
3718
3719 bool
3720 fs_visitor::run_vs()
3721 {
3722    assert(stage == MESA_SHADER_VERTEX);
3723
3724    assign_common_binding_table_offsets(0);
3725    setup_vs_payload();
3726
3727    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3728       emit_shader_time_begin();
3729
3730    foreach_in_list(ir_instruction, ir, shader->base.ir) {
3731       base_ir = ir;
3732       this->result = reg_undef;
3733       ir->accept(this);
3734    }
3735    base_ir = NULL;
3736    if (failed)
3737       return false;
3738
3739    emit_urb_writes();
3740
3741    optimize();
3742
3743    assign_curb_setup();
3744    assign_vs_urb_setup();
3745
3746    fixup_3src_null_dest();
3747    allocate_registers();
3748
3749    return !failed;
3750 }
3751
3752 bool
3753 fs_visitor::run_fs()
3754 {
3755    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3756    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3757
3758    assert(stage == MESA_SHADER_FRAGMENT);
3759
3760    sanity_param_count = prog->Parameters->NumParameters;
3761
3762    assign_binding_table_offsets();
3763
3764    if (brw->gen >= 6)
3765       setup_payload_gen6();
3766    else
3767       setup_payload_gen4();
3768
3769    if (0) {
3770       emit_dummy_fs();
3771    } else if (brw->use_rep_send && dispatch_width == 16) {
3772       emit_repclear_shader();
3773    } else {
3774       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3775          emit_shader_time_begin();
3776
3777       calculate_urb_setup();
3778       if (prog->InputsRead > 0) {
3779          if (brw->gen < 6)
3780             emit_interpolation_setup_gen4();
3781          else
3782             emit_interpolation_setup_gen6();
3783       }
3784
3785       /* We handle discards by keeping track of the still-live pixels in f0.1.
3786        * Initialize it with the dispatched pixels.
3787        */
3788       if (wm_prog_data->uses_kill) {
3789          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3790          discard_init->flag_subreg = 1;
3791       }
3792
3793       /* Generate FS IR for main().  (the visitor only descends into
3794        * functions called "main").
3795        */
3796       if (shader) {
3797          if (getenv("INTEL_USE_NIR") != NULL) {
3798             emit_nir_code();
3799          } else {
3800             foreach_in_list(ir_instruction, ir, shader->base.ir) {
3801                base_ir = ir;
3802                this->result = reg_undef;
3803                ir->accept(this);
3804             }
3805          }
3806       } else {
3807          emit_fragment_program_code();
3808       }
3809       base_ir = NULL;
3810       if (failed)
3811          return false;
3812
3813       emit(FS_OPCODE_PLACEHOLDER_HALT);
3814
3815       if (wm_key->alpha_test_func)
3816          emit_alpha_test();
3817
3818       emit_fb_writes();
3819
3820       optimize();
3821
3822       assign_curb_setup();
3823       assign_urb_setup();
3824
3825       fixup_3src_null_dest();
3826       allocate_registers();
3827
3828       if (failed)
3829          return false;
3830    }
3831
3832    if (dispatch_width == 8)
3833       wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3834    else
3835       wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3836
3837    /* If any state parameters were appended, then ParameterValues could have
3838     * been realloced, in which case the driver uniform storage set up by
3839     * _mesa_associate_uniform_storage() would point to freed memory.  Make
3840     * sure that didn't happen.
3841     */
3842    assert(sanity_param_count == prog->Parameters->NumParameters);
3843
3844    return !failed;
3845 }
3846
3847 const unsigned *
3848 brw_wm_fs_emit(struct brw_context *brw,
3849                void *mem_ctx,
3850                const struct brw_wm_prog_key *key,
3851                struct brw_wm_prog_data *prog_data,
3852                struct gl_fragment_program *fp,
3853                struct gl_shader_program *prog,
3854                unsigned *final_assembly_size)
3855 {
3856    bool start_busy = false;
3857    double start_time = 0;
3858
3859    if (unlikely(brw->perf_debug)) {
3860       start_busy = (brw->batch.last_bo &&
3861                     drm_intel_bo_busy(brw->batch.last_bo));
3862       start_time = get_time();
3863    }
3864
3865    struct brw_shader *shader = NULL;
3866    if (prog)
3867       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3868
3869    if (unlikely(INTEL_DEBUG & DEBUG_WM))
3870       brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3871
3872    /* Now the main event: Visit the shader IR and generate our FS IR for it.
3873     */
3874    fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3875    if (!v.run_fs()) {
3876       if (prog) {
3877          prog->LinkStatus = false;
3878          ralloc_strcat(&prog->InfoLog, v.fail_msg);
3879       }
3880
3881       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3882                     v.fail_msg);
3883
3884       return NULL;
3885    }
3886
3887    cfg_t *simd16_cfg = NULL;
3888    fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3889    if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3890                                brw->use_rep_send)) {
3891       if (!v.simd16_unsupported) {
3892          /* Try a SIMD16 compile */
3893          v2.import_uniforms(&v);
3894          if (!v2.run_fs()) {
3895             perf_debug("SIMD16 shader failed to compile, falling back to "
3896                        "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3897          } else {
3898             simd16_cfg = v2.cfg;
3899          }
3900       } else {
3901          perf_debug("SIMD16 shader unsupported, falling back to "
3902                     "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3903       }
3904    }
3905
3906    cfg_t *simd8_cfg;
3907    int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3908    if (no_simd8 && simd16_cfg) {
3909       simd8_cfg = NULL;
3910       prog_data->no_8 = true;
3911    } else {
3912       simd8_cfg = v.cfg;
3913       prog_data->no_8 = false;
3914    }
3915
3916    fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
3917                   &fp->Base, v.runtime_check_aads_emit, "FS");
3918
3919    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3920       char *name;
3921       if (prog)
3922          name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
3923                                 prog->Label ? prog->Label : "unnamed",
3924                                 prog->Name);
3925       else
3926          name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
3927
3928       g.enable_debug(name);
3929    }
3930
3931    if (simd8_cfg)
3932       g.generate_code(simd8_cfg, 8);
3933    if (simd16_cfg)
3934       prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
3935
3936    if (unlikely(brw->perf_debug) && shader) {
3937       if (shader->compiled_once)
3938          brw_wm_debug_recompile(brw, prog, key);
3939       shader->compiled_once = true;
3940
3941       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3942          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3943                     (get_time() - start_time) * 1000);
3944       }
3945    }
3946
3947    return g.get_assembly(final_assembly_size);
3948 }
3949
3950 extern "C" bool
3951 brw_fs_precompile(struct gl_context *ctx,
3952                   struct gl_shader_program *shader_prog,
3953                   struct gl_program *prog)
3954 {
3955    struct brw_context *brw = brw_context(ctx);
3956    struct brw_wm_prog_key key;
3957
3958    struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
3959    struct brw_fragment_program *bfp = brw_fragment_program(fp);
3960    bool program_uses_dfdy = fp->UsesDFdy;
3961
3962    memset(&key, 0, sizeof(key));
3963
3964    if (brw->gen < 6) {
3965       if (fp->UsesKill)
3966          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3967
3968       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3969          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3970
3971       /* Just assume depth testing. */
3972       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3973       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3974    }
3975
3976    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3977                                          BRW_FS_VARYING_INPUT_MASK) > 16)
3978       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3979
3980    const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
3981    unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3982    for (unsigned i = 0; i < sampler_count; i++) {
3983       if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
3984          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3985          key.tex.swizzles[i] =
3986             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3987       } else {
3988          /* Color sampler: assume no swizzling. */
3989          key.tex.swizzles[i] = SWIZZLE_XYZW;
3990       }
3991    }
3992
3993    if (fp->Base.InputsRead & VARYING_BIT_POS) {
3994       key.drawable_height = ctx->DrawBuffer->Height;
3995    }
3996
3997    key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3998          ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3999          BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4000
4001    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4002       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4003                           key.nr_color_regions > 1;
4004    }
4005
4006    key.program_string_id = bfp->id;
4007
4008    uint32_t old_prog_offset = brw->wm.base.prog_offset;
4009    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4010
4011    bool success = do_wm_prog(brw, shader_prog, bfp, &key);
4012
4013    brw->wm.base.prog_offset = old_prog_offset;
4014    brw->wm.prog_data = old_prog_data;
4015
4016    return success;
4017 }