i965/fs: Set smear on shader_time diff register.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
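      /* For example, an 8-wide float destination with stride 1 covers
       * 8 * 1 * 4 = 32 bytes, i.e. one 32-byte GRF, while a 16-wide float
       * destination covers 64 bytes and regs_written becomes 2.
       */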
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
379
380 return inst;
381 }
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396 * We break down the const_offset to a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402 */
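   /* As a worked example: for const_offset == 6, the vec4-aligned part
    * (6 & ~3 == 4) is folded into vec4_offset below, and the remaining
    * component (6 & 3 == 2) selects which of the loaded components gets
    * MOVed into dst at the end (times `scale` when the gen4 SIMD16
    * message is used).
    */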
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (brw->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (brw->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (brw->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (brw->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
458 /* The caller always wants uncompressed to emit the minimal extra
459 * dependencies, and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return (reg.file == dst.file &&
491 reg.reg == dst.reg &&
492 reg.reg_offset >= dst.reg_offset &&
493 reg.reg_offset < dst.reg_offset + regs_written);
494 }
495
496 bool
497 fs_inst::is_send_from_grf() const
498 {
499 switch (opcode) {
500 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
501 case SHADER_OPCODE_SHADER_TIME_ADD:
502 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
503 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
504 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
505 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
506 case SHADER_OPCODE_UNTYPED_ATOMIC:
507 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
508 case SHADER_OPCODE_URB_WRITE_SIMD8:
509 return true;
510 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
511 return src[1].file == GRF;
512 case FS_OPCODE_FB_WRITE:
513 return src[0].file == GRF;
514 default:
515 if (is_tex())
516 return src[0].file == GRF;
517
518 return false;
519 }
520 }
521
522 bool
523 fs_inst::can_do_source_mods(struct brw_context *brw)
524 {
525 if (brw->gen == 6 && is_math())
526 return false;
527
528 if (is_send_from_grf())
529 return false;
530
531 if (!backend_instruction::can_do_source_mods())
532 return false;
533
534 return true;
535 }
536
537 void
538 fs_reg::init()
539 {
540 memset(this, 0, sizeof(*this));
541 stride = 1;
542 }
543
544 /** Generic unset register constructor. */
545 fs_reg::fs_reg()
546 {
547 init();
548 this->file = BAD_FILE;
549 }
550
551 /** Immediate value constructor. */
552 fs_reg::fs_reg(float f)
553 {
554 init();
555 this->file = IMM;
556 this->type = BRW_REGISTER_TYPE_F;
557 this->fixed_hw_reg.dw1.f = f;
558 this->width = 1;
559 }
560
561 /** Immediate value constructor. */
562 fs_reg::fs_reg(int32_t i)
563 {
564 init();
565 this->file = IMM;
566 this->type = BRW_REGISTER_TYPE_D;
567 this->fixed_hw_reg.dw1.d = i;
568 this->width = 1;
569 }
570
571 /** Immediate value constructor. */
572 fs_reg::fs_reg(uint32_t u)
573 {
574 init();
575 this->file = IMM;
576 this->type = BRW_REGISTER_TYPE_UD;
577 this->fixed_hw_reg.dw1.ud = u;
578 this->width = 1;
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf[4])
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
588 }
589
590 /** Vector float immediate value constructor. */
591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
592 {
593 init();
594 this->file = IMM;
595 this->type = BRW_REGISTER_TYPE_VF;
596 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
597 (vf1 << 8) |
598 (vf2 << 16) |
599 (vf3 << 24);
600 }
601
602 /** Fixed brw_reg. */
603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
604 {
605 init();
606 this->file = HW_REG;
607 this->fixed_hw_reg = fixed_hw_reg;
608 this->type = fixed_hw_reg.type;
609 this->width = 1 << fixed_hw_reg.width;
610 }
611
612 bool
613 fs_reg::equals(const fs_reg &r) const
614 {
615 return (file == r.file &&
616 reg == r.reg &&
617 reg_offset == r.reg_offset &&
618 subreg_offset == r.subreg_offset &&
619 type == r.type &&
620 negate == r.negate &&
621 abs == r.abs &&
622 !reladdr && !r.reladdr &&
623 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
624 width == r.width &&
625 stride == r.stride);
626 }
627
628 fs_reg &
629 fs_reg::set_smear(unsigned subreg)
630 {
631 assert(file != HW_REG && file != IMM);
632 subreg_offset = subreg * type_sz(type);
633 stride = 0;
634 return *this;
635 }
636
637 bool
638 fs_reg::is_contiguous() const
639 {
640 return stride == 1;
641 }
642
643 int
644 fs_visitor::type_size(const struct glsl_type *type)
645 {
646 unsigned int size, i;
647
648 switch (type->base_type) {
649 case GLSL_TYPE_UINT:
650 case GLSL_TYPE_INT:
651 case GLSL_TYPE_FLOAT:
652 case GLSL_TYPE_BOOL:
653 return type->components();
654 case GLSL_TYPE_ARRAY:
655 return type_size(type->fields.array) * type->length;
656 case GLSL_TYPE_STRUCT:
657 size = 0;
658 for (i = 0; i < type->length; i++) {
659 size += type_size(type->fields.structure[i].type);
660 }
661 return size;
662 case GLSL_TYPE_SAMPLER:
663 /* Samplers take up no register space, since they're baked in at
664 * link time.
665 */
666 return 0;
667 case GLSL_TYPE_ATOMIC_UINT:
668 return 0;
669 case GLSL_TYPE_IMAGE:
670 case GLSL_TYPE_VOID:
671 case GLSL_TYPE_ERROR:
672 case GLSL_TYPE_INTERFACE:
673 case GLSL_TYPE_DOUBLE:
674 unreachable("not reached");
675 }
676
677 return 0;
678 }
679
680 fs_reg
681 fs_visitor::get_timestamp()
682 {
683 assert(brw->gen >= 7);
684
685 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
686 BRW_ARF_TIMESTAMP,
687 0),
688 BRW_REGISTER_TYPE_UD));
689
690 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
691
692 fs_inst *mov = emit(MOV(dst, ts));
693 /* We want to read the 3 fields we care about even if it's not enabled in
694 * the dispatch.
695 */
696 mov->force_writemask_all = true;
697
698 /* The caller wants the low 32 bits of the timestamp. Since it's running
 699     * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
700 * which is plenty of time for our purposes. It is identical across the
701 * EUs, but since it's tracking GPU core speed it will increment at a
702 * varying rate as render P-states change.
703 *
704 * The caller could also check if render P-states have changed (or anything
705 * else that might disrupt timing) by setting smear to 2 and checking if
706 * that field is != 0.
707 */
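   /* Roughly: 2^32 cycles at ~1.2e9 Hz is about 3.6 seconds, which is
    * where the "roll over every ~3 seconds" estimate above comes from.
    */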
708 dst.set_smear(0);
709
710 return dst;
711 }
712
713 void
714 fs_visitor::emit_shader_time_begin()
715 {
716 current_annotation = "shader time start";
717 shader_start_time = get_timestamp();
718 }
719
720 void
721 fs_visitor::emit_shader_time_end()
722 {
723 current_annotation = "shader time end";
724
725 enum shader_time_shader_type type, written_type, reset_type;
726 switch (stage) {
727 case MESA_SHADER_VERTEX:
728 type = ST_VS;
729 written_type = ST_VS_WRITTEN;
730 reset_type = ST_VS_RESET;
731 break;
732 case MESA_SHADER_GEOMETRY:
733 type = ST_GS;
734 written_type = ST_GS_WRITTEN;
735 reset_type = ST_GS_RESET;
736 break;
737 case MESA_SHADER_FRAGMENT:
738 if (dispatch_width == 8) {
739 type = ST_FS8;
740 written_type = ST_FS8_WRITTEN;
741 reset_type = ST_FS8_RESET;
742 } else {
743 assert(dispatch_width == 16);
744 type = ST_FS16;
745 written_type = ST_FS16_WRITTEN;
746 reset_type = ST_FS16_RESET;
747 }
748 break;
749 default:
750 unreachable("fs_visitor::emit_shader_time_end missing code");
751 }
752
753 fs_reg shader_end_time = get_timestamp();
754
755 /* Check that there weren't any timestamp reset events (assuming these
756 * were the only two timestamp reads that happened).
757 */
758 fs_reg reset = shader_end_time;
759 reset.set_smear(2);
760 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
761 test->conditional_mod = BRW_CONDITIONAL_Z;
762 test->force_writemask_all = true;
763 emit(IF(BRW_PREDICATE_NORMAL));
764
765 fs_reg start = shader_start_time;
766 start.negate = true;
767 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
768 diff.set_smear(0);
769 fs_inst *add = ADD(diff, start, shader_end_time);
770 add->force_writemask_all = true;
771 emit(add);
772
773 /* If there were no instructions between the two timestamp gets, the diff
774 * is 2 cycles. Remove that overhead, so I can forget about that when
775 * trying to determine the time taken for single instructions.
776 */
777 add = ADD(diff, diff, fs_reg(-2u));
778 add->force_writemask_all = true;
779 emit(add);
780
781 emit_shader_time_write(type, diff);
782 emit_shader_time_write(written_type, fs_reg(1u));
783 emit(BRW_OPCODE_ELSE);
784 emit_shader_time_write(reset_type, fs_reg(1u));
785 emit(BRW_OPCODE_ENDIF);
786 }
787
788 void
789 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
790 fs_reg value)
791 {
792 int shader_time_index =
793 brw_get_shader_time_index(brw, shader_prog, prog, type);
794 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
795
796 fs_reg payload;
797 if (dispatch_width == 8)
798 payload = vgrf(glsl_type::uvec2_type);
799 else
800 payload = vgrf(glsl_type::uint_type);
801
802 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
803 fs_reg(), payload, offset, value));
804 }
805
806 void
807 fs_visitor::vfail(const char *format, va_list va)
808 {
809 char *msg;
810
811 if (failed)
812 return;
813
814 failed = true;
815
816 msg = ralloc_vasprintf(mem_ctx, format, va);
817 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
818
819 this->fail_msg = msg;
820
821 if (debug_enabled) {
822 fprintf(stderr, "%s", msg);
823 }
824 }
825
826 void
827 fs_visitor::fail(const char *format, ...)
828 {
829 va_list va;
830
831 va_start(va, format);
832 vfail(format, va);
833 va_end(va);
834 }
835
836 /**
837 * Mark this program as impossible to compile in SIMD16 mode.
838 *
839 * During the SIMD8 compile (which happens first), we can detect and flag
840 * things that are unsupported in SIMD16 mode, so the compiler can skip
841 * the SIMD16 compile altogether.
842 *
843 * During a SIMD16 compile (if one happens anyway), this just calls fail().
844 */
845 void
846 fs_visitor::no16(const char *format, ...)
847 {
848 va_list va;
849
850 va_start(va, format);
851
852 if (dispatch_width == 16) {
853 vfail(format, va);
854 } else {
855 simd16_unsupported = true;
856
857 if (brw->perf_debug) {
858 if (no16_msg)
859 ralloc_vasprintf_append(&no16_msg, format, va);
860 else
861 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
862 }
863 }
864
865 va_end(va);
866 }
867
868 fs_inst *
869 fs_visitor::emit(enum opcode opcode)
870 {
871 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
872 }
873
874 fs_inst *
875 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
876 {
877 return emit(new(mem_ctx) fs_inst(opcode, dst));
878 }
879
880 fs_inst *
881 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
882 {
883 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
884 }
885
886 fs_inst *
887 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
888 const fs_reg &src1)
889 {
890 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
891 }
892
893 fs_inst *
894 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
895 const fs_reg &src1, const fs_reg &src2)
896 {
897 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
898 }
899
900 fs_inst *
901 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
902 fs_reg src[], int sources)
903 {
904 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
905 }
906
907 /**
908 * Returns true if the instruction has a flag that means it won't
909 * update an entire destination register.
910 *
911 * For example, dead code elimination and live variable analysis want to know
912 * when a write to a variable screens off any preceding values that were in
913 * it.
914 */
915 bool
916 fs_inst::is_partial_write() const
917 {
918 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
919 (this->dst.width * type_sz(this->dst.type)) < 32 ||
920 !this->dst.is_contiguous());
921 }
922
923 int
924 fs_inst::regs_read(int arg) const
925 {
926 if (is_tex() && arg == 0 && src[0].file == GRF) {
927 return mlen;
928 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
929 return mlen;
930 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
931 return mlen;
932 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
933 return mlen;
934 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
935 return mlen;
936 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
937 return mlen;
938 }
939
940 switch (src[arg].file) {
941 case BAD_FILE:
942 case UNIFORM:
943 case IMM:
944 return 1;
945 case GRF:
946 case HW_REG:
947 if (src[arg].stride == 0) {
948 return 1;
949 } else {
950 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
951 return (size + 31) / 32;
952 }
953 case MRF:
954 unreachable("MRF registers are not allowed as sources");
955 default:
956 unreachable("Invalid register file");
957 }
958 }
959
960 bool
961 fs_inst::reads_flag() const
962 {
963 return predicate;
964 }
965
966 bool
967 fs_inst::writes_flag() const
968 {
969 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
970 opcode != BRW_OPCODE_IF &&
971 opcode != BRW_OPCODE_WHILE)) ||
972 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
973 }
974
975 /**
976 * Returns how many MRFs an FS opcode will write over.
977 *
978 * Note that this is not the 0 or 1 implied writes in an actual gen
979 * instruction -- the FS opcodes often generate MOVs in addition.
980 */
981 int
982 fs_visitor::implied_mrf_writes(fs_inst *inst)
983 {
984 if (inst->mlen == 0)
985 return 0;
986
987 if (inst->base_mrf == -1)
988 return 0;
989
990 switch (inst->opcode) {
991 case SHADER_OPCODE_RCP:
992 case SHADER_OPCODE_RSQ:
993 case SHADER_OPCODE_SQRT:
994 case SHADER_OPCODE_EXP2:
995 case SHADER_OPCODE_LOG2:
996 case SHADER_OPCODE_SIN:
997 case SHADER_OPCODE_COS:
998 return 1 * dispatch_width / 8;
999 case SHADER_OPCODE_POW:
1000 case SHADER_OPCODE_INT_QUOTIENT:
1001 case SHADER_OPCODE_INT_REMAINDER:
1002 return 2 * dispatch_width / 8;
1003 case SHADER_OPCODE_TEX:
1004 case FS_OPCODE_TXB:
1005 case SHADER_OPCODE_TXD:
1006 case SHADER_OPCODE_TXF:
1007 case SHADER_OPCODE_TXF_CMS:
1008 case SHADER_OPCODE_TXF_MCS:
1009 case SHADER_OPCODE_TG4:
1010 case SHADER_OPCODE_TG4_OFFSET:
1011 case SHADER_OPCODE_TXL:
1012 case SHADER_OPCODE_TXS:
1013 case SHADER_OPCODE_LOD:
1014 return 1;
1015 case FS_OPCODE_FB_WRITE:
1016 return 2;
1017 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1018 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1019 return 1;
1020 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1021 return inst->mlen;
1022 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1023 return 2;
1024 case SHADER_OPCODE_UNTYPED_ATOMIC:
1025 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1026 case SHADER_OPCODE_URB_WRITE_SIMD8:
1027 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1028 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1029 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1030 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1031 return 0;
1032 default:
1033 unreachable("not reached");
1034 }
1035 }
1036
1037 fs_reg
1038 fs_visitor::vgrf(const glsl_type *const type)
1039 {
1040 int reg_width = dispatch_width / 8;
1041 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1042 brw_type_for_base_type(type), dispatch_width);
1043 }
1044
1045 fs_reg
1046 fs_visitor::vgrf(int num_components)
1047 {
1048 int reg_width = dispatch_width / 8;
1049 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1050 BRW_REGISTER_TYPE_F, dispatch_width);
1051 }
1052
1053 /** Fixed HW reg constructor. */
1054 fs_reg::fs_reg(enum register_file file, int reg)
1055 {
1056 init();
1057 this->file = file;
1058 this->reg = reg;
1059 this->type = BRW_REGISTER_TYPE_F;
1060
1061 switch (file) {
1062 case UNIFORM:
1063 this->width = 1;
1064 break;
1065 default:
1066 this->width = 8;
1067 }
1068 }
1069
1070 /** Fixed HW reg constructor. */
1071 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1072 {
1073 init();
1074 this->file = file;
1075 this->reg = reg;
1076 this->type = type;
1077
1078 switch (file) {
1079 case UNIFORM:
1080 this->width = 1;
1081 break;
1082 default:
1083 this->width = 8;
1084 }
1085 }
1086
1087 /** Fixed HW reg constructor. */
1088 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1089 uint8_t width)
1090 {
1091 init();
1092 this->file = file;
1093 this->reg = reg;
1094 this->type = type;
1095 this->width = width;
1096 }
1097
1098 fs_reg *
1099 fs_visitor::variable_storage(ir_variable *var)
1100 {
1101 return (fs_reg *)hash_table_find(this->variable_ht, var);
1102 }
1103
1104 void
1105 import_uniforms_callback(const void *key,
1106 void *data,
1107 void *closure)
1108 {
1109 struct hash_table *dst_ht = (struct hash_table *)closure;
1110 const fs_reg *reg = (const fs_reg *)data;
1111
1112 if (reg->file != UNIFORM)
1113 return;
1114
1115 hash_table_insert(dst_ht, data, key);
1116 }
1117
 1118 /* For SIMD16, we need to follow the uniform setup of SIMD8 dispatch.
 1119  * This brings in those uniform definitions.
1120 */
1121 void
1122 fs_visitor::import_uniforms(fs_visitor *v)
1123 {
1124 hash_table_call_foreach(v->variable_ht,
1125 import_uniforms_callback,
1126 variable_ht);
1127 this->push_constant_loc = v->push_constant_loc;
1128 this->pull_constant_loc = v->pull_constant_loc;
1129 this->uniforms = v->uniforms;
1130 this->param_size = v->param_size;
1131 }
1132
1133 /* Our support for uniforms is piggy-backed on the struct
1134 * gl_fragment_program, because that's where the values actually
1135 * get stored, rather than in some global gl_shader_program uniform
1136 * store.
1137 */
1138 void
1139 fs_visitor::setup_uniform_values(ir_variable *ir)
1140 {
1141 int namelen = strlen(ir->name);
1142
1143 /* The data for our (non-builtin) uniforms is stored in a series of
1144 * gl_uniform_driver_storage structs for each subcomponent that
1145 * glGetUniformLocation() could name. We know it's been set up in the same
1146 * order we'd walk the type, so walk the list of storage and find anything
1147 * with our name, or the prefix of a component that starts with our name.
1148 */
1149 unsigned params_before = uniforms;
1150 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1151 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1152
1153 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1154 (storage->name[namelen] != 0 &&
1155 storage->name[namelen] != '.' &&
1156 storage->name[namelen] != '[')) {
1157 continue;
1158 }
1159
1160 unsigned slots = storage->type->component_slots();
1161 if (storage->array_elements)
1162 slots *= storage->array_elements;
1163
1164 for (unsigned i = 0; i < slots; i++) {
1165 stage_prog_data->param[uniforms++] = &storage->storage[i];
1166 }
1167 }
1168
1169 /* Make sure we actually initialized the right amount of stuff here. */
1170 assert(params_before + ir->type->component_slots() == uniforms);
1171 (void)params_before;
1172 }
1173
1174
1175 /* Our support for builtin uniforms is even scarier than non-builtin.
1176 * It sits on top of the PROG_STATE_VAR parameters that are
1177 * automatically updated from GL context state.
1178 */
1179 void
1180 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1181 {
1182 const ir_state_slot *const slots = ir->get_state_slots();
1183 assert(slots != NULL);
1184
1185 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1186 /* This state reference has already been setup by ir_to_mesa, but we'll
 1187       /* This state reference has already been set up by ir_to_mesa, but we'll
1188 */
1189 int index = _mesa_add_state_reference(this->prog->Parameters,
1190 (gl_state_index *)slots[i].tokens);
1191
1192 /* Add each of the unique swizzles of the element as a parameter.
1193 * This'll end up matching the expected layout of the
1194 * array/matrix/structure we're trying to fill in.
1195 */
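      /* For example, an XYZW swizzle adds four parameters here, while a
       * scalar value replicated as XXXX adds only one, since the loop
       * below stops as soon as a component repeats the previous one.
       */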
1196 int last_swiz = -1;
1197 for (unsigned int j = 0; j < 4; j++) {
1198 int swiz = GET_SWZ(slots[i].swizzle, j);
1199 if (swiz == last_swiz)
1200 break;
1201 last_swiz = swiz;
1202
1203 stage_prog_data->param[uniforms++] =
1204 &prog->Parameters->ParameterValues[index][swiz];
1205 }
1206 }
1207 }
1208
1209 fs_reg *
1210 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1211 bool origin_upper_left)
1212 {
1213 assert(stage == MESA_SHADER_FRAGMENT);
1214 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1215 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1216 fs_reg wpos = *reg;
1217 bool flip = !origin_upper_left ^ key->render_to_fbo;
1218
1219 /* gl_FragCoord.x */
1220 if (pixel_center_integer) {
1221 emit(MOV(wpos, this->pixel_x));
1222 } else {
1223 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1224 }
1225 wpos = offset(wpos, 1);
1226
1227 /* gl_FragCoord.y */
1228 if (!flip && pixel_center_integer) {
1229 emit(MOV(wpos, this->pixel_y));
1230 } else {
1231 fs_reg pixel_y = this->pixel_y;
1232 float offset = (pixel_center_integer ? 0.0 : 0.5);
1233
1234 if (flip) {
1235 pixel_y.negate = true;
1236 offset += key->drawable_height - 1.0;
1237 }
1238
1239 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1240 }
1241 wpos = offset(wpos, 1);
1242
1243 /* gl_FragCoord.z */
1244 if (brw->gen >= 6) {
1245 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1246 } else {
1247 emit(FS_OPCODE_LINTERP, wpos,
1248 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1249 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1250 interp_reg(VARYING_SLOT_POS, 2));
1251 }
1252 wpos = offset(wpos, 1);
1253
1254 /* gl_FragCoord.w: Already set up in emit_interpolation */
1255 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1256
1257 return reg;
1258 }
1259
1260 fs_inst *
1261 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1262 glsl_interp_qualifier interpolation_mode,
1263 bool is_centroid, bool is_sample)
1264 {
1265 brw_wm_barycentric_interp_mode barycoord_mode;
1266 if (brw->gen >= 6) {
1267 if (is_centroid) {
1268 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1269 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1270 else
1271 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1272 } else if (is_sample) {
1273 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1274 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1275 else
1276 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1277 } else {
1278 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1279 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1280 else
1281 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1282 }
1283 } else {
1284 /* On Ironlake and below, there is only one interpolation mode.
1285 * Centroid interpolation doesn't mean anything on this hardware --
1286 * there is no multisampling.
1287 */
1288 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1289 }
1290 return emit(FS_OPCODE_LINTERP, attr,
1291 this->delta_x[barycoord_mode],
1292 this->delta_y[barycoord_mode], interp);
1293 }
1294
1295 void
1296 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1297 const glsl_type *type,
1298 glsl_interp_qualifier interpolation_mode,
1299 int location, bool mod_centroid,
1300 bool mod_sample)
1301 {
1302 attr.type = brw_type_for_base_type(type->get_scalar_type());
1303
1304 assert(stage == MESA_SHADER_FRAGMENT);
1305 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1306 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1307
1308 unsigned int array_elements;
1309
1310 if (type->is_array()) {
1311 array_elements = type->length;
1312 if (array_elements == 0) {
1313 fail("dereferenced array '%s' has length 0\n", name);
1314 }
1315 type = type->fields.array;
1316 } else {
1317 array_elements = 1;
1318 }
1319
1320 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1321 bool is_gl_Color =
1322 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1323 if (key->flat_shade && is_gl_Color) {
1324 interpolation_mode = INTERP_QUALIFIER_FLAT;
1325 } else {
1326 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1327 }
1328 }
1329
1330 for (unsigned int i = 0; i < array_elements; i++) {
1331 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1332 if (prog_data->urb_setup[location] == -1) {
1333 /* If there's no incoming setup data for this slot, don't
1334 * emit interpolation for it.
1335 */
1336 attr = offset(attr, type->vector_elements);
1337 location++;
1338 continue;
1339 }
1340
1341 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1342 /* Constant interpolation (flat shading) case. The SF has
1343 * handed us defined values in only the constant offset
1344 * field of the setup reg.
1345 */
1346 for (unsigned int k = 0; k < type->vector_elements; k++) {
1347 struct brw_reg interp = interp_reg(location, k);
1348 interp = suboffset(interp, 3);
1349 interp.type = attr.type;
1350 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1351 attr = offset(attr, 1);
1352 }
1353 } else {
1354 /* Smooth/noperspective interpolation case. */
1355 for (unsigned int k = 0; k < type->vector_elements; k++) {
1356 struct brw_reg interp = interp_reg(location, k);
1357 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1358 /* Get the pixel/sample mask into f0 so that we know
1359 * which pixels are lit. Then, for each channel that is
1360 * unlit, replace the centroid data with non-centroid
1361 * data.
1362 */
1363 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1364
1365 fs_inst *inst;
1366 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1367 false, false);
1368 inst->predicate = BRW_PREDICATE_NORMAL;
1369 inst->predicate_inverse = true;
1370 if (brw->has_pln)
1371 inst->no_dd_clear = true;
1372
1373 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1374 mod_centroid && !key->persample_shading,
1375 mod_sample || key->persample_shading);
1376 inst->predicate = BRW_PREDICATE_NORMAL;
1377 inst->predicate_inverse = false;
1378 if (brw->has_pln)
1379 inst->no_dd_check = true;
1380
1381 } else {
1382 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1383 mod_centroid && !key->persample_shading,
1384 mod_sample || key->persample_shading);
1385 }
1386 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1387 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1388 }
1389 attr = offset(attr, 1);
1390 }
1391
1392 }
1393 location++;
1394 }
1395 }
1396 }
1397
1398 fs_reg *
1399 fs_visitor::emit_frontfacing_interpolation()
1400 {
1401 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1402
1403 if (brw->gen >= 6) {
1404 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1405 * a boolean result from this (~0/true or 0/false).
1406 *
1407 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1408 * this task in only one instruction:
1409 * - a negation source modifier will flip the bit; and
1410 * - a W -> D type conversion will sign extend the bit into the high
1411 * word of the destination.
1412 *
1413 * An ASR 15 fills the low word of the destination.
1414 */
1415 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1416 g0.negate = true;
1417
1418 emit(ASR(*reg, g0, fs_reg(15)));
1419 } else {
1420 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1421 * a boolean result from this (1/true or 0/false).
1422 *
1423 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1424 * the negation source modifier to flip it. Unfortunately the SHR
1425 * instruction only operates on UD (or D with an abs source modifier)
1426 * sources without negation.
1427 *
1428 * Instead, use ASR (which will give ~0/true or 0/false).
1429 */
1430 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1431 g1_6.negate = true;
1432
1433 emit(ASR(*reg, g1_6, fs_reg(31)));
1434 }
1435
1436 return reg;
1437 }
1438
1439 void
1440 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1441 {
1442 assert(stage == MESA_SHADER_FRAGMENT);
1443 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1444 assert(dst.type == BRW_REGISTER_TYPE_F);
1445
1446 if (key->compute_pos_offset) {
1447 /* Convert int_sample_pos to floating point */
1448 emit(MOV(dst, int_sample_pos));
1449 /* Scale to the range [0, 1] */
1450 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1451 }
1452 else {
1453 /* From ARB_sample_shading specification:
1454 * "When rendering to a non-multisample buffer, or if multisample
1455 * rasterization is disabled, gl_SamplePosition will always be
 1456      *  (0.5, 0.5)."
1457 */
1458 emit(MOV(dst, fs_reg(0.5f)));
1459 }
1460 }
1461
1462 fs_reg *
1463 fs_visitor::emit_samplepos_setup()
1464 {
1465 assert(brw->gen >= 6);
1466
1467 this->current_annotation = "compute sample position";
1468 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1469 fs_reg pos = *reg;
1470 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1471 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1472
1473 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1474 * mode will be enabled.
1475 *
1476 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1477 * R31.1:0 Position Offset X/Y for Slot[3:0]
1478 * R31.3:2 Position Offset X/Y for Slot[7:4]
1479 * .....
1480 *
1481 * The X, Y sample positions come in as bytes in thread payload. So, read
1482 * the positions using vstride=16, width=8, hstride=2.
1483 */
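   /* The payload bytes are interleaved X0, Y0, X1, Y1, ..., so the
    * hstride=2 region below reads the X offsets, and the same region at
    * suboffset 1 (used further down) reads the Y offsets.
    */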
1484 struct brw_reg sample_pos_reg =
1485 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1486 BRW_REGISTER_TYPE_B), 16, 8, 2);
1487
1488 if (dispatch_width == 8) {
1489 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1490 } else {
1491 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1492 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1493 ->force_sechalf = true;
1494 }
1495 /* Compute gl_SamplePosition.x */
1496 compute_sample_position(pos, int_sample_x);
1497 pos = offset(pos, 1);
1498 if (dispatch_width == 8) {
1499 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1500 } else {
1501 emit(MOV(half(int_sample_y, 0),
1502 fs_reg(suboffset(sample_pos_reg, 1))));
1503 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1504 ->force_sechalf = true;
1505 }
1506 /* Compute gl_SamplePosition.y */
1507 compute_sample_position(pos, int_sample_y);
1508 return reg;
1509 }
1510
1511 fs_reg *
1512 fs_visitor::emit_sampleid_setup()
1513 {
1514 assert(stage == MESA_SHADER_FRAGMENT);
1515 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1516 assert(brw->gen >= 6);
1517
1518 this->current_annotation = "compute sample id";
1519 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1520
1521 if (key->compute_sample_id) {
1522 fs_reg t1 = vgrf(glsl_type::int_type);
1523 fs_reg t2 = vgrf(glsl_type::int_type);
1524 t2.type = BRW_REGISTER_TYPE_UW;
1525
1526 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1527 * 8x multisampling, subspan 0 will represent sample N (where N
1528 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1529 * 7. We can find the value of N by looking at R0.0 bits 7:6
1530 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1531 * (since samples are always delivered in pairs). That is, we
1532 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1533 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1534 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1535 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1536 * populating a temporary variable with the sequence (0, 1, 2, 3),
1537 * and then reading from it using vstride=1, width=4, hstride=0.
1538 * These computations hold good for 4x multisampling as well.
1539 *
1540 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1541 * the first four slots are sample 0 of subspan 0; the next four
1542 * are sample 1 of subspan 0; the third group is sample 0 of
1543 * subspan 1, and finally sample 1 of subspan 1.
1544 */
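      /* As a concrete example: if SSPI (R0.0 bits 7:6) is 0b10, then
       * (R0.0 & 0xc0) >> 5 == 4, and adding the (0, 0, 0, 0, 1, 1, 1, 1)
       * sequence gives sample IDs 4, 4, 4, 4, 5, 5, 5, 5 in SIMD8.
       */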
1545 fs_inst *inst;
1546 inst = emit(BRW_OPCODE_AND, t1,
1547 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1548 fs_reg(0xc0));
1549 inst->force_writemask_all = true;
1550 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1551 inst->force_writemask_all = true;
1552 /* This works for both SIMD8 and SIMD16 */
1553 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1554 inst->force_writemask_all = true;
1555 /* This special instruction takes care of setting vstride=1,
1556 * width=4, hstride=0 of t2 during an ADD instruction.
1557 */
1558 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1559 } else {
1560 /* As per GL_ARB_sample_shading specification:
1561 * "When rendering to a non-multisample buffer, or if multisample
1562 * rasterization is disabled, gl_SampleID will always be zero."
1563 */
1564 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1565 }
1566
1567 return reg;
1568 }
1569
1570 void
1571 fs_visitor::resolve_source_modifiers(fs_reg *src)
1572 {
1573 if (!src->abs && !src->negate)
1574 return;
1575
1576 fs_reg temp = retype(vgrf(1), src->type);
1577 emit(MOV(temp, *src));
1578 *src = temp;
1579 }
1580
1581 fs_reg
1582 fs_visitor::fix_math_operand(fs_reg src)
1583 {
1584 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1585 * might be able to do better by doing execsize = 1 math and then
1586 * expanding that result out, but we would need to be careful with
1587 * masking.
1588 *
1589 * The hardware ignores source modifiers (negate and abs) on math
1590 * instructions, so we also move to a temp to set those up.
1591 */
1592 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1593 !src.abs && !src.negate)
1594 return src;
1595
1596 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1597 * operands to math
1598 */
1599 if (brw->gen >= 7 && src.file != IMM)
1600 return src;
1601
1602 fs_reg expanded = vgrf(glsl_type::float_type);
1603 expanded.type = src.type;
1604 emit(BRW_OPCODE_MOV, expanded, src);
1605 return expanded;
1606 }
1607
1608 fs_inst *
1609 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1610 {
1611 switch (opcode) {
1612 case SHADER_OPCODE_RCP:
1613 case SHADER_OPCODE_RSQ:
1614 case SHADER_OPCODE_SQRT:
1615 case SHADER_OPCODE_EXP2:
1616 case SHADER_OPCODE_LOG2:
1617 case SHADER_OPCODE_SIN:
1618 case SHADER_OPCODE_COS:
1619 break;
1620 default:
1621 unreachable("not reached: bad math opcode");
1622 }
1623
1624 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1625 * might be able to do better by doing execsize = 1 math and then
1626 * expanding that result out, but we would need to be careful with
1627 * masking.
1628 *
1629 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1630 * instructions, so we also move to a temp to set those up.
1631 */
1632 if (brw->gen == 6 || brw->gen == 7)
1633 src = fix_math_operand(src);
1634
1635 fs_inst *inst = emit(opcode, dst, src);
1636
1637 if (brw->gen < 6) {
1638 inst->base_mrf = 2;
1639 inst->mlen = dispatch_width / 8;
1640 }
1641
1642 return inst;
1643 }
1644
1645 fs_inst *
1646 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1647 {
1648 int base_mrf = 2;
1649 fs_inst *inst;
1650
1651 if (brw->gen >= 8) {
1652 inst = emit(opcode, dst, src0, src1);
1653 } else if (brw->gen >= 6) {
1654 src0 = fix_math_operand(src0);
1655 src1 = fix_math_operand(src1);
1656
1657 inst = emit(opcode, dst, src0, src1);
1658 } else {
1659 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1660 * "Message Payload":
1661 *
1662 * "Operand0[7]. For the INT DIV functions, this operand is the
1663 * denominator."
1664 * ...
1665 * "Operand1[7]. For the INT DIV functions, this operand is the
1666 * numerator."
1667 */
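      /* For the INT DIV opcodes, where dst = src0 / src1, src1 (the
       * denominator) therefore has to go out as Operand0 and src0 (the
       * numerator) as Operand1, which is what the swap below arranges.
       */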
1668 bool is_int_div = opcode != SHADER_OPCODE_POW;
1669 fs_reg &op0 = is_int_div ? src1 : src0;
1670 fs_reg &op1 = is_int_div ? src0 : src1;
1671
1672 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1673 inst = emit(opcode, dst, op0, reg_null_f);
1674
1675 inst->base_mrf = base_mrf;
1676 inst->mlen = 2 * dispatch_width / 8;
1677 }
1678 return inst;
1679 }
1680
1681 void
1682 fs_visitor::assign_curb_setup()
1683 {
1684 if (dispatch_width == 8) {
1685 prog_data->dispatch_grf_start_reg = payload.num_regs;
1686 } else {
1687 assert(stage == MESA_SHADER_FRAGMENT);
1688 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1689 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1690 }
1691
1692 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1693
1694 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1695 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1696 for (unsigned int i = 0; i < inst->sources; i++) {
1697 if (inst->src[i].file == UNIFORM) {
1698 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1699 int constant_nr;
1700 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1701 constant_nr = push_constant_loc[uniform_nr];
1702 } else {
1703 /* Section 5.11 of the OpenGL 4.1 spec says:
1704 * "Out-of-bounds reads return undefined values, which include
1705 * values from other variables of the active program or zero."
1706 * Just return the first push constant.
1707 */
1708 constant_nr = 0;
1709 }
1710
1711 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1712 constant_nr / 8,
1713 constant_nr % 8);
1714
1715 inst->src[i].file = HW_REG;
1716 inst->src[i].fixed_hw_reg = byte_offset(
1717 retype(brw_reg, inst->src[i].type),
1718 inst->src[i].subreg_offset);
1719 }
1720 }
1721 }
1722 }
1723
1724 void
1725 fs_visitor::calculate_urb_setup()
1726 {
1727 assert(stage == MESA_SHADER_FRAGMENT);
1728 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1729 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1730
1731 memset(prog_data->urb_setup, -1,
1732 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1733
1734 int urb_next = 0;
1735 /* Figure out where each of the incoming setup attributes lands. */
1736 if (brw->gen >= 6) {
1737 if (_mesa_bitcount_64(prog->InputsRead &
1738 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1739 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1740 * first 16 varying inputs, so we can put them wherever we want.
1741 * Just put them in order.
1742 *
1743 * This is useful because it means that (a) inputs not used by the
1744 * fragment shader won't take up valuable register space, and (b) we
1745 * won't have to recompile the fragment shader if it gets paired with
1746 * a different vertex (or geometry) shader.
1747 */
1748 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1749 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1750 BITFIELD64_BIT(i)) {
1751 prog_data->urb_setup[i] = urb_next++;
1752 }
1753 }
1754 } else {
1755 /* We have enough input varyings that the SF/SBE pipeline stage can't
1756 * arbitrarily rearrange them to suit our whim; we have to put them
1757 * in an order that matches the output of the previous pipeline stage
1758 * (geometry or vertex shader).
1759 */
1760 struct brw_vue_map prev_stage_vue_map;
1761 brw_compute_vue_map(brw, &prev_stage_vue_map,
1762 key->input_slots_valid);
1763 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1764 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1765 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1766 slot++) {
1767 int varying = prev_stage_vue_map.slot_to_varying[slot];
1768 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1769 * unused.
1770 */
1771 if (varying != BRW_VARYING_SLOT_COUNT &&
1772 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1773 BITFIELD64_BIT(varying))) {
1774 prog_data->urb_setup[varying] = slot - first_slot;
1775 }
1776 }
1777 urb_next = prev_stage_vue_map.num_slots - first_slot;
1778 }
1779 } else {
1780 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1781 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1782 /* Point size is packed into the header, not as a general attribute */
1783 if (i == VARYING_SLOT_PSIZ)
1784 continue;
1785
1786 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1787 /* The back color slot is skipped when the front color is
1788 * also written to. In addition, some slots can be
1789 * written in the vertex shader and not read in the
1790 * fragment shader. So the register number must always be
1791 * incremented, mapped or not.
1792 */
1793 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1794 prog_data->urb_setup[i] = urb_next;
1795 urb_next++;
1796 }
1797 }
1798
1799 /*
 1800     * It's an FS-only attribute, and we did interpolation for this attribute
 1801     * in the SF thread. So, count it here, too.
1802 *
1803 * See compile_sf_prog() for more info.
1804 */
1805 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1806 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1807 }
1808
1809 prog_data->num_varying_inputs = urb_next;
1810 }
1811
1812 void
1813 fs_visitor::assign_urb_setup()
1814 {
1815 assert(stage == MESA_SHADER_FRAGMENT);
1816 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1817
1818 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1819
1820 /* Offset all the urb_setup[] index by the actual position of the
1821 * setup regs, now that the location of the constants has been chosen.
1822 */
1823 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1824 if (inst->opcode == FS_OPCODE_LINTERP) {
1825 assert(inst->src[2].file == HW_REG);
1826 inst->src[2].fixed_hw_reg.nr += urb_start;
1827 }
1828
1829 if (inst->opcode == FS_OPCODE_CINTERP) {
1830 assert(inst->src[0].file == HW_REG);
1831 inst->src[0].fixed_hw_reg.nr += urb_start;
1832 }
1833 }
1834
1835 /* Each attribute is 4 setup channels, each of which is half a reg. */
1836 this->first_non_payload_grf =
1837 urb_start + prog_data->num_varying_inputs * 2;
1838 }
1839
1840 void
1841 fs_visitor::assign_vs_urb_setup()
1842 {
1843 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1844 int grf, count, slot, channel, attr;
1845
1846 assert(stage == MESA_SHADER_VERTEX);
1847 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1848 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1849 count++;
1850
1851 /* Each attribute is 4 regs. */
1852 this->first_non_payload_grf =
1853 payload.num_regs + prog_data->curb_read_length + count * 4;
1854
1855 unsigned vue_entries =
1856 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1857
1858 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1859 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1860
1861 assert(vs_prog_data->base.urb_read_length <= 15);
1862
1863 /* Rewrite all ATTR file references to the hw grf that they land in. */
1864 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1865 for (int i = 0; i < inst->sources; i++) {
1866 if (inst->src[i].file == ATTR) {
1867
1868 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1869 slot = count - 1;
1870 } else {
 1871             /* Attributes come in a contiguous block, ordered by their
1872 * gl_vert_attrib value. That means we can compute the slot
1873 * number for an attribute by masking out the enabled
1874 * attributes before it and counting the bits.
1875 */
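            /* For example, with inputs_read == 0b1011 and attr == 3, the
             * mask is 0b111, the masked value is 0b011, and the popcount
             * gives slot 2.
             */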
1876 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1877 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1878 BITFIELD64_MASK(attr));
1879 }
1880
1881 channel = inst->src[i].reg_offset & 3;
1882
1883 grf = payload.num_regs +
1884 prog_data->curb_read_length +
1885 slot * 4 + channel;
1886
1887 inst->src[i].file = HW_REG;
1888 inst->src[i].fixed_hw_reg =
1889 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1890 }
1891 }
1892 }
1893 }
1894
1895 /**
1896 * Split large virtual GRFs into separate components if we can.
1897 *
1898 * This is mostly duplicated with what brw_fs_vector_splitting does,
1899 * but that's really conservative because it's afraid of doing
1900 * splitting that doesn't result in real progress after the rest of
1901 * the optimization phases, which would cause infinite looping in
1902 * optimization. We can do it once here, safely. This also has the
1903 * opportunity to split interpolated values, or maybe even uniforms,
1904 * which we don't have at the IR level.
1905 *
1906 * We want to split, because virtual GRFs are what we register
1907 * allocate and spill (due to contiguousness requirements for some
1908 * instructions), and they're what we naturally generate in the
1909 * codegen process, but most virtual GRFs don't actually need to be
1910 * contiguous sets of GRFs. If we split, we'll end up with reduced
1911 * live intervals and better dead code elimination and coalescing.
1912 */
1913 void
1914 fs_visitor::split_virtual_grfs()
1915 {
1916 int num_vars = this->alloc.count;
1917
1918 /* Count the total number of registers */
1919 int reg_count = 0;
1920 int vgrf_to_reg[num_vars];
1921 for (int i = 0; i < num_vars; i++) {
1922 vgrf_to_reg[i] = reg_count;
1923 reg_count += alloc.sizes[i];
1924 }
1925
1926 /* An array of "split points". For each register slot, this indicates
1927 * if this slot can be separated from the previous slot. Every time an
1928 * instruction uses multiple elements of a register (as a source or
1929 * destination), we mark the used slots as inseparable. Then we go
1930 * through and split the registers into the smallest pieces we can.
1931 */
1932 bool split_points[reg_count];
1933 memset(split_points, 0, sizeof(split_points));
1934
1935 /* Mark all used registers as fully splittable */
1936 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1937 if (inst->dst.file == GRF) {
1938 int reg = vgrf_to_reg[inst->dst.reg];
1939 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1940 split_points[reg + j] = true;
1941 }
1942
1943 for (int i = 0; i < inst->sources; i++) {
1944 if (inst->src[i].file == GRF) {
1945 int reg = vgrf_to_reg[inst->src[i].reg];
1946 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1947 split_points[reg + j] = true;
1948 }
1949 }
1950 }
1951
1952 if (brw->has_pln &&
1953 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1954 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1955 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1956 * Gen6, that was the only supported interpolation mode, and since Gen6,
1957 * delta_x and delta_y are in fixed hardware registers.
1958 */
1959 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1960 split_points[vgrf_to_reg[vgrf] + 1] = false;
1961 }
1962
1963 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1964 if (inst->dst.file == GRF) {
1965 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1966 for (int j = 1; j < inst->regs_written; j++)
1967 split_points[reg + j] = false;
1968 }
1969 for (int i = 0; i < inst->sources; i++) {
1970 if (inst->src[i].file == GRF) {
1971 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1972 for (int j = 1; j < inst->regs_read(i); j++)
1973 split_points[reg + j] = false;
1974 }
1975 }
1976 }
1977
1978 int new_virtual_grf[reg_count];
1979 int new_reg_offset[reg_count];
1980
1981 int reg = 0;
1982 for (int i = 0; i < num_vars; i++) {
1983 /* The first one should always be 0 as a quick sanity check. */
1984 assert(split_points[reg] == false);
1985
1986 /* j = 0 case */
1987 new_reg_offset[reg] = 0;
1988 reg++;
1989 int offset = 1;
1990
1991 /* j > 0 case */
1992 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1993 /* If this is a split point, reset the offset to 0 and allocate a
1994 * new virtual GRF for the previous `offset` registers
1995 */
1996 if (split_points[reg]) {
1997 assert(offset <= MAX_VGRF_SIZE);
1998 int grf = alloc.allocate(offset);
1999 for (int k = reg - offset; k < reg; k++)
2000 new_virtual_grf[k] = grf;
2001 offset = 0;
2002 }
2003 new_reg_offset[reg] = offset;
2004 offset++;
2005 reg++;
2006 }
2007
2008 /* The last one gets the original register number */
2009 assert(offset <= MAX_VGRF_SIZE);
2010 alloc.sizes[i] = offset;
2011 for (int k = reg - offset; k < reg; k++)
2012 new_virtual_grf[k] = i;
2013 }
2014 assert(reg == reg_count);
2015
2016 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2017 if (inst->dst.file == GRF) {
2018 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2019 inst->dst.reg = new_virtual_grf[reg];
2020 inst->dst.reg_offset = new_reg_offset[reg];
2021 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2022 }
2023 for (int i = 0; i < inst->sources; i++) {
2024 if (inst->src[i].file == GRF) {
2025 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2026 inst->src[i].reg = new_virtual_grf[reg];
2027 inst->src[i].reg_offset = new_reg_offset[reg];
2028 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2029 }
2030 }
2031 }
2032 invalidate_live_intervals();
2033 }
2034
2035 /**
2036 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2037 *
2038 * During code generation, we create tons of temporary variables, many of
2039 * which get immediately killed and are never used again. Yet, in later
2040 * optimization and analysis passes, such as compute_live_intervals, we need
2041 * to loop over all the virtual GRFs. Compacting them can save a lot of
2042 * overhead.
2043 */
2044 bool
2045 fs_visitor::compact_virtual_grfs()
2046 {
2047 bool progress = false;
2048 int remap_table[this->alloc.count];
2049 memset(remap_table, -1, sizeof(remap_table));
2050
2051 /* Mark which virtual GRFs are used. */
2052 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2053 if (inst->dst.file == GRF)
2054 remap_table[inst->dst.reg] = 0;
2055
2056 for (int i = 0; i < inst->sources; i++) {
2057 if (inst->src[i].file == GRF)
2058 remap_table[inst->src[i].reg] = 0;
2059 }
2060 }
2061
2062 /* Compact the GRF arrays. */
2063 int new_index = 0;
2064 for (unsigned i = 0; i < this->alloc.count; i++) {
2065 if (remap_table[i] == -1) {
2066 /* We just found an unused register. This means that we are
2067 * actually going to compact something.
2068 */
2069 progress = true;
2070 } else {
2071 remap_table[i] = new_index;
2072 alloc.sizes[new_index] = alloc.sizes[i];
2073 invalidate_live_intervals();
2074 ++new_index;
2075 }
2076 }
2077
2078 this->alloc.count = new_index;
2079
2080 /* Patch all the instructions to use the newly renumbered registers */
2081 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2082 if (inst->dst.file == GRF)
2083 inst->dst.reg = remap_table[inst->dst.reg];
2084
2085 for (int i = 0; i < inst->sources; i++) {
2086 if (inst->src[i].file == GRF)
2087 inst->src[i].reg = remap_table[inst->src[i].reg];
2088 }
2089 }
2090
2091 /* Patch all the references to delta_x/delta_y, since they're used in
2092 * register allocation. If they're unused, switch them to BAD_FILE so
2093 * we don't think some random VGRF is delta_x/delta_y.
2094 */
2095 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2096 if (delta_x[i].file == GRF) {
2097 if (remap_table[delta_x[i].reg] != -1) {
2098 delta_x[i].reg = remap_table[delta_x[i].reg];
2099 } else {
2100 delta_x[i].file = BAD_FILE;
2101 }
2102 }
2103 }
2104 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2105 if (delta_y[i].file == GRF) {
2106 if (remap_table[delta_y[i].reg] != -1) {
2107 delta_y[i].reg = remap_table[delta_y[i].reg];
2108 } else {
2109 delta_y[i].file = BAD_FILE;
2110 }
2111 }
2112 }
2113
2114 return progress;
2115 }
2116
2117 /*
2118 * Implements array access of uniforms by inserting a
2119 * PULL_CONSTANT_LOAD instruction.
2120 *
2121 * Unlike temporary GRF array access (which we don't support, due to
2122 * the difficulty of doing relative addressing on instruction
2123 * destinations), we could potentially do array access of uniforms
2124 * that were loaded in GRF space as push constants. In real-world
2125 * usage we've seen, though, the arrays being used are always larger
2126 * than we could load as push constants, so just always move all
2127 * uniform array access out to a pull constant buffer.
2128 */
2129 void
2130 fs_visitor::move_uniform_array_access_to_pull_constants()
2131 {
2132 if (dispatch_width != 8)
2133 return;
2134
2135 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2136 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2137
2138 /* Walk through and find array access of uniforms. Put a copy of that
2139 * uniform in the pull constant buffer.
2140 *
2141 * Note that we don't move constant-indexed accesses to arrays. No
2142 * testing has been done of the performance impact of this choice.
2143 */
2144 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2145 for (int i = 0 ; i < inst->sources; i++) {
2146 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2147 continue;
2148
2149 int uniform = inst->src[i].reg;
2150
2151 /* If this array isn't already present in the pull constant buffer,
2152 * add it.
2153 */
2154 if (pull_constant_loc[uniform] == -1) {
2155 const gl_constant_value **values = &stage_prog_data->param[uniform];
2156
2157 assert(param_size[uniform]);
2158
2159 for (int j = 0; j < param_size[uniform]; j++) {
2160 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2161
2162 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2163 values[j];
2164 }
2165 }
2166 }
2167 }
2168 }
2169
2170 /**
2171 * Assign UNIFORM file registers to either push constants or pull constants.
2172 *
2173 * We allow a fragment shader to have more than the GL minimum for the
2174 * maximum number of fragment shader uniform components (64). If there
2175 * are too many of these, they'd fill up all of the register space.
2176 * So, this will push some of them out to the pull constant buffer and
2177 * update the program to load them.
2178 */
2179 void
2180 fs_visitor::assign_constant_locations()
2181 {
2182 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2183 if (dispatch_width != 8)
2184 return;
2185
2186 /* Find which UNIFORM registers are still in use. */
2187 bool is_live[uniforms];
2188 for (unsigned int i = 0; i < uniforms; i++) {
2189 is_live[i] = false;
2190 }
2191
2192 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2193 for (int i = 0; i < inst->sources; i++) {
2194 if (inst->src[i].file != UNIFORM)
2195 continue;
2196
2197 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2198 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2199 is_live[constant_nr] = true;
2200 }
2201 }
2202
2203 /* Only allow 16 registers (128 uniform components) as push constants.
2204 *
2205 * Just demote the end of the list. We could probably do better
2206 * here, demoting things that are rarely used in the program first.
2207 *
2208 * If changing this value, note the limitation about total_regs in
2209 * brw_curbe.c.
2210 */
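/* For example, with 200 live, not-yet-pulled uniform components, the
 * first 128 stay in push constants and the remaining 72 are demoted
 * to the pull constant buffer below.
 */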
2211 unsigned int max_push_components = 16 * 8;
2212 unsigned int num_push_constants = 0;
2213
2214 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2215
2216 for (unsigned int i = 0; i < uniforms; i++) {
2217 if (!is_live[i] || pull_constant_loc[i] != -1) {
2218 /* This UNIFORM register is either dead, or has already been demoted
2219 * to a pull const. Mark it as no longer living in the param[] array.
2220 */
2221 push_constant_loc[i] = -1;
2222 continue;
2223 }
2224
2225 if (num_push_constants < max_push_components) {
2226 /* Retain as a push constant. Record the location in the param[]
2227 * array.
2228 */
2229 push_constant_loc[i] = num_push_constants++;
2230 } else {
2231 /* Demote to a pull constant. */
2232 push_constant_loc[i] = -1;
2233
2234 int pull_index = stage_prog_data->nr_pull_params++;
2235 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2236 pull_constant_loc[i] = pull_index;
2237 }
2238 }
2239
2240 stage_prog_data->nr_params = num_push_constants;
2241
2242 /* Up until now, the param[] array has been indexed by reg + reg_offset
2243 * of UNIFORM registers. Condense it to only contain the uniforms we
2244 * chose to upload as push constants.
2245 */
2246 for (unsigned int i = 0; i < uniforms; i++) {
2247 int remapped = push_constant_loc[i];
2248
2249 if (remapped == -1)
2250 continue;
2251
2252 assert(remapped <= (int)i);
2253 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2254 }
2255 }
2256
2257 /**
2258 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2259 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2260 */
2261 void
2262 fs_visitor::demote_pull_constants()
2263 {
2264 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2265 for (int i = 0; i < inst->sources; i++) {
2266 if (inst->src[i].file != UNIFORM)
2267 continue;
2268
2269 int pull_index = pull_constant_loc[inst->src[i].reg +
2270 inst->src[i].reg_offset];
2271 if (pull_index == -1)
2272 continue;
2273
2274 /* Set up the annotation tracking for newly generated instructions. */
2275 base_ir = inst->ir;
2276 current_annotation = inst->annotation;
2277
2278 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2279 fs_reg dst = vgrf(glsl_type::float_type);
2280
2281 /* Generate a pull load into dst. */
2282 if (inst->src[i].reladdr) {
2283 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2284 surf_index,
2285 *inst->src[i].reladdr,
2286 pull_index);
2287 inst->insert_before(block, &list);
2288 inst->src[i].reladdr = NULL;
2289 } else {
2290 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2291 fs_inst *pull =
2292 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2293 dst, surf_index, offset);
2294 inst->insert_before(block, pull);
2295 inst->src[i].set_smear(pull_index & 3);
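/* For example, pull_index == 6 gives a 16-byte offset (the second
 * vec4 in the pull constant buffer) and a smear of 2, so the source
 * reads the third dword of the loaded vec4.
 */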
2296 }
2297
2298 /* Rewrite the instruction to use the temporary VGRF. */
2299 inst->src[i].file = GRF;
2300 inst->src[i].reg = dst.reg;
2301 inst->src[i].reg_offset = 0;
2302 inst->src[i].width = dispatch_width;
2303 }
2304 }
2305 invalidate_live_intervals();
2306 }
2307
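/**
 * Perform local algebraic simplifications: fold saturates into immediate
 * MOVs, strength-reduce MUL/ADD/MAD/LRP/SEL/CMP/OR with trivial or immediate
 * operands, and turn an RCP of a SQRT result into a single RSQ.
 */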
2308 bool
2309 fs_visitor::opt_algebraic()
2310 {
2311 bool progress = false;
2312
2313 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2314 switch (inst->opcode) {
2315 case BRW_OPCODE_MOV:
2316 if (inst->src[0].file != IMM)
2317 break;
2318
2319 if (inst->saturate) {
2320 if (inst->dst.type != inst->src[0].type)
2321 assert(!"unimplemented: saturate mixed types");
2322
2323 if (brw_saturate_immediate(inst->dst.type,
2324 &inst->src[0].fixed_hw_reg)) {
2325 inst->saturate = false;
2326 progress = true;
2327 }
2328 }
2329 break;
2330
2331 case BRW_OPCODE_MUL:
2332 if (inst->src[1].file != IMM)
2333 continue;
2334
2335 /* a * 1.0 = a */
2336 if (inst->src[1].is_one()) {
2337 inst->opcode = BRW_OPCODE_MOV;
2338 inst->src[1] = reg_undef;
2339 progress = true;
2340 break;
2341 }
2342
2343 /* a * -1.0 = -a */
2344 if (inst->src[1].is_negative_one()) {
2345 inst->opcode = BRW_OPCODE_MOV;
2346 inst->src[0].negate = !inst->src[0].negate;
2347 inst->src[1] = reg_undef;
2348 progress = true;
2349 break;
2350 }
2351
2352 /* a * 0.0 = 0.0 */
2353 if (inst->src[1].is_zero()) {
2354 inst->opcode = BRW_OPCODE_MOV;
2355 inst->src[0] = inst->src[1];
2356 inst->src[1] = reg_undef;
2357 progress = true;
2358 break;
2359 }
2360
2361 if (inst->src[0].file == IMM) {
2362 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2363 inst->opcode = BRW_OPCODE_MOV;
2364 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2365 inst->src[1] = reg_undef;
2366 progress = true;
2367 break;
2368 }
2369 break;
2370 case BRW_OPCODE_ADD:
2371 if (inst->src[1].file != IMM)
2372 continue;
2373
2374 /* a + 0.0 = a */
2375 if (inst->src[1].is_zero()) {
2376 inst->opcode = BRW_OPCODE_MOV;
2377 inst->src[1] = reg_undef;
2378 progress = true;
2379 break;
2380 }
2381
2382 if (inst->src[0].file == IMM) {
2383 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2384 inst->opcode = BRW_OPCODE_MOV;
2385 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2386 inst->src[1] = reg_undef;
2387 progress = true;
2388 break;
2389 }
2390 break;
2391 case BRW_OPCODE_OR:
2392 if (inst->src[0].equals(inst->src[1])) {
2393 inst->opcode = BRW_OPCODE_MOV;
2394 inst->src[1] = reg_undef;
2395 progress = true;
2396 break;
2397 }
2398 break;
2399 case BRW_OPCODE_LRP:
2400 if (inst->src[1].equals(inst->src[2])) {
2401 inst->opcode = BRW_OPCODE_MOV;
2402 inst->src[0] = inst->src[1];
2403 inst->src[1] = reg_undef;
2404 inst->src[2] = reg_undef;
2405 progress = true;
2406 break;
2407 }
2408 break;
2409 case BRW_OPCODE_CMP:
2410 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2411 inst->src[0].abs &&
2412 inst->src[0].negate &&
2413 inst->src[1].is_zero()) {
2414 inst->src[0].abs = false;
2415 inst->src[0].negate = false;
2416 inst->conditional_mod = BRW_CONDITIONAL_Z;
2417 progress = true;
2418 break;
2419 }
2420 break;
2421 case BRW_OPCODE_SEL:
2422 if (inst->src[0].equals(inst->src[1])) {
2423 inst->opcode = BRW_OPCODE_MOV;
2424 inst->src[1] = reg_undef;
2425 inst->predicate = BRW_PREDICATE_NONE;
2426 inst->predicate_inverse = false;
2427 progress = true;
2428 } else if (inst->saturate && inst->src[1].file == IMM) {
2429 switch (inst->conditional_mod) {
2430 case BRW_CONDITIONAL_LE:
2431 case BRW_CONDITIONAL_L:
2432 switch (inst->src[1].type) {
2433 case BRW_REGISTER_TYPE_F:
2434 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2435 inst->opcode = BRW_OPCODE_MOV;
2436 inst->src[1] = reg_undef;
2437 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2438 progress = true;
2439 }
2440 break;
2441 default:
2442 break;
2443 }
2444 break;
2445 case BRW_CONDITIONAL_GE:
2446 case BRW_CONDITIONAL_G:
2447 switch (inst->src[1].type) {
2448 case BRW_REGISTER_TYPE_F:
2449 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2450 inst->opcode = BRW_OPCODE_MOV;
2451 inst->src[1] = reg_undef;
2452 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2453 progress = true;
2454 }
2455 break;
2456 default:
2457 break;
2458 }
2459 default:
2460 break;
2461 }
2462 }
2463 break;
2464 case BRW_OPCODE_MAD:
2465 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2466 inst->opcode = BRW_OPCODE_MOV;
2467 inst->src[1] = reg_undef;
2468 inst->src[2] = reg_undef;
2469 progress = true;
2470 } else if (inst->src[0].is_zero()) {
2471 inst->opcode = BRW_OPCODE_MUL;
2472 inst->src[0] = inst->src[2];
2473 inst->src[2] = reg_undef;
progress = true;
2474 } else if (inst->src[1].is_one()) {
2475 inst->opcode = BRW_OPCODE_ADD;
2476 inst->src[1] = inst->src[2];
2477 inst->src[2] = reg_undef;
2478 progress = true;
2479 } else if (inst->src[2].is_one()) {
2480 inst->opcode = BRW_OPCODE_ADD;
2481 inst->src[2] = reg_undef;
2482 progress = true;
2483 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2484 inst->opcode = BRW_OPCODE_ADD;
2485 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2486 inst->src[2] = reg_undef;
2487 progress = true;
2488 }
2489 break;
2490 case SHADER_OPCODE_RCP: {
2491 fs_inst *prev = (fs_inst *)inst->prev;
2492 if (prev->opcode == SHADER_OPCODE_SQRT) {
2493 if (inst->src[0].equals(prev->dst)) {
2494 inst->opcode = SHADER_OPCODE_RSQ;
2495 inst->src[0] = prev->src[0];
2496 progress = true;
2497 }
2498 }
2499 break;
2500 }
2501 default:
2502 break;
2503 }
2504 }
2505
2506 return progress;
2507 }
2508
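/**
 * Give a fresh virtual GRF to each complete, top-level (outside of control
 * flow) redefinition of a register, and rewrite later reads to use it.
 * This separates the live intervals of the old and new values so later
 * passes see shorter live ranges.
 */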
2509 bool
2510 fs_visitor::opt_register_renaming()
2511 {
2512 bool progress = false;
2513 int depth = 0;
2514
2515 int remap[alloc.count];
2516 memset(remap, -1, sizeof(int) * alloc.count);
2517
2518 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2519 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2520 depth++;
2521 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2522 inst->opcode == BRW_OPCODE_WHILE) {
2523 depth--;
2524 }
2525
2526 /* Rewrite instruction sources. */
2527 for (int i = 0; i < inst->sources; i++) {
2528 if (inst->src[i].file == GRF &&
2529 remap[inst->src[i].reg] != -1 &&
2530 remap[inst->src[i].reg] != inst->src[i].reg) {
2531 inst->src[i].reg = remap[inst->src[i].reg];
2532 progress = true;
2533 }
2534 }
2535
2536 const int dst = inst->dst.reg;
2537
2538 if (depth == 0 &&
2539 inst->dst.file == GRF &&
2540 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2541 !inst->is_partial_write()) {
2542 if (remap[dst] == -1) {
2543 remap[dst] = dst;
2544 } else {
2545 remap[dst] = alloc.allocate(inst->dst.width / 8);
2546 inst->dst.reg = remap[dst];
2547 progress = true;
2548 }
2549 } else if (inst->dst.file == GRF &&
2550 remap[dst] != -1 &&
2551 remap[dst] != dst) {
2552 inst->dst.reg = remap[dst];
2553 progress = true;
2554 }
2555 }
2556
2557 if (progress) {
2558 invalidate_live_intervals();
2559
2560 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2561 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2562 delta_x[i].reg = remap[delta_x[i].reg];
2563 }
2564 }
2565 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2566 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2567 delta_y[i].reg = remap[delta_y[i].reg];
2568 }
2569 }
2570 }
2571
2572 return progress;
2573 }
2574
2575 /**
2576 * Remove redundant or useless discard jumps.
2577 *
2578 * For example, we can eliminate jumps in the following sequence:
2579 *
2580 * discard-jump (redundant with the next jump)
2581 * discard-jump (useless; jumps to the next instruction)
2582 * placeholder-halt
2583 */
2584 bool
2585 fs_visitor::opt_redundant_discard_jumps()
2586 {
2587 bool progress = false;
2588
2589 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2590
2591 fs_inst *placeholder_halt = NULL;
2592 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2593 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2594 placeholder_halt = inst;
2595 break;
2596 }
2597 }
2598
2599 if (!placeholder_halt)
2600 return false;
2601
2602 /* Delete any HALTs immediately before the placeholder halt. */
2603 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2604 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2605 prev = (fs_inst *) placeholder_halt->prev) {
2606 prev->remove(last_bblock);
2607 progress = true;
2608 }
2609
2610 if (progress)
2611 invalidate_live_intervals();
2612
2613 return progress;
2614 }
2615
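/**
 * Look for MOVs from a GRF to an MRF where the GRF is not read again, and
 * try to make the instruction that computed the GRF value write directly
 * into the MRF instead, eliminating the MOV (Gen4-6 only, since Gen7+ has
 * no MRFs).
 */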
2616 bool
2617 fs_visitor::compute_to_mrf()
2618 {
2619 bool progress = false;
2620 int next_ip = 0;
2621
2622 /* No MRFs on Gen >= 7. */
2623 if (brw->gen >= 7)
2624 return false;
2625
2626 calculate_live_intervals();
2627
2628 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2629 int ip = next_ip;
2630 next_ip++;
2631
2632 if (inst->opcode != BRW_OPCODE_MOV ||
2633 inst->is_partial_write() ||
2634 inst->dst.file != MRF || inst->src[0].file != GRF ||
2635 inst->dst.type != inst->src[0].type ||
2636 inst->src[0].abs || inst->src[0].negate ||
2637 !inst->src[0].is_contiguous() ||
2638 inst->src[0].subreg_offset)
2639 continue;
2640
2641 /* Work out which hardware MRF registers are written by this
2642 * instruction.
2643 */
2644 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2645 int mrf_high;
2646 if (inst->dst.reg & BRW_MRF_COMPR4) {
2647 mrf_high = mrf_low + 4;
2648 } else if (inst->exec_size == 16) {
2649 mrf_high = mrf_low + 1;
2650 } else {
2651 mrf_high = mrf_low;
2652 }
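/* For example, the (mrf_low, mrf_high) pair recorded here is (2, 3) for a
 * SIMD16 write to m2 and (2, 6) for a COMPR4 write to m2.
 */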
2653
2654 /* Can't compute-to-MRF this GRF if someone else was going to
2655 * read it later.
2656 */
2657 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2658 continue;
2659
2660 /* Found a move of a GRF to an MRF. Let's see if we can go
2661 * rewrite the thing that made this GRF to write into the MRF.
2662 */
2663 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2664 if (scan_inst->dst.file == GRF &&
2665 scan_inst->dst.reg == inst->src[0].reg) {
2666 /* Found the last instruction to write the reg we want to turn
2667 * into a compute-to-MRF.
2668 */
2669
2670 /* If this one instruction didn't populate all the
2671 * channels, bail. We might be able to rewrite everything
2672 * that writes that reg, but it would require smarter
2673 * tracking to delay the rewriting until complete success.
2674 */
2675 if (scan_inst->is_partial_write())
2676 break;
2677
2678 /* Things returning more than one register would need us to
2679 * understand coalescing out more than one MOV at a time.
2680 */
2681 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2682 break;
2683
2684 /* SEND instructions can't have MRF as a destination. */
2685 if (scan_inst->mlen)
2686 break;
2687
2688 if (brw->gen == 6) {
2689 /* gen6 math instructions must have the destination be
2690 * GRF, so no compute-to-MRF for them.
2691 */
2692 if (scan_inst->is_math()) {
2693 break;
2694 }
2695 }
2696
2697 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2698 /* Found the creator of our MRF's source value. */
2699 scan_inst->dst.file = MRF;
2700 scan_inst->dst.reg = inst->dst.reg;
2701 scan_inst->saturate |= inst->saturate;
2702 inst->remove(block);
2703 progress = true;
2704 }
2705 break;
2706 }
2707
2708 /* We don't handle control flow here. Most values that end up
2709 * in MRFs are computed shortly before the MRF write
2710 * anyway.
2711 */
2712 if (block->start() == scan_inst)
2713 break;
2714
2715 /* You can't read from an MRF, so if someone else reads our
2716 * MRF's source GRF that we wanted to rewrite, that stops us.
2717 */
2718 bool interfered = false;
2719 for (int i = 0; i < scan_inst->sources; i++) {
2720 if (scan_inst->src[i].file == GRF &&
2721 scan_inst->src[i].reg == inst->src[0].reg &&
2722 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2723 interfered = true;
2724 }
2725 }
2726 if (interfered)
2727 break;
2728
2729 if (scan_inst->dst.file == MRF) {
2730 /* If somebody else writes our MRF here, we can't
2731 * compute-to-MRF before that.
2732 */
2733 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2734 int scan_mrf_high;
2735
2736 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2737 scan_mrf_high = scan_mrf_low + 4;
2738 } else if (scan_inst->exec_size == 16) {
2739 scan_mrf_high = scan_mrf_low + 1;
2740 } else {
2741 scan_mrf_high = scan_mrf_low;
2742 }
2743
2744 if (mrf_low == scan_mrf_low ||
2745 mrf_low == scan_mrf_high ||
2746 mrf_high == scan_mrf_low ||
2747 mrf_high == scan_mrf_high) {
2748 break;
2749 }
2750 }
2751
2752 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2753 /* Found a SEND instruction, which means that there are
2754 * live values in MRFs from base_mrf to base_mrf +
2755 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2756 * above it.
2757 */
2758 if (mrf_low >= scan_inst->base_mrf &&
2759 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2760 break;
2761 }
2762 if (mrf_high >= scan_inst->base_mrf &&
2763 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2764 break;
2765 }
2766 }
2767 }
2768 }
2769
2770 if (progress)
2771 invalidate_live_intervals();
2772
2773 return progress;
2774 }
2775
2776 /**
2777 * Emit the repclear shader: load the clear color from the first uniform
2778 * and write it to every enabled color region with FS_OPCODE_REP_FB_WRITE.
2779 */
2780 void
2781 fs_visitor::emit_repclear_shader()
2782 {
2783 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2784 int base_mrf = 1;
2785 int color_mrf = base_mrf + 2;
2786
2787 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2788 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2789 mov->force_writemask_all = true;
2790
2791 fs_inst *write;
2792 if (key->nr_color_regions == 1) {
2793 write = emit(FS_OPCODE_REP_FB_WRITE);
2794 write->saturate = key->clamp_fragment_color;
2795 write->base_mrf = color_mrf;
2796 write->target = 0;
2797 write->header_present = false;
2798 write->mlen = 1;
2799 } else {
2800 assume(key->nr_color_regions > 0);
2801 for (int i = 0; i < key->nr_color_regions; ++i) {
2802 write = emit(FS_OPCODE_REP_FB_WRITE);
2803 write->saturate = key->clamp_fragment_color;
2804 write->base_mrf = base_mrf;
2805 write->target = i;
2806 write->header_present = true;
2807 write->mlen = 3;
2808 }
2809 }
2810 write->eot = true;
2811
2812 calculate_cfg();
2813
2814 assign_constant_locations();
2815 assign_curb_setup();
2816
2817 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2818 assert(mov->src[0].file == HW_REG);
2819 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2820 }
2821
2822 /**
2823 * Walks through basic blocks, looking for repeated MRF writes and
2824 * removing the later ones.
2825 */
2826 bool
2827 fs_visitor::remove_duplicate_mrf_writes()
2828 {
2829 fs_inst *last_mrf_move[16];
2830 bool progress = false;
2831
2832 /* The MRF tracking below would need updating to handle compressed instructions, so skip SIMD16. */
2833 if (dispatch_width == 16)
2834 return false;
2835
2836 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2837
2838 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2839 if (inst->is_control_flow()) {
2840 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2841 }
2842
2843 if (inst->opcode == BRW_OPCODE_MOV &&
2844 inst->dst.file == MRF) {
2845 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2846 if (prev_inst && inst->equals(prev_inst)) {
2847 inst->remove(block);
2848 progress = true;
2849 continue;
2850 }
2851 }
2852
2853 /* Clear out the last-write records for MRFs that were overwritten. */
2854 if (inst->dst.file == MRF) {
2855 last_mrf_move[inst->dst.reg] = NULL;
2856 }
2857
2858 if (inst->mlen > 0 && inst->base_mrf != -1) {
2859 /* Found a SEND instruction, which will include two or fewer
2860 * implied MRF writes. We could do better here.
2861 */
2862 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2863 last_mrf_move[inst->base_mrf + i] = NULL;
2864 }
2865 }
2866
2867 /* Clear out any MRF move records whose sources got overwritten. */
2868 if (inst->dst.file == GRF) {
2869 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2870 if (last_mrf_move[i] &&
2871 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2872 last_mrf_move[i] = NULL;
2873 }
2874 }
2875 }
2876
2877 if (inst->opcode == BRW_OPCODE_MOV &&
2878 inst->dst.file == MRF &&
2879 inst->src[0].file == GRF &&
2880 !inst->is_partial_write()) {
2881 last_mrf_move[inst->dst.reg] = inst;
2882 }
2883 }
2884
2885 if (progress)
2886 invalidate_live_intervals();
2887
2888 return progress;
2889 }
2890
2891 static void
2892 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2893 {
2894 /* Clear the flag for registers that actually got read (as expected). */
2895 for (int i = 0; i < inst->sources; i++) {
2896 int grf;
2897 if (inst->src[i].file == GRF) {
2898 grf = inst->src[i].reg;
2899 } else if (inst->src[i].file == HW_REG &&
2900 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2901 grf = inst->src[i].fixed_hw_reg.nr;
2902 } else {
2903 continue;
2904 }
2905
2906 if (grf >= first_grf &&
2907 grf < first_grf + grf_len) {
2908 deps[grf - first_grf] = false;
2909 if (inst->exec_size == 16)
2910 deps[grf - first_grf + 1] = false;
2911 }
2912 }
2913 }
2914
2915 /**
2916 * Implements this workaround for the original 965:
2917 *
2918 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2919 * check for post destination dependencies on this instruction, software
2920 * must ensure that there is no destination hazard for the case of ‘write
2921 * followed by a posted write’ shown in the following example.
2922 *
2923 * 1. mov r3 0
2924 * 2. send r3.xy <rest of send instruction>
2925 * 3. mov r2 r3
2926 *
2927 * Due to no post-destination dependency check on the ‘send’, the above
2928 * code sequence could have two instructions (1 and 2) in flight at the
2929 * same time that both consider ‘r3’ as the target of their final writes.
2930 */
2931 void
2932 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2933 fs_inst *inst)
2934 {
2935 int write_len = inst->regs_written;
2936 int first_write_grf = inst->dst.reg;
2937 bool needs_dep[BRW_MAX_MRF];
2938 assert(write_len < (int)sizeof(needs_dep) - 1);
2939
2940 memset(needs_dep, false, sizeof(needs_dep));
2941 memset(needs_dep, true, write_len);
2942
2943 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2944
2945 /* Walk backwards looking for writes to registers we're writing which
2946 * aren't read since being written. If we hit the start of the program,
2947 * we assume that there are no outstanding dependencies on entry to the
2948 * program.
2949 */
2950 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2951 /* If we hit control flow, assume that there *are* outstanding
2952 * dependencies, and force their cleanup before our instruction.
2953 */
2954 if (block->start() == scan_inst) {
2955 for (int i = 0; i < write_len; i++) {
2956 if (needs_dep[i]) {
2957 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2958 }
2959 }
2960 return;
2961 }
2962
2963 /* We insert our reads as late as possible on the assumption that any
2964 * instruction but a MOV that might have left us an outstanding
2965 * dependency has more latency than a MOV.
2966 */
2967 if (scan_inst->dst.file == GRF) {
2968 for (int i = 0; i < scan_inst->regs_written; i++) {
2969 int reg = scan_inst->dst.reg + i;
2970
2971 if (reg >= first_write_grf &&
2972 reg < first_write_grf + write_len &&
2973 needs_dep[reg - first_write_grf]) {
2974 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2975 needs_dep[reg - first_write_grf] = false;
2976 if (scan_inst->exec_size == 16)
2977 needs_dep[reg - first_write_grf + 1] = false;
2978 }
2979 }
2980 }
2981
2982 /* Clear the flag for registers that actually got read (as expected). */
2983 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2984
2985 /* Continue the loop only if we haven't resolved all the dependencies */
2986 int i;
2987 for (i = 0; i < write_len; i++) {
2988 if (needs_dep[i])
2989 break;
2990 }
2991 if (i == write_len)
2992 return;
2993 }
2994 }
2995
2996 /**
2997 * Implements this workaround for the original 965:
2998 *
2999 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3000 * used as a destination register until after it has been sourced by an
3001 * instruction with a different destination register.
3002 */
3003 void
3004 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3005 {
3006 int write_len = inst->regs_written;
3007 int first_write_grf = inst->dst.reg;
3008 bool needs_dep[BRW_MAX_MRF];
3009 assert(write_len < (int)sizeof(needs_dep) - 1);
3010
3011 memset(needs_dep, false, sizeof(needs_dep));
3012 memset(needs_dep, true, write_len);
3013 /* Walk forwards looking for writes to registers we're writing which aren't
3014 * read before being written.
3015 */
3016 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3017 /* If we hit control flow, force resolve all remaining dependencies. */
3018 if (block->end() == scan_inst) {
3019 for (int i = 0; i < write_len; i++) {
3020 if (needs_dep[i])
3021 scan_inst->insert_before(block,
3022 DEP_RESOLVE_MOV(first_write_grf + i));
3023 }
3024 return;
3025 }
3026
3027 /* Clear the flag for registers that actually got read (as expected). */
3028 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3029
3030 /* We insert our reads as late as possible since they're reading the
3031 * result of a SEND, which has massive latency.
3032 */
3033 if (scan_inst->dst.file == GRF &&
3034 scan_inst->dst.reg >= first_write_grf &&
3035 scan_inst->dst.reg < first_write_grf + write_len &&
3036 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3037 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3038 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3039 }
3040
3041 /* Continue the loop only if we haven't resolved all the dependencies */
3042 int i;
3043 for (i = 0; i < write_len; i++) {
3044 if (needs_dep[i])
3045 break;
3046 }
3047 if (i == write_len)
3048 return;
3049 }
3050 }
3051
3052 void
3053 fs_visitor::insert_gen4_send_dependency_workarounds()
3054 {
3055 if (brw->gen != 4 || brw->is_g4x)
3056 return;
3057
3058 bool progress = false;
3059
3060 /* Note that we're done with register allocation, so GRF fs_regs always
3061 * have a .reg_offset of 0.
3062 */
3063
3064 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3065 if (inst->mlen != 0 && inst->dst.file == GRF) {
3066 insert_gen4_pre_send_dependency_workarounds(block, inst);
3067 insert_gen4_post_send_dependency_workarounds(block, inst);
3068 progress = true;
3069 }
3070 }
3071
3072 if (progress)
3073 invalidate_live_intervals();
3074 }
3075
3076 /**
3077 * Turns the generic expression-style uniform pull constant load instruction
3078 * into a hardware-specific series of instructions for loading a pull
3079 * constant.
3080 *
3081 * The expression style allows the CSE pass before this to optimize out
3082 * repeated loads from the same offset, and gives the pre-register-allocation
3083 * scheduling full flexibility, while the conversion to native instructions
3084 * allows the post-register-allocation scheduler the best information
3085 * possible.
3086 *
3087 * Note that execution masking for setting up pull constant loads is special:
3088 * the channels that need to be written are unrelated to the current execution
3089 * mask, since a later instruction will use one of the result channels as a
3090 * source operand for all 8 or 16 of its channels.
3091 */
3092 void
3093 fs_visitor::lower_uniform_pull_constant_loads()
3094 {
3095 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3096 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3097 continue;
3098
3099 if (brw->gen >= 7) {
3100 /* The offset arg before was a vec4-aligned byte offset. We need to
3101 * turn it into a dword offset.
3102 */
3103 fs_reg const_offset_reg = inst->src[1];
3104 assert(const_offset_reg.file == IMM &&
3105 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3106 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3107 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3108
3109 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3110 * Reserve space for the register.
3111 */
3112 if (brw->gen >= 9) {
3113 payload.reg_offset++;
3114 alloc.sizes[payload.reg] = 2;
3115 }
3116
3117 /* This is actually going to be a MOV, but since only the first dword
3118 * is accessed, we have a special opcode to do just that one. Note
3119 * that this needs to be an operation that will be considered a def
3120 * by live variable analysis, or register allocation will explode.
3121 */
3122 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3123 8, payload, const_offset_reg);
3124 setup->force_writemask_all = true;
3125
3126 setup->ir = inst->ir;
3127 setup->annotation = inst->annotation;
3128 inst->insert_before(block, setup);
3129
3130 /* Similarly, this will only populate the first 4 channels of the
3131 * result register (since we only use smear values from 0-3), but we
3132 * don't tell the optimizer.
3133 */
3134 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3135 inst->src[1] = payload;
3136
3137 invalidate_live_intervals();
3138 } else {
3139 /* Before register allocation, we didn't tell the scheduler about the
3140 * MRF we use. We know it's safe to use this MRF because nothing
3141 * else does except for register spill/unspill, which generates and
3142 * uses its MRF within a single IR instruction.
3143 */
3144 inst->base_mrf = 14;
3145 inst->mlen = 1;
3146 }
3147 }
3148 }
3149
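/**
 * Lower SHADER_OPCODE_LOAD_PAYLOAD into a sequence of MOVs, one per source,
 * using a single COMPR4 MOV for adjacent MRF pairs when possible, and
 * propagating per-register force_writemask_all/force_sechalf metadata so the
 * copies keep the execution controls of the instructions that produced them.
 */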
3150 bool
3151 fs_visitor::lower_load_payload()
3152 {
3153 bool progress = false;
3154
3155 int vgrf_to_reg[alloc.count];
3156 int reg_count = 0;
3157 for (unsigned i = 0; i < alloc.count; ++i) {
3158 vgrf_to_reg[i] = reg_count;
3159 reg_count += alloc.sizes[i];
3160 }
3161
3162 struct {
3163 bool written:1; /* Whether this register has ever been written */
3164 bool force_writemask_all:1;
3165 bool force_sechalf:1;
3166 } metadata[reg_count];
3167 memset(metadata, 0, sizeof(metadata));
3168
3169 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3170 if (inst->dst.file == GRF) {
3171 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3172 bool force_sechalf = inst->force_sechalf &&
3173 !inst->force_writemask_all;
3174 bool toggle_sechalf = inst->dst.width == 16 &&
3175 type_sz(inst->dst.type) == 4 &&
3176 !inst->force_writemask_all;
3177 for (int i = 0; i < inst->regs_written; ++i) {
3178 metadata[dst_reg + i].written = true;
3179 metadata[dst_reg + i].force_sechalf = force_sechalf;
3180 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3181 force_sechalf = (toggle_sechalf != force_sechalf);
3182 }
3183 }
3184
3185 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3186 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3187 fs_reg dst = inst->dst;
3188
3189 for (int i = 0; i < inst->sources; i++) {
3190 dst.width = inst->src[i].effective_width;
3191 dst.type = inst->src[i].type;
3192
3193 if (inst->src[i].file == BAD_FILE) {
3194 /* Do nothing but otherwise increment as normal */
3195 } else if (dst.file == MRF &&
3196 dst.width == 8 &&
3197 brw->has_compr4 &&
3198 i + 4 < inst->sources &&
3199 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3200 fs_reg compr4_dst = dst;
3201 compr4_dst.reg += BRW_MRF_COMPR4;
3202 compr4_dst.width = 16;
3203 fs_reg compr4_src = inst->src[i];
3204 compr4_src.width = 16;
3205 fs_inst *mov = MOV(compr4_dst, compr4_src);
3206 mov->force_writemask_all = true;
3207 inst->insert_before(block, mov);
3208 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3209 inst->src[i + 4].file = BAD_FILE;
3210 } else {
3211 fs_inst *mov = MOV(dst, inst->src[i]);
3212 if (inst->src[i].file == GRF) {
3213 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3214 inst->src[i].reg_offset;
3215 mov->force_sechalf = metadata[src_reg].force_sechalf;
3216 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3217 } else {
3218 /* We don't have any useful metadata for immediates or
3219 * uniforms. Assume that any of the channels of the
3220 * destination may be used.
3221 */
3222 assert(inst->src[i].file == IMM ||
3223 inst->src[i].file == UNIFORM);
3224 mov->force_writemask_all = true;
3225 }
3226
3227 if (dst.file == GRF) {
3228 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3229 const bool force_writemask = mov->force_writemask_all;
3230 metadata[dst_reg].force_writemask_all = force_writemask;
3231 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3232 if (dst.width * type_sz(dst.type) > 32) {
3233 assert(!mov->force_sechalf);
3234 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3235 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3236 }
3237 }
3238
3239 inst->insert_before(block, mov);
3240 }
3241
3242 dst = offset(dst, 1);
3243 }
3244
3245 inst->remove(block);
3246 progress = true;
3247 }
3248 }
3249
3250 if (progress)
3251 invalidate_live_intervals();
3252
3253 return progress;
3254 }
3255
3256 void
3257 fs_visitor::dump_instructions()
3258 {
3259 dump_instructions(NULL);
3260 }
3261
3262 void
3263 fs_visitor::dump_instructions(const char *name)
3264 {
3265 FILE *file = stderr;
3266 if (name && geteuid() != 0) {
3267 file = fopen(name, "w");
3268 if (!file)
3269 file = stderr;
3270 }
3271
3272 if (cfg) {
3273 calculate_register_pressure();
3274 int ip = 0, max_pressure = 0;
3275 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3276 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3277 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3278 dump_instruction(inst, file);
3279 ip++;
3280 }
3281 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3282 } else {
3283 int ip = 0;
3284 foreach_in_list(backend_instruction, inst, &instructions) {
3285 fprintf(file, "%4d: ", ip++);
3286 dump_instruction(inst, file);
3287 }
3288 }
3289
3290 if (file != stderr) {
3291 fclose(file);
3292 }
3293 }
3294
3295 void
3296 fs_visitor::dump_instruction(backend_instruction *be_inst)
3297 {
3298 dump_instruction(be_inst, stderr);
3299 }
3300
3301 void
3302 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3303 {
3304 fs_inst *inst = (fs_inst *)be_inst;
3305
3306 if (inst->predicate) {
3307 fprintf(file, "(%cf0.%d) ",
3308 inst->predicate_inverse ? '-' : '+',
3309 inst->flag_subreg);
3310 }
3311
3312 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3313 if (inst->saturate)
3314 fprintf(file, ".sat");
3315 if (inst->conditional_mod) {
3316 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3317 if (!inst->predicate &&
3318 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3319 inst->opcode != BRW_OPCODE_IF &&
3320 inst->opcode != BRW_OPCODE_WHILE))) {
3321 fprintf(file, ".f0.%d", inst->flag_subreg);
3322 }
3323 }
3324 fprintf(file, "(%d) ", inst->exec_size);
3325
3326
3327 switch (inst->dst.file) {
3328 case GRF:
3329 fprintf(file, "vgrf%d", inst->dst.reg);
3330 if (inst->dst.width != dispatch_width)
3331 fprintf(file, "@%d", inst->dst.width);
3332 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3333 inst->dst.subreg_offset)
3334 fprintf(file, "+%d.%d",
3335 inst->dst.reg_offset, inst->dst.subreg_offset);
3336 break;
3337 case MRF:
3338 fprintf(file, "m%d", inst->dst.reg);
3339 break;
3340 case BAD_FILE:
3341 fprintf(file, "(null)");
3342 break;
3343 case UNIFORM:
3344 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3345 break;
3346 case ATTR:
3347 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3348 break;
3349 case HW_REG:
3350 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3351 switch (inst->dst.fixed_hw_reg.nr) {
3352 case BRW_ARF_NULL:
3353 fprintf(file, "null");
3354 break;
3355 case BRW_ARF_ADDRESS:
3356 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3357 break;
3358 case BRW_ARF_ACCUMULATOR:
3359 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3360 break;
3361 case BRW_ARF_FLAG:
3362 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3363 inst->dst.fixed_hw_reg.subnr);
3364 break;
3365 default:
3366 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3367 inst->dst.fixed_hw_reg.subnr);
3368 break;
3369 }
3370 } else {
3371 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3372 }
3373 if (inst->dst.fixed_hw_reg.subnr)
3374 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3375 break;
3376 default:
3377 fprintf(file, "???");
3378 break;
3379 }
3380 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3381
3382 for (int i = 0; i < inst->sources; i++) {
3383 if (inst->src[i].negate)
3384 fprintf(file, "-");
3385 if (inst->src[i].abs)
3386 fprintf(file, "|");
3387 switch (inst->src[i].file) {
3388 case GRF:
3389 fprintf(file, "vgrf%d", inst->src[i].reg);
3390 if (inst->src[i].width != dispatch_width)
3391 fprintf(file, "@%d", inst->src[i].width);
3392 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3393 inst->src[i].subreg_offset)
3394 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3395 inst->src[i].subreg_offset);
3396 break;
3397 case MRF:
3398 fprintf(file, "***m%d***", inst->src[i].reg);
3399 break;
3400 case ATTR:
3401 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3402 break;
3403 case UNIFORM:
3404 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3405 if (inst->src[i].reladdr) {
3406 fprintf(file, "+reladdr");
3407 } else if (inst->src[i].subreg_offset) {
3408 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3409 inst->src[i].subreg_offset);
3410 }
3411 break;
3412 case BAD_FILE:
3413 fprintf(file, "(null)");
3414 break;
3415 case IMM:
3416 switch (inst->src[i].type) {
3417 case BRW_REGISTER_TYPE_F:
3418 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3419 break;
3420 case BRW_REGISTER_TYPE_W:
3421 case BRW_REGISTER_TYPE_D:
3422 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3423 break;
3424 case BRW_REGISTER_TYPE_UW:
3425 case BRW_REGISTER_TYPE_UD:
3426 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3427 break;
3428 case BRW_REGISTER_TYPE_VF:
3429 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3430 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3431 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3432 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3433 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3434 break;
3435 default:
3436 fprintf(file, "???");
3437 break;
3438 }
3439 break;
3440 case HW_REG:
3441 if (inst->src[i].fixed_hw_reg.negate)
3442 fprintf(file, "-");
3443 if (inst->src[i].fixed_hw_reg.abs)
3444 fprintf(file, "|");
3445 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3446 switch (inst->src[i].fixed_hw_reg.nr) {
3447 case BRW_ARF_NULL:
3448 fprintf(file, "null");
3449 break;
3450 case BRW_ARF_ADDRESS:
3451 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3452 break;
3453 case BRW_ARF_ACCUMULATOR:
3454 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3455 break;
3456 case BRW_ARF_FLAG:
3457 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3458 inst->src[i].fixed_hw_reg.subnr);
3459 break;
3460 default:
3461 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3462 inst->src[i].fixed_hw_reg.subnr);
3463 break;
3464 }
3465 } else {
3466 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3467 }
3468 if (inst->src[i].fixed_hw_reg.subnr)
3469 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3470 if (inst->src[i].fixed_hw_reg.abs)
3471 fprintf(file, "|");
3472 break;
3473 default:
3474 fprintf(file, "???");
3475 break;
3476 }
3477 if (inst->src[i].abs)
3478 fprintf(file, "|");
3479
3480 if (inst->src[i].file != IMM) {
3481 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3482 }
3483
3484 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3485 fprintf(file, ", ");
3486 }
3487
3488 fprintf(file, " ");
3489
3490 if (dispatch_width == 16 && inst->exec_size == 8) {
3491 if (inst->force_sechalf)
3492 fprintf(file, "2ndhalf ");
3493 else
3494 fprintf(file, "1sthalf ");
3495 }
3496
3497 fprintf(file, "\n");
3498 }
3499
3500 /**
3501 * Possibly returns an instruction that set up @param reg.
3502 *
3503 * Sometimes we want to take the result of some expression/variable
3504 * dereference tree and rewrite the instruction generating the result
3505 * of the tree. When processing the tree, we know that the
3506 * instructions generated are all writing temporaries that are dead
3507 * outside of this tree. So, if we have some instructions that write
3508 * a temporary, we're free to point that temp write somewhere else.
3509 *
3510 * Note that this doesn't guarantee that the returned instruction wrote
3511 * only reg -- it might be the size=4 destination of a texture instruction.
3512 */
3513 fs_inst *
3514 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3515 fs_inst *end,
3516 const fs_reg &reg)
3517 {
3518 if (end == start ||
3519 end->is_partial_write() ||
3520 reg.reladdr ||
3521 !reg.equals(end->dst)) {
3522 return NULL;
3523 } else {
3524 return end;
3525 }
3526 }
3527
3528 void
3529 fs_visitor::setup_payload_gen6()
3530 {
3531 bool uses_depth =
3532 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3533 unsigned barycentric_interp_modes =
3534 (stage == MESA_SHADER_FRAGMENT) ?
3535 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3536
3537 assert(brw->gen >= 6);
3538
3539 /* R0-1: masks, pixel X/Y coordinates. */
3540 payload.num_regs = 2;
3541 /* R2: only for 32-pixel dispatch. */
3542
3543 /* R3-26: barycentric interpolation coordinates. These appear in the
3544 * same order that they appear in the brw_wm_barycentric_interp_mode
3545 * enum. Each set of coordinates occupies 2 registers if dispatch width
3546 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3547 * appear if they were enabled using the "Barycentric Interpolation
3548 * Mode" bits in WM_STATE.
3549 */
3550 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3551 if (barycentric_interp_modes & (1 << i)) {
3552 payload.barycentric_coord_reg[i] = payload.num_regs;
3553 payload.num_regs += 2;
3554 if (dispatch_width == 16) {
3555 payload.num_regs += 2;
3556 }
3557 }
3558 }
3559
3560 /* R27: interpolated depth if uses source depth */
3561 if (uses_depth) {
3562 payload.source_depth_reg = payload.num_regs;
3563 payload.num_regs++;
3564 if (dispatch_width == 16) {
3565 /* R28: interpolated depth if not SIMD8. */
3566 payload.num_regs++;
3567 }
3568 }
3569 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3570 if (uses_depth) {
3571 payload.source_w_reg = payload.num_regs;
3572 payload.num_regs++;
3573 if (dispatch_width == 16) {
3574 /* R30: interpolated W if not SIMD8. */
3575 payload.num_regs++;
3576 }
3577 }
3578
3579 if (stage == MESA_SHADER_FRAGMENT) {
3580 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3581 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3582 prog_data->uses_pos_offset = key->compute_pos_offset;
3583 /* R31: MSAA position offsets. */
3584 if (prog_data->uses_pos_offset) {
3585 payload.sample_pos_reg = payload.num_regs;
3586 payload.num_regs++;
3587 }
3588 }
3589
3590 /* R32: MSAA input coverage mask */
3591 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3592 assert(brw->gen >= 7);
3593 payload.sample_mask_in_reg = payload.num_regs;
3594 payload.num_regs++;
3595 if (dispatch_width == 16) {
3596 /* R33: input coverage mask if not SIMD8. */
3597 payload.num_regs++;
3598 }
3599 }
3600
3601 /* R34-: bary for 32-pixel. */
3602 /* R58-59: interp W for 32-pixel. */
3603
3604 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3605 source_depth_to_render_target = true;
3606 }
3607 }
3608
3609 void
3610 fs_visitor::setup_vs_payload()
3611 {
3612 /* R0: thread header, R1: urb handles */
3613 payload.num_regs = 2;
3614 }
3615
3616 void
3617 fs_visitor::assign_binding_table_offsets()
3618 {
3619 assert(stage == MESA_SHADER_FRAGMENT);
3620 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3621 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3622 uint32_t next_binding_table_offset = 0;
3623
3624 /* If there are no color regions, we still perform an FB write to a null
3625 * renderbuffer, which we place at surface index 0.
3626 */
3627 prog_data->binding_table.render_target_start = next_binding_table_offset;
3628 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3629
3630 assign_common_binding_table_offsets(next_binding_table_offset);
3631 }
3632
3633 void
3634 fs_visitor::calculate_register_pressure()
3635 {
3636 invalidate_live_intervals();
3637 calculate_live_intervals();
3638
3639 unsigned num_instructions = 0;
3640 foreach_block(block, cfg)
3641 num_instructions += block->instructions.length();
3642
3643 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3644
3645 for (unsigned reg = 0; reg < alloc.count; reg++) {
3646 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3647 regs_live_at_ip[ip] += alloc.sizes[reg];
3648 }
3649 }
3650
3651 void
3652 fs_visitor::optimize()
3653 {
3654 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3655
3656 split_virtual_grfs();
3657
3658 move_uniform_array_access_to_pull_constants();
3659 assign_constant_locations();
3660 demote_pull_constants();
3661
3662 #define OPT(pass, args...) ({ \
3663 pass_num++; \
3664 bool this_progress = pass(args); \
3665 \
3666 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3667 char filename[64]; \
3668 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3669 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3670 \
3671 backend_visitor::dump_instructions(filename); \
3672 } \
3673 \
3674 progress = progress || this_progress; \
3675 this_progress; \
3676 })
3677
3678 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3679 char filename[64];
3680 snprintf(filename, 64, "%s%d-%04d-00-start",
3681 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3682
3683 backend_visitor::dump_instructions(filename);
3684 }
3685
3686 bool progress;
3687 int iteration = 0;
3688 int pass_num = 0;
3689 do {
3690 progress = false;
3691 pass_num = 0;
3692 iteration++;
3693
3694 OPT(remove_duplicate_mrf_writes);
3695
3696 OPT(opt_algebraic);
3697 OPT(opt_cse);
3698 OPT(opt_copy_propagate);
3699 OPT(opt_peephole_predicated_break);
3700 OPT(opt_cmod_propagation);
3701 OPT(dead_code_eliminate);
3702 OPT(opt_peephole_sel);
3703 OPT(dead_control_flow_eliminate, this);
3704 OPT(opt_register_renaming);
3705 OPT(opt_redundant_discard_jumps);
3706 OPT(opt_saturate_propagation);
3707 OPT(register_coalesce);
3708 OPT(compute_to_mrf);
3709
3710 OPT(compact_virtual_grfs);
3711 } while (progress);
3712
3713 pass_num = 0;
3714
3715 if (OPT(lower_load_payload)) {
3716 split_virtual_grfs();
3717 OPT(register_coalesce);
3718 OPT(compute_to_mrf);
3719 OPT(dead_code_eliminate);
3720 }
3721
3722 OPT(opt_combine_constants);
3723
3724 lower_uniform_pull_constant_loads();
3725 }
3726
3727 /**
3728 * Three-source instructions must have a GRF/MRF destination register;
3729 * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
3730 */
3731 void
3732 fs_visitor::fixup_3src_null_dest()
3733 {
3734 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3735 if (inst->is_3src() && inst->dst.is_null()) {
3736 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3737 inst->dst.type);
3738 }
3739 }
3740 }
3741
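/**
 * Schedule instructions and assign hardware registers.  The pre-RA
 * scheduling heuristics are tried in order until one allocates without
 * spilling; if none does, a SIMD16 compile is failed (the caller falls
 * back to SIMD8), while a SIMD8 compile spills until allocation succeeds.
 */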
3742 void
3743 fs_visitor::allocate_registers()
3744 {
3745 bool allocated_without_spills;
3746
3747 static const enum instruction_scheduler_mode pre_modes[] = {
3748 SCHEDULE_PRE,
3749 SCHEDULE_PRE_NON_LIFO,
3750 SCHEDULE_PRE_LIFO,
3751 };
3752
3753 /* Try each scheduling heuristic to see if it can successfully register
3754 * allocate without spilling. They should be ordered by decreasing
3755 * performance but increasing likelihood of allocating.
3756 */
3757 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3758 schedule_instructions(pre_modes[i]);
3759
3760 if (0) {
3761 assign_regs_trivial();
3762 allocated_without_spills = true;
3763 } else {
3764 allocated_without_spills = assign_regs(false);
3765 }
3766 if (allocated_without_spills)
3767 break;
3768 }
3769
3770 if (!allocated_without_spills) {
3771 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3772 "Vertex" : "Fragment";
3773
3774 /* We assume that any spilling is worse than just dropping back to
3775 * SIMD8.  There is likely some intermediate point where SIMD16 with a
3776 * few spills would still win, but we don't try to find it.
3777 */
3778 if (dispatch_width == 16) {
3779 fail("Failure to register allocate. Reduce number of "
3780 "live scalar values to avoid this.");
3781 } else {
3782 perf_debug("%s shader triggered register spilling. "
3783 "Try reducing the number of live scalar values to "
3784 "improve performance.\n", stage_name);
3785 }
3786
3787 /* Since we're out of heuristics, just go spill registers until we
3788 * get an allocation.
3789 */
3790 while (!assign_regs(true)) {
3791 if (failed)
3792 break;
3793 }
3794 }
3795
3796 /* This must come after all optimization and register allocation, since
3797 * it inserts dead code that happens to have side effects, and it does
3798 * so based on the actual physical registers in use.
3799 */
3800 insert_gen4_send_dependency_workarounds();
3801
3802 if (failed)
3803 return;
3804
3805 if (!allocated_without_spills)
3806 schedule_instructions(SCHEDULE_POST);
3807
3808 if (last_scratch > 0)
3809 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3810 }
3811
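/**
 * Generate, optimize, and register allocate code for a vertex shader.
 * Returns false on failure.
 */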
3812 bool
3813 fs_visitor::run_vs()
3814 {
3815 assert(stage == MESA_SHADER_VERTEX);
3816
3817 assign_common_binding_table_offsets(0);
3818 setup_vs_payload();
3819
3820 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3821 emit_shader_time_begin();
3822
3823 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3824 base_ir = ir;
3825 this->result = reg_undef;
3826 ir->accept(this);
3827 }
3828 base_ir = NULL;
3829 if (failed)
3830 return false;
3831
3832 emit_urb_writes();
3833
3834 calculate_cfg();
3835
3836 optimize();
3837
3838 assign_curb_setup();
3839 assign_vs_urb_setup();
3840
3841 fixup_3src_null_dest();
3842 allocate_registers();
3843
3844 return !failed;
3845 }
3846
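/**
 * Generate, optimize, and register allocate code for a fragment shader at
 * the current dispatch width.  Returns false on failure, e.g. so that a
 * failed SIMD16 attempt can fall back to SIMD8.
 */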
3847 bool
3848 fs_visitor::run_fs()
3849 {
3850 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3851 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3852
3853 assert(stage == MESA_SHADER_FRAGMENT);
3854
3855 sanity_param_count = prog->Parameters->NumParameters;
3856
3857 assign_binding_table_offsets();
3858
3859 if (brw->gen >= 6)
3860 setup_payload_gen6();
3861 else
3862 setup_payload_gen4();
3863
3864 if (0) {
3865 emit_dummy_fs();
3866 } else if (brw->use_rep_send && dispatch_width == 16) {
3867 emit_repclear_shader();
3868 } else {
3869 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3870 emit_shader_time_begin();
3871
3872 calculate_urb_setup();
3873 if (prog->InputsRead > 0) {
3874 if (brw->gen < 6)
3875 emit_interpolation_setup_gen4();
3876 else
3877 emit_interpolation_setup_gen6();
3878 }
3879
3880 /* We handle discards by keeping track of the still-live pixels in f0.1.
3881 * Initialize it with the dispatched pixels.
3882 */
3883 if (wm_prog_data->uses_kill) {
3884 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3885 discard_init->flag_subreg = 1;
3886 }
3887
3888 /* Generate FS IR for main().  (The visitor only descends into
3889 * functions called "main".)
3890 */
3891 if (shader) {
3892 if (getenv("INTEL_USE_NIR") != NULL) {
3893 emit_nir_code();
3894 } else {
3895 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3896 base_ir = ir;
3897 this->result = reg_undef;
3898 ir->accept(this);
3899 }
3900 }
3901 } else {
3902 emit_fragment_program_code();
3903 }
3904 base_ir = NULL;
3905 if (failed)
3906 return false;
3907
3908 emit(FS_OPCODE_PLACEHOLDER_HALT);
3909
3910 if (wm_key->alpha_test_func)
3911 emit_alpha_test();
3912
3913 emit_fb_writes();
3914
3915 calculate_cfg();
3916
3917 optimize();
3918
3919 assign_curb_setup();
3920 assign_urb_setup();
3921
3922 fixup_3src_null_dest();
3923 allocate_registers();
3924
3925 if (failed)
3926 return false;
3927 }
3928
3929 if (dispatch_width == 8)
3930 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3931 else
3932 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3933
3934 /* If any state parameters were appended, then ParameterValues could have
3935 * been realloced, in which case the driver uniform storage set up by
3936 * _mesa_associate_uniform_storage() would point to freed memory. Make
3937 * sure that didn't happen.
3938 */
3939 assert(sanity_param_count == prog->Parameters->NumParameters);
3940
3941 return !failed;
3942 }
3943
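/**
 * Compile a fragment shader to native code: always compile a SIMD8
 * program, additionally try SIMD16 on Gen5+ when the shader supports it,
 * and run whichever CFGs succeeded through the generator.  Returns the
 * assembly, or NULL if the SIMD8 compile failed.
 */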
3944 const unsigned *
3945 brw_wm_fs_emit(struct brw_context *brw,
3946 void *mem_ctx,
3947 const struct brw_wm_prog_key *key,
3948 struct brw_wm_prog_data *prog_data,
3949 struct gl_fragment_program *fp,
3950 struct gl_shader_program *prog,
3951 unsigned *final_assembly_size)
3952 {
3953 bool start_busy = false;
3954 double start_time = 0;
3955
3956 if (unlikely(brw->perf_debug)) {
3957 start_busy = (brw->batch.last_bo &&
3958 drm_intel_bo_busy(brw->batch.last_bo));
3959 start_time = get_time();
3960 }
3961
3962 struct brw_shader *shader = NULL;
3963 if (prog)
3964 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3965
3966 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3967 brw_dump_ir("fragment", prog, shader ? &shader->base : NULL, &fp->Base);
3968
3969 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3970 */
3971 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3972 if (!v.run_fs()) {
3973 if (prog) {
3974 prog->LinkStatus = false;
3975 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3976 }
3977
3978 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3979 v.fail_msg);
3980
3981 return NULL;
3982 }
3983
3984 cfg_t *simd16_cfg = NULL;
3985 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3986 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3987 brw->use_rep_send)) {
3988 if (!v.simd16_unsupported) {
3989 /* Try a SIMD16 compile */
3990 v2.import_uniforms(&v);
3991 if (!v2.run_fs()) {
3992 perf_debug("SIMD16 shader failed to compile, falling back to "
3993 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3994 } else {
3995 simd16_cfg = v2.cfg;
3996 }
3997 } else {
3998 perf_debug("SIMD16 shader unsupported, falling back to "
3999 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4000 }
4001 }
4002
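/* Only skip the SIMD8 program when SIMD8 has been explicitly disabled and
 * a SIMD16 program is available to take its place.
 */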
4003 cfg_t *simd8_cfg;
4004 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4005 if (no_simd8 && simd16_cfg) {
4006 simd8_cfg = NULL;
4007 prog_data->no_8 = true;
4008 } else {
4009 simd8_cfg = v.cfg;
4010 prog_data->no_8 = false;
4011 }
4012
4013 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4014 &fp->Base, v.runtime_check_aads_emit, "FS");
4015
4016 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4017 char *name;
4018 if (prog)
4019 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4020 prog->Label ? prog->Label : "unnamed",
4021 prog->Name);
4022 else
4023 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4024
4025 g.enable_debug(name);
4026 }
4027
4028 if (simd8_cfg)
4029 g.generate_code(simd8_cfg, 8);
4030 if (simd16_cfg)
4031 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4032
4033 if (unlikely(brw->perf_debug) && shader) {
4034 if (shader->compiled_once)
4035 brw_wm_debug_recompile(brw, prog, key);
4036 shader->compiled_once = true;
4037
4038 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4039 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4040 (get_time() - start_time) * 1000);
4041 }
4042 }
4043
4044 return g.get_assembly(final_assembly_size);
4045 }
4046
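/**
 * Precompile the fragment shader at link time, guessing a program key from
 * the information available then, so that a likely variant is already in
 * the program cache before the first draw.
 */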
4047 extern "C" bool
4048 brw_fs_precompile(struct gl_context *ctx,
4049 struct gl_shader_program *shader_prog,
4050 struct gl_program *prog)
4051 {
4052 struct brw_context *brw = brw_context(ctx);
4053 struct brw_wm_prog_key key;
4054
4055 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4056 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4057 bool program_uses_dfdy = fp->UsesDFdy;
4058
4059 memset(&key, 0, sizeof(key));
4060
4061 if (brw->gen < 6) {
4062 if (fp->UsesKill)
4063 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4064
4065 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4066 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4067
4068 /* Just assume depth testing. */
4069 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4070 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4071 }
4072
4073 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4074 BRW_FS_VARYING_INPUT_MASK) > 16)
4075 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4076
4077 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4078 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4079 for (unsigned i = 0; i < sampler_count; i++) {
4080 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4081 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4082 key.tex.swizzles[i] =
4083 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4084 } else {
4085 /* Color sampler: assume no swizzling. */
4086 key.tex.swizzles[i] = SWIZZLE_XYZW;
4087 }
4088 }
4089
4090 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4091 key.drawable_height = ctx->DrawBuffer->Height;
4092 }
4093
4094 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4095 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4096 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4097
4098 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4099 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4100 key.nr_color_regions > 1;
4101 }
4102
4103 key.program_string_id = bfp->id;
4104
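/* Save and restore the currently bound program state so the precompile
 * doesn't clobber it.
 */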
4105 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4106 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4107
4108 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
4109
4110 brw->wm.base.prog_offset = old_prog_offset;
4111 brw->wm.prog_data = old_prog_data;
4112
4113 return success;
4114 }