[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
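   /* E.g. for the computation above, a SIMD16 float destination with
    * stride 1 covers 16 * 1 * 4 = 64 bytes and therefore writes
    * DIV_ROUND_UP(64, 32) = 2 registers.
    */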
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
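         /* For instance, a SIMD8 float source has an effective width of 8
          * and a type size of 4, i.e. exactly 32 bytes, so it contributes
          * one register to regs_written.
          */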
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
379
380 return inst;
381 }
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396 * We break down the const_offset to a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402 */
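   /* As an illustration, const_offset == 14 splits into 14 & ~3 == 12,
    * which is added to the variable offset here, and 14 & 3 == 2, which
    * (times the scale chosen below) picks the component out of the loaded
    * vec4 at the end of this function.
    */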
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (brw->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (brw->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (brw->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (brw->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
458 /* The caller always wants uncompressed to emit the minimal extra
459 * dependencies, and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return (reg.file == dst.file &&
491 reg.reg == dst.reg &&
492 reg.reg_offset >= dst.reg_offset &&
493 reg.reg_offset < dst.reg_offset + regs_written);
494 }
495
496 bool
497 fs_inst::is_send_from_grf() const
498 {
499 switch (opcode) {
500 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
501 case SHADER_OPCODE_SHADER_TIME_ADD:
502 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
503 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
504 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
505 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
506 case SHADER_OPCODE_UNTYPED_ATOMIC:
507 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
508 case SHADER_OPCODE_URB_WRITE_SIMD8:
509 return true;
510 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
511 return src[1].file == GRF;
512 case FS_OPCODE_FB_WRITE:
513 return src[0].file == GRF;
514 default:
515 if (is_tex())
516 return src[0].file == GRF;
517
518 return false;
519 }
520 }
521
522 bool
523 fs_inst::can_do_source_mods(struct brw_context *brw)
524 {
525 if (brw->gen == 6 && is_math())
526 return false;
527
528 if (is_send_from_grf())
529 return false;
530
531 if (!backend_instruction::can_do_source_mods())
532 return false;
533
534 return true;
535 }
536
537 void
538 fs_reg::init()
539 {
540 memset(this, 0, sizeof(*this));
541 stride = 1;
542 }
543
544 /** Generic unset register constructor. */
545 fs_reg::fs_reg()
546 {
547 init();
548 this->file = BAD_FILE;
549 }
550
551 /** Immediate value constructor. */
552 fs_reg::fs_reg(float f)
553 {
554 init();
555 this->file = IMM;
556 this->type = BRW_REGISTER_TYPE_F;
557 this->fixed_hw_reg.dw1.f = f;
558 this->width = 1;
559 }
560
561 /** Immediate value constructor. */
562 fs_reg::fs_reg(int32_t i)
563 {
564 init();
565 this->file = IMM;
566 this->type = BRW_REGISTER_TYPE_D;
567 this->fixed_hw_reg.dw1.d = i;
568 this->width = 1;
569 }
570
571 /** Immediate value constructor. */
572 fs_reg::fs_reg(uint32_t u)
573 {
574 init();
575 this->file = IMM;
576 this->type = BRW_REGISTER_TYPE_UD;
577 this->fixed_hw_reg.dw1.ud = u;
578 this->width = 1;
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf[4])
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
588 }
589
590 /** Vector float immediate value constructor. */
591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
592 {
593 init();
594 this->file = IMM;
595 this->type = BRW_REGISTER_TYPE_VF;
596 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
597 (vf1 << 8) |
598 (vf2 << 16) |
599 (vf3 << 24);
600 }
601
602 /** Fixed brw_reg. */
603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
604 {
605 init();
606 this->file = HW_REG;
607 this->fixed_hw_reg = fixed_hw_reg;
608 this->type = fixed_hw_reg.type;
609 this->width = 1 << fixed_hw_reg.width;
610 }
611
612 bool
613 fs_reg::equals(const fs_reg &r) const
614 {
615 return (file == r.file &&
616 reg == r.reg &&
617 reg_offset == r.reg_offset &&
618 subreg_offset == r.subreg_offset &&
619 type == r.type &&
620 negate == r.negate &&
621 abs == r.abs &&
622 !reladdr && !r.reladdr &&
623 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
624 width == r.width &&
625 stride == r.stride);
626 }
627
628 fs_reg &
629 fs_reg::set_smear(unsigned subreg)
630 {
631 assert(file != HW_REG && file != IMM);
632 subreg_offset = subreg * type_sz(type);
633 stride = 0;
634 return *this;
635 }
636
637 bool
638 fs_reg::is_contiguous() const
639 {
640 return stride == 1;
641 }
642
643 int
644 fs_visitor::type_size(const struct glsl_type *type)
645 {
646 unsigned int size, i;
647
648 switch (type->base_type) {
649 case GLSL_TYPE_UINT:
650 case GLSL_TYPE_INT:
651 case GLSL_TYPE_FLOAT:
652 case GLSL_TYPE_BOOL:
653 return type->components();
654 case GLSL_TYPE_ARRAY:
655 return type_size(type->fields.array) * type->length;
656 case GLSL_TYPE_STRUCT:
657 size = 0;
658 for (i = 0; i < type->length; i++) {
659 size += type_size(type->fields.structure[i].type);
660 }
661 return size;
662 case GLSL_TYPE_SAMPLER:
663 /* Samplers take up no register space, since they're baked in at
664 * link time.
665 */
666 return 0;
667 case GLSL_TYPE_ATOMIC_UINT:
668 return 0;
669 case GLSL_TYPE_IMAGE:
670 case GLSL_TYPE_VOID:
671 case GLSL_TYPE_ERROR:
672 case GLSL_TYPE_INTERFACE:
673 case GLSL_TYPE_DOUBLE:
674 unreachable("not reached");
675 }
676
677 return 0;
678 }
679
680 /**
681 * Create a MOV to read the timestamp register.
682 *
683 * The caller is responsible for emitting the MOV. The return value is
684 * the destination of the MOV, with extra parameters set.
685 */
686 fs_reg
687 fs_visitor::get_timestamp(fs_inst **out_mov)
688 {
689 assert(brw->gen >= 7);
690
691 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
692 BRW_ARF_TIMESTAMP,
693 0),
694 BRW_REGISTER_TYPE_UD));
695
696 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
697
698 fs_inst *mov = MOV(dst, ts);
699 /* We want to read the 3 fields we care about even if it's not enabled in
700 * the dispatch.
701 */
702 mov->force_writemask_all = true;
703
704 /* The caller wants the low 32 bits of the timestamp. Since it's running
705    * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
706 * which is plenty of time for our purposes. It is identical across the
707 * EUs, but since it's tracking GPU core speed it will increment at a
708 * varying rate as render P-states change.
709 *
710 * The caller could also check if render P-states have changed (or anything
711 * else that might disrupt timing) by setting smear to 2 and checking if
712 * that field is != 0.
713 */
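   /* Roughly: a 32-bit counter ticking at ~1.2e9 Hz wraps after
    * 2^32 / 1.2e9, i.e. about 3.6 seconds.
    */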
714 dst.set_smear(0);
715
716 *out_mov = mov;
717 return dst;
718 }
719
720 void
721 fs_visitor::emit_shader_time_begin()
722 {
723 current_annotation = "shader time start";
724 fs_inst *mov;
725 shader_start_time = get_timestamp(&mov);
726 emit(mov);
727 }
728
729 void
730 fs_visitor::emit_shader_time_end()
731 {
732 current_annotation = "shader time end";
733
734 enum shader_time_shader_type type, written_type, reset_type;
735 switch (stage) {
736 case MESA_SHADER_VERTEX:
737 type = ST_VS;
738 written_type = ST_VS_WRITTEN;
739 reset_type = ST_VS_RESET;
740 break;
741 case MESA_SHADER_GEOMETRY:
742 type = ST_GS;
743 written_type = ST_GS_WRITTEN;
744 reset_type = ST_GS_RESET;
745 break;
746 case MESA_SHADER_FRAGMENT:
747 if (dispatch_width == 8) {
748 type = ST_FS8;
749 written_type = ST_FS8_WRITTEN;
750 reset_type = ST_FS8_RESET;
751 } else {
752 assert(dispatch_width == 16);
753 type = ST_FS16;
754 written_type = ST_FS16_WRITTEN;
755 reset_type = ST_FS16_RESET;
756 }
757 break;
758 default:
759 unreachable("fs_visitor::emit_shader_time_end missing code");
760 }
761
762 /* Insert our code just before the final SEND with EOT. */
763 exec_node *end = this->instructions.get_tail();
764 assert(end && ((fs_inst *) end)->eot);
765
766 fs_inst *tm_read;
767 fs_reg shader_end_time = get_timestamp(&tm_read);
768 end->insert_before(tm_read);
769
770 /* Check that there weren't any timestamp reset events (assuming these
771 * were the only two timestamp reads that happened).
772 */
773 fs_reg reset = shader_end_time;
774 reset.set_smear(2);
775 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
776 test->conditional_mod = BRW_CONDITIONAL_Z;
777 test->force_writemask_all = true;
778 end->insert_before(test);
779 end->insert_before(IF(BRW_PREDICATE_NORMAL));
780
781 fs_reg start = shader_start_time;
782 start.negate = true;
783 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
784 diff.set_smear(0);
785 fs_inst *add = ADD(diff, start, shader_end_time);
786 add->force_writemask_all = true;
787 end->insert_before(add);
788
789 /* If there were no instructions between the two timestamp gets, the diff
790 * is 2 cycles. Remove that overhead, so I can forget about that when
791 * trying to determine the time taken for single instructions.
792 */
793 add = ADD(diff, diff, fs_reg(-2u));
794 add->force_writemask_all = true;
795 end->insert_before(add);
796
797 end->insert_before(SHADER_TIME_ADD(type, diff));
798 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
799 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
800 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
801 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
802 }
803
804 fs_inst *
805 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
806 {
807 int shader_time_index =
808 brw_get_shader_time_index(brw, shader_prog, prog, type);
809 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
810
811 fs_reg payload;
812 if (dispatch_width == 8)
813 payload = vgrf(glsl_type::uvec2_type);
814 else
815 payload = vgrf(glsl_type::uint_type);
816
817 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
818 fs_reg(), payload, offset, value);
819 }
820
821 void
822 fs_visitor::vfail(const char *format, va_list va)
823 {
824 char *msg;
825
826 if (failed)
827 return;
828
829 failed = true;
830
831 msg = ralloc_vasprintf(mem_ctx, format, va);
832 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
833
834 this->fail_msg = msg;
835
836 if (debug_enabled) {
837 fprintf(stderr, "%s", msg);
838 }
839 }
840
841 void
842 fs_visitor::fail(const char *format, ...)
843 {
844 va_list va;
845
846 va_start(va, format);
847 vfail(format, va);
848 va_end(va);
849 }
850
851 /**
852 * Mark this program as impossible to compile in SIMD16 mode.
853 *
854 * During the SIMD8 compile (which happens first), we can detect and flag
855 * things that are unsupported in SIMD16 mode, so the compiler can skip
856 * the SIMD16 compile altogether.
857 *
858 * During a SIMD16 compile (if one happens anyway), this just calls fail().
859 */
860 void
861 fs_visitor::no16(const char *format, ...)
862 {
863 va_list va;
864
865 va_start(va, format);
866
867 if (dispatch_width == 16) {
868 vfail(format, va);
869 } else {
870 simd16_unsupported = true;
871
872 if (brw->perf_debug) {
873 if (no16_msg)
874 ralloc_vasprintf_append(&no16_msg, format, va);
875 else
876 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
877 }
878 }
879
880 va_end(va);
881 }
882
883 fs_inst *
884 fs_visitor::emit(enum opcode opcode)
885 {
886 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
887 }
888
889 fs_inst *
890 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
891 {
892 return emit(new(mem_ctx) fs_inst(opcode, dst));
893 }
894
895 fs_inst *
896 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
897 {
898 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
899 }
900
901 fs_inst *
902 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
903 const fs_reg &src1)
904 {
905 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
906 }
907
908 fs_inst *
909 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
910 const fs_reg &src1, const fs_reg &src2)
911 {
912 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
913 }
914
915 fs_inst *
916 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
917 fs_reg src[], int sources)
918 {
919 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
920 }
921
922 /**
923 * Returns true if the instruction has a flag that means it won't
924 * update an entire destination register.
925 *
926 * For example, dead code elimination and live variable analysis want to know
927 * when a write to a variable screens off any preceding values that were in
928 * it.
929 */
930 bool
931 fs_inst::is_partial_write() const
932 {
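   /* For example, a SIMD8 write to a 16-bit (W/UW) destination covers only
    * 8 * 2 = 16 bytes of the 32-byte register, so it counts as a partial
    * write.
    */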
933 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
934 (this->dst.width * type_sz(this->dst.type)) < 32 ||
935 !this->dst.is_contiguous());
936 }
937
938 int
939 fs_inst::regs_read(int arg) const
940 {
941 if (is_tex() && arg == 0 && src[0].file == GRF) {
942 return mlen;
943 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
944 return mlen;
945 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
946 return mlen;
947 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
948 return mlen;
949 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
950 return mlen;
951 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
952 return mlen;
953 }
954
955 switch (src[arg].file) {
956 case BAD_FILE:
957 case UNIFORM:
958 case IMM:
959 return 1;
960 case GRF:
961 case HW_REG:
962 if (src[arg].stride == 0) {
963 return 1;
964 } else {
965 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
966 return (size + 31) / 32;
967 }
968 case MRF:
969 unreachable("MRF registers are not allowed as sources");
970 default:
971 unreachable("Invalid register file");
972 }
973 }
974
975 bool
976 fs_inst::reads_flag() const
977 {
978 return predicate;
979 }
980
981 bool
982 fs_inst::writes_flag() const
983 {
984 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
985 opcode != BRW_OPCODE_IF &&
986 opcode != BRW_OPCODE_WHILE)) ||
987 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
988 }
989
990 /**
991 * Returns how many MRFs an FS opcode will write over.
992 *
993 * Note that this is not the 0 or 1 implied writes in an actual gen
994 * instruction -- the FS opcodes often generate MOVs in addition.
995 */
996 int
997 fs_visitor::implied_mrf_writes(fs_inst *inst)
998 {
999 if (inst->mlen == 0)
1000 return 0;
1001
1002 if (inst->base_mrf == -1)
1003 return 0;
1004
1005 switch (inst->opcode) {
1006 case SHADER_OPCODE_RCP:
1007 case SHADER_OPCODE_RSQ:
1008 case SHADER_OPCODE_SQRT:
1009 case SHADER_OPCODE_EXP2:
1010 case SHADER_OPCODE_LOG2:
1011 case SHADER_OPCODE_SIN:
1012 case SHADER_OPCODE_COS:
1013 return 1 * dispatch_width / 8;
1014 case SHADER_OPCODE_POW:
1015 case SHADER_OPCODE_INT_QUOTIENT:
1016 case SHADER_OPCODE_INT_REMAINDER:
1017 return 2 * dispatch_width / 8;
1018 case SHADER_OPCODE_TEX:
1019 case FS_OPCODE_TXB:
1020 case SHADER_OPCODE_TXD:
1021 case SHADER_OPCODE_TXF:
1022 case SHADER_OPCODE_TXF_CMS:
1023 case SHADER_OPCODE_TXF_MCS:
1024 case SHADER_OPCODE_TG4:
1025 case SHADER_OPCODE_TG4_OFFSET:
1026 case SHADER_OPCODE_TXL:
1027 case SHADER_OPCODE_TXS:
1028 case SHADER_OPCODE_LOD:
1029 return 1;
1030 case FS_OPCODE_FB_WRITE:
1031 return 2;
1032 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1033 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1034 return 1;
1035 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1036 return inst->mlen;
1037 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1038 return 2;
1039 case SHADER_OPCODE_UNTYPED_ATOMIC:
1040 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1041 case SHADER_OPCODE_URB_WRITE_SIMD8:
1042 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1043 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1044 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1045 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1046 return 0;
1047 default:
1048 unreachable("not reached");
1049 }
1050 }
1051
1052 fs_reg
1053 fs_visitor::vgrf(const glsl_type *const type)
1054 {
1055 int reg_width = dispatch_width / 8;
1056 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1057 brw_type_for_base_type(type), dispatch_width);
1058 }
1059
1060 fs_reg
1061 fs_visitor::vgrf(int num_components)
1062 {
1063 int reg_width = dispatch_width / 8;
1064 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1065 BRW_REGISTER_TYPE_F, dispatch_width);
1066 }
1067
1068 /** Fixed HW reg constructor. */
1069 fs_reg::fs_reg(enum register_file file, int reg)
1070 {
1071 init();
1072 this->file = file;
1073 this->reg = reg;
1074 this->type = BRW_REGISTER_TYPE_F;
1075
1076 switch (file) {
1077 case UNIFORM:
1078 this->width = 1;
1079 break;
1080 default:
1081 this->width = 8;
1082 }
1083 }
1084
1085 /** Fixed HW reg constructor. */
1086 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1087 {
1088 init();
1089 this->file = file;
1090 this->reg = reg;
1091 this->type = type;
1092
1093 switch (file) {
1094 case UNIFORM:
1095 this->width = 1;
1096 break;
1097 default:
1098 this->width = 8;
1099 }
1100 }
1101
1102 /** Fixed HW reg constructor. */
1103 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1104 uint8_t width)
1105 {
1106 init();
1107 this->file = file;
1108 this->reg = reg;
1109 this->type = type;
1110 this->width = width;
1111 }
1112
1113 fs_reg *
1114 fs_visitor::variable_storage(ir_variable *var)
1115 {
1116 return (fs_reg *)hash_table_find(this->variable_ht, var);
1117 }
1118
1119 void
1120 import_uniforms_callback(const void *key,
1121 void *data,
1122 void *closure)
1123 {
1124 struct hash_table *dst_ht = (struct hash_table *)closure;
1125 const fs_reg *reg = (const fs_reg *)data;
1126
1127 if (reg->file != UNIFORM)
1128 return;
1129
1130 hash_table_insert(dst_ht, data, key);
1131 }
1132
1133 /* For SIMD16, we need to carry over the uniform setup from the SIMD8
1134  * dispatch.  This brings in those uniform definitions.
1135 */
1136 void
1137 fs_visitor::import_uniforms(fs_visitor *v)
1138 {
1139 hash_table_call_foreach(v->variable_ht,
1140 import_uniforms_callback,
1141 variable_ht);
1142 this->push_constant_loc = v->push_constant_loc;
1143 this->pull_constant_loc = v->pull_constant_loc;
1144 this->uniforms = v->uniforms;
1145 this->param_size = v->param_size;
1146 }
1147
1148 /* Our support for uniforms is piggy-backed on the struct
1149 * gl_fragment_program, because that's where the values actually
1150 * get stored, rather than in some global gl_shader_program uniform
1151 * store.
1152 */
1153 void
1154 fs_visitor::setup_uniform_values(ir_variable *ir)
1155 {
1156 int namelen = strlen(ir->name);
1157
1158 /* The data for our (non-builtin) uniforms is stored in a series of
1159 * gl_uniform_driver_storage structs for each subcomponent that
1160 * glGetUniformLocation() could name. We know it's been set up in the same
1161 * order we'd walk the type, so walk the list of storage and find anything
1162 * with our name, or the prefix of a component that starts with our name.
1163 */
1164 unsigned params_before = uniforms;
1165 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1166 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1167
1168 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1169 (storage->name[namelen] != 0 &&
1170 storage->name[namelen] != '.' &&
1171 storage->name[namelen] != '[')) {
1172 continue;
1173 }
1174
1175 unsigned slots = storage->type->component_slots();
1176 if (storage->array_elements)
1177 slots *= storage->array_elements;
1178
1179 for (unsigned i = 0; i < slots; i++) {
1180 stage_prog_data->param[uniforms++] = &storage->storage[i];
1181 }
1182 }
1183
1184 /* Make sure we actually initialized the right amount of stuff here. */
1185 assert(params_before + ir->type->component_slots() == uniforms);
1186 (void)params_before;
1187 }
1188
1189
1190 /* Our support for builtin uniforms is even scarier than non-builtin.
1191 * It sits on top of the PROG_STATE_VAR parameters that are
1192 * automatically updated from GL context state.
1193 */
1194 void
1195 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1196 {
1197 const ir_state_slot *const slots = ir->get_state_slots();
1198 assert(slots != NULL);
1199
1200 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1201 /* This state reference has already been setup by ir_to_mesa, but we'll
1202 * get the same index back here.
1203 */
1204 int index = _mesa_add_state_reference(this->prog->Parameters,
1205 (gl_state_index *)slots[i].tokens);
1206
1207 /* Add each of the unique swizzles of the element as a parameter.
1208 * This'll end up matching the expected layout of the
1209 * array/matrix/structure we're trying to fill in.
1210 */
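      /* E.g. a slot with swizzle XXXX adds only one parameter, since the
       * repeated swizzle component terminates the loop below after the
       * first iteration.
       */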
1211 int last_swiz = -1;
1212 for (unsigned int j = 0; j < 4; j++) {
1213 int swiz = GET_SWZ(slots[i].swizzle, j);
1214 if (swiz == last_swiz)
1215 break;
1216 last_swiz = swiz;
1217
1218 stage_prog_data->param[uniforms++] =
1219 &prog->Parameters->ParameterValues[index][swiz];
1220 }
1221 }
1222 }
1223
1224 fs_reg *
1225 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1226 bool origin_upper_left)
1227 {
1228 assert(stage == MESA_SHADER_FRAGMENT);
1229 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1230 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1231 fs_reg wpos = *reg;
1232 bool flip = !origin_upper_left ^ key->render_to_fbo;
1233
1234 /* gl_FragCoord.x */
1235 if (pixel_center_integer) {
1236 emit(MOV(wpos, this->pixel_x));
1237 } else {
1238 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1239 }
1240 wpos = offset(wpos, 1);
1241
1242 /* gl_FragCoord.y */
1243 if (!flip && pixel_center_integer) {
1244 emit(MOV(wpos, this->pixel_y));
1245 } else {
1246 fs_reg pixel_y = this->pixel_y;
1247 float offset = (pixel_center_integer ? 0.0 : 0.5);
1248
1249 if (flip) {
1250 pixel_y.negate = true;
1251 offset += key->drawable_height - 1.0;
1252 }
1253
1254 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1255 }
1256 wpos = offset(wpos, 1);
1257
1258 /* gl_FragCoord.z */
1259 if (brw->gen >= 6) {
1260 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1261 } else {
1262 emit(FS_OPCODE_LINTERP, wpos,
1263 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1264 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1265 interp_reg(VARYING_SLOT_POS, 2));
1266 }
1267 wpos = offset(wpos, 1);
1268
1269 /* gl_FragCoord.w: Already set up in emit_interpolation */
1270 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1271
1272 return reg;
1273 }
1274
1275 fs_inst *
1276 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1277 glsl_interp_qualifier interpolation_mode,
1278 bool is_centroid, bool is_sample)
1279 {
1280 brw_wm_barycentric_interp_mode barycoord_mode;
1281 if (brw->gen >= 6) {
1282 if (is_centroid) {
1283 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1284 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1285 else
1286 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1287 } else if (is_sample) {
1288 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1289 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1290 else
1291 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1292 } else {
1293 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1294 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1295 else
1296 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1297 }
1298 } else {
1299 /* On Ironlake and below, there is only one interpolation mode.
1300 * Centroid interpolation doesn't mean anything on this hardware --
1301 * there is no multisampling.
1302 */
1303 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1304 }
1305 return emit(FS_OPCODE_LINTERP, attr,
1306 this->delta_x[barycoord_mode],
1307 this->delta_y[barycoord_mode], interp);
1308 }
1309
1310 void
1311 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1312 const glsl_type *type,
1313 glsl_interp_qualifier interpolation_mode,
1314 int location, bool mod_centroid,
1315 bool mod_sample)
1316 {
1317 attr.type = brw_type_for_base_type(type->get_scalar_type());
1318
1319 assert(stage == MESA_SHADER_FRAGMENT);
1320 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1321 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1322
1323 unsigned int array_elements;
1324
1325 if (type->is_array()) {
1326 array_elements = type->length;
1327 if (array_elements == 0) {
1328 fail("dereferenced array '%s' has length 0\n", name);
1329 }
1330 type = type->fields.array;
1331 } else {
1332 array_elements = 1;
1333 }
1334
1335 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1336 bool is_gl_Color =
1337 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1338 if (key->flat_shade && is_gl_Color) {
1339 interpolation_mode = INTERP_QUALIFIER_FLAT;
1340 } else {
1341 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1342 }
1343 }
1344
1345 for (unsigned int i = 0; i < array_elements; i++) {
1346 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1347 if (prog_data->urb_setup[location] == -1) {
1348 /* If there's no incoming setup data for this slot, don't
1349 * emit interpolation for it.
1350 */
1351 attr = offset(attr, type->vector_elements);
1352 location++;
1353 continue;
1354 }
1355
1356 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1357 /* Constant interpolation (flat shading) case. The SF has
1358 * handed us defined values in only the constant offset
1359 * field of the setup reg.
1360 */
1361 for (unsigned int k = 0; k < type->vector_elements; k++) {
1362 struct brw_reg interp = interp_reg(location, k);
1363 interp = suboffset(interp, 3);
1364 interp.type = attr.type;
1365 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1366 attr = offset(attr, 1);
1367 }
1368 } else {
1369 /* Smooth/noperspective interpolation case. */
1370 for (unsigned int k = 0; k < type->vector_elements; k++) {
1371 struct brw_reg interp = interp_reg(location, k);
1372 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1373 /* Get the pixel/sample mask into f0 so that we know
1374 * which pixels are lit. Then, for each channel that is
1375 * unlit, replace the centroid data with non-centroid
1376 * data.
1377 */
1378 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1379
1380 fs_inst *inst;
1381 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1382 false, false);
1383 inst->predicate = BRW_PREDICATE_NORMAL;
1384 inst->predicate_inverse = true;
1385 if (brw->has_pln)
1386 inst->no_dd_clear = true;
1387
1388 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1389 mod_centroid && !key->persample_shading,
1390 mod_sample || key->persample_shading);
1391 inst->predicate = BRW_PREDICATE_NORMAL;
1392 inst->predicate_inverse = false;
1393 if (brw->has_pln)
1394 inst->no_dd_check = true;
1395
1396 } else {
1397 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1398 mod_centroid && !key->persample_shading,
1399 mod_sample || key->persample_shading);
1400 }
1401 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1402 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1403 }
1404 attr = offset(attr, 1);
1405 }
1406
1407 }
1408 location++;
1409 }
1410 }
1411 }
1412
1413 fs_reg *
1414 fs_visitor::emit_frontfacing_interpolation()
1415 {
1416 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1417
1418 if (brw->gen >= 6) {
1419 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1420 * a boolean result from this (~0/true or 0/false).
1421 *
1422 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1423 * this task in only one instruction:
1424 * - a negation source modifier will flip the bit; and
1425 * - a W -> D type conversion will sign extend the bit into the high
1426 * word of the destination.
1427 *
1428 * An ASR 15 fills the low word of the destination.
1429 */
1430 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1431 g0.negate = true;
1432
1433 emit(ASR(*reg, g0, fs_reg(15)));
1434 } else {
1435 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1436 * a boolean result from this (1/true or 0/false).
1437 *
1438 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1439 * the negation source modifier to flip it. Unfortunately the SHR
1440 * instruction only operates on UD (or D with an abs source modifier)
1441 * sources without negation.
1442 *
1443 * Instead, use ASR (which will give ~0/true or 0/false).
1444 */
1445 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1446 g1_6.negate = true;
1447
1448 emit(ASR(*reg, g1_6, fs_reg(31)));
1449 }
1450
1451 return reg;
1452 }
1453
1454 void
1455 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1456 {
1457 assert(stage == MESA_SHADER_FRAGMENT);
1458 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1459 assert(dst.type == BRW_REGISTER_TYPE_F);
1460
1461 if (key->compute_pos_offset) {
1462 /* Convert int_sample_pos to floating point */
1463 emit(MOV(dst, int_sample_pos));
1464 /* Scale to the range [0, 1] */
1465 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1466 }
1467 else {
1468 /* From ARB_sample_shading specification:
1469 * "When rendering to a non-multisample buffer, or if multisample
1470 * rasterization is disabled, gl_SamplePosition will always be
1471    *     (0.5, 0.5)."
1472 */
1473 emit(MOV(dst, fs_reg(0.5f)));
1474 }
1475 }
1476
1477 fs_reg *
1478 fs_visitor::emit_samplepos_setup()
1479 {
1480 assert(brw->gen >= 6);
1481
1482 this->current_annotation = "compute sample position";
1483 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1484 fs_reg pos = *reg;
1485 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1486 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1487
1488 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1489 * mode will be enabled.
1490 *
1491 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1492 * R31.1:0 Position Offset X/Y for Slot[3:0]
1493 * R31.3:2 Position Offset X/Y for Slot[7:4]
1494 * .....
1495 *
1496 * The X, Y sample positions come in as bytes in thread payload. So, read
1497 * the positions using vstride=16, width=8, hstride=2.
1498 */
1499 struct brw_reg sample_pos_reg =
1500 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1501 BRW_REGISTER_TYPE_B), 16, 8, 2);
1502
1503 if (dispatch_width == 8) {
1504 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1505 } else {
1506 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1507 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1508 ->force_sechalf = true;
1509 }
1510 /* Compute gl_SamplePosition.x */
1511 compute_sample_position(pos, int_sample_x);
1512 pos = offset(pos, 1);
1513 if (dispatch_width == 8) {
1514 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1515 } else {
1516 emit(MOV(half(int_sample_y, 0),
1517 fs_reg(suboffset(sample_pos_reg, 1))));
1518 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1519 ->force_sechalf = true;
1520 }
1521 /* Compute gl_SamplePosition.y */
1522 compute_sample_position(pos, int_sample_y);
1523 return reg;
1524 }
1525
1526 fs_reg *
1527 fs_visitor::emit_sampleid_setup()
1528 {
1529 assert(stage == MESA_SHADER_FRAGMENT);
1530 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1531 assert(brw->gen >= 6);
1532
1533 this->current_annotation = "compute sample id";
1534 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1535
1536 if (key->compute_sample_id) {
1537 fs_reg t1 = vgrf(glsl_type::int_type);
1538 fs_reg t2 = vgrf(glsl_type::int_type);
1539 t2.type = BRW_REGISTER_TYPE_UW;
1540
1541 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1542 * 8x multisampling, subspan 0 will represent sample N (where N
1543 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1544 * 7. We can find the value of N by looking at R0.0 bits 7:6
1545 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1546 * (since samples are always delivered in pairs). That is, we
1547 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1548 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1549 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1550 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1551 * populating a temporary variable with the sequence (0, 1, 2, 3),
1552 * and then reading from it using vstride=1, width=4, hstride=0.
1553 * These computations hold good for 4x multisampling as well.
1554 *
1555 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1556 * the first four slots are sample 0 of subspan 0; the next four
1557 * are sample 1 of subspan 0; the third group is sample 0 of
1558 * subspan 1, and finally sample 1 of subspan 1.
1559 */
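      /* Worked example: if R0.0 bits 7:6 read 10b, then
       * (R0.0 & 0xc0) >> 5 == 0x80 >> 5 == 4, so in SIMD8 subspan 0 gets
       * sample 4 and subspan 1 gets sample 5 once the (0, 0, 0, 0, 1, 1,
       * 1, 1) sequence is added.
       */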
1560 fs_inst *inst;
1561 inst = emit(BRW_OPCODE_AND, t1,
1562 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1563 fs_reg(0xc0));
1564 inst->force_writemask_all = true;
1565 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1566 inst->force_writemask_all = true;
1567 /* This works for both SIMD8 and SIMD16 */
1568 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1569 inst->force_writemask_all = true;
1570 /* This special instruction takes care of setting vstride=1,
1571 * width=4, hstride=0 of t2 during an ADD instruction.
1572 */
1573 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1574 } else {
1575 /* As per GL_ARB_sample_shading specification:
1576 * "When rendering to a non-multisample buffer, or if multisample
1577 * rasterization is disabled, gl_SampleID will always be zero."
1578 */
1579 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1580 }
1581
1582 return reg;
1583 }
1584
1585 void
1586 fs_visitor::resolve_source_modifiers(fs_reg *src)
1587 {
1588 if (!src->abs && !src->negate)
1589 return;
1590
1591 fs_reg temp = retype(vgrf(1), src->type);
1592 emit(MOV(temp, *src));
1593 *src = temp;
1594 }
1595
1596 fs_reg
1597 fs_visitor::fix_math_operand(fs_reg src)
1598 {
1599 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1600 * might be able to do better by doing execsize = 1 math and then
1601 * expanding that result out, but we would need to be careful with
1602 * masking.
1603 *
1604 * The hardware ignores source modifiers (negate and abs) on math
1605 * instructions, so we also move to a temp to set those up.
1606 */
1607 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1608 !src.abs && !src.negate)
1609 return src;
1610
1611 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1612 * operands to math
1613 */
1614 if (brw->gen >= 7 && src.file != IMM)
1615 return src;
1616
1617 fs_reg expanded = vgrf(glsl_type::float_type);
1618 expanded.type = src.type;
1619 emit(BRW_OPCODE_MOV, expanded, src);
1620 return expanded;
1621 }
1622
1623 fs_inst *
1624 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1625 {
1626 switch (opcode) {
1627 case SHADER_OPCODE_RCP:
1628 case SHADER_OPCODE_RSQ:
1629 case SHADER_OPCODE_SQRT:
1630 case SHADER_OPCODE_EXP2:
1631 case SHADER_OPCODE_LOG2:
1632 case SHADER_OPCODE_SIN:
1633 case SHADER_OPCODE_COS:
1634 break;
1635 default:
1636 unreachable("not reached: bad math opcode");
1637 }
1638
1639 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1640 * might be able to do better by doing execsize = 1 math and then
1641 * expanding that result out, but we would need to be careful with
1642 * masking.
1643 *
1644 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1645 * instructions, so we also move to a temp to set those up.
1646 */
1647 if (brw->gen == 6 || brw->gen == 7)
1648 src = fix_math_operand(src);
1649
1650 fs_inst *inst = emit(opcode, dst, src);
1651
1652 if (brw->gen < 6) {
1653 inst->base_mrf = 2;
1654 inst->mlen = dispatch_width / 8;
1655 }
1656
1657 return inst;
1658 }
1659
1660 fs_inst *
1661 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1662 {
1663 int base_mrf = 2;
1664 fs_inst *inst;
1665
1666 if (brw->gen >= 8) {
1667 inst = emit(opcode, dst, src0, src1);
1668 } else if (brw->gen >= 6) {
1669 src0 = fix_math_operand(src0);
1670 src1 = fix_math_operand(src1);
1671
1672 inst = emit(opcode, dst, src0, src1);
1673 } else {
1674 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1675 * "Message Payload":
1676 *
1677 * "Operand0[7]. For the INT DIV functions, this operand is the
1678 * denominator."
1679 * ...
1680 * "Operand1[7]. For the INT DIV functions, this operand is the
1681 * numerator."
1682 */
1683 bool is_int_div = opcode != SHADER_OPCODE_POW;
1684 fs_reg &op0 = is_int_div ? src1 : src0;
1685 fs_reg &op1 = is_int_div ? src0 : src1;
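      /* I.e. for the INT DIV opcodes the operands are swapped so that src1
       * ends up as Operand0 (the denominator per the quote above) and src0
       * is sent as Operand1 (the numerator) in the second MRF below; POW
       * keeps the original order.
       */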
1686
1687 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1688 inst = emit(opcode, dst, op0, reg_null_f);
1689
1690 inst->base_mrf = base_mrf;
1691 inst->mlen = 2 * dispatch_width / 8;
1692 }
1693 return inst;
1694 }
1695
1696 void
1697 fs_visitor::emit_discard_jump()
1698 {
1699 /* For performance, after a discard, jump to the end of the
1700 * shader if all relevant channels have been discarded.
1701 */
1702 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1703 discard_jump->flag_subreg = 1;
1704
1705 discard_jump->predicate = (dispatch_width == 8)
1706 ? BRW_PREDICATE_ALIGN1_ANY8H
1707 : BRW_PREDICATE_ALIGN1_ANY16H;
1708 discard_jump->predicate_inverse = true;
1709 }
1710
1711 void
1712 fs_visitor::assign_curb_setup()
1713 {
1714 if (dispatch_width == 8) {
1715 prog_data->dispatch_grf_start_reg = payload.num_regs;
1716 } else {
1717 assert(stage == MESA_SHADER_FRAGMENT);
1718 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1719 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1720 }
1721
1722 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1723
1724 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1725 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1726 for (unsigned int i = 0; i < inst->sources; i++) {
1727 if (inst->src[i].file == UNIFORM) {
1728 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1729 int constant_nr;
1730 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1731 constant_nr = push_constant_loc[uniform_nr];
1732 } else {
1733 /* Section 5.11 of the OpenGL 4.1 spec says:
1734 * "Out-of-bounds reads return undefined values, which include
1735 * values from other variables of the active program or zero."
1736 * Just return the first push constant.
1737 */
1738 constant_nr = 0;
1739 }
1740
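            /* E.g. push constant slot 10 lands in GRF payload.num_regs + 1,
             * subregister 2 (10 / 8 == 1, 10 % 8 == 2).
             */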
1741 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1742 constant_nr / 8,
1743 constant_nr % 8);
1744
1745 inst->src[i].file = HW_REG;
1746 inst->src[i].fixed_hw_reg = byte_offset(
1747 retype(brw_reg, inst->src[i].type),
1748 inst->src[i].subreg_offset);
1749 }
1750 }
1751 }
1752 }
1753
1754 void
1755 fs_visitor::calculate_urb_setup()
1756 {
1757 assert(stage == MESA_SHADER_FRAGMENT);
1758 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1759 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1760
1761 memset(prog_data->urb_setup, -1,
1762 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1763
1764 int urb_next = 0;
1765 /* Figure out where each of the incoming setup attributes lands. */
1766 if (brw->gen >= 6) {
1767 if (_mesa_bitcount_64(prog->InputsRead &
1768 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1769 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1770 * first 16 varying inputs, so we can put them wherever we want.
1771 * Just put them in order.
1772 *
1773 * This is useful because it means that (a) inputs not used by the
1774 * fragment shader won't take up valuable register space, and (b) we
1775 * won't have to recompile the fragment shader if it gets paired with
1776 * a different vertex (or geometry) shader.
1777 */
1778 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1779 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1780 BITFIELD64_BIT(i)) {
1781 prog_data->urb_setup[i] = urb_next++;
1782 }
1783 }
1784 } else {
1785 /* We have enough input varyings that the SF/SBE pipeline stage can't
1786 * arbitrarily rearrange them to suit our whim; we have to put them
1787 * in an order that matches the output of the previous pipeline stage
1788 * (geometry or vertex shader).
1789 */
1790 struct brw_vue_map prev_stage_vue_map;
1791 brw_compute_vue_map(brw, &prev_stage_vue_map,
1792 key->input_slots_valid);
1793 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1794 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1795 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1796 slot++) {
1797 int varying = prev_stage_vue_map.slot_to_varying[slot];
1798 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1799 * unused.
1800 */
1801 if (varying != BRW_VARYING_SLOT_COUNT &&
1802 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1803 BITFIELD64_BIT(varying))) {
1804 prog_data->urb_setup[varying] = slot - first_slot;
1805 }
1806 }
1807 urb_next = prev_stage_vue_map.num_slots - first_slot;
1808 }
1809 } else {
1810 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1811 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1812 /* Point size is packed into the header, not as a general attribute */
1813 if (i == VARYING_SLOT_PSIZ)
1814 continue;
1815
1816 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1817 /* The back color slot is skipped when the front color is
1818 * also written to. In addition, some slots can be
1819 * written in the vertex shader and not read in the
1820 * fragment shader. So the register number must always be
1821 * incremented, mapped or not.
1822 */
1823 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1824 prog_data->urb_setup[i] = urb_next;
1825 urb_next++;
1826 }
1827 }
1828
1829 /*
1830    * It's an FS-only attribute, and we did interpolation for this attribute
1831    * in the SF thread.  So, count it here, too.
1832 *
1833 * See compile_sf_prog() for more info.
1834 */
1835 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1836 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1837 }
1838
1839 prog_data->num_varying_inputs = urb_next;
1840 }
1841
1842 void
1843 fs_visitor::assign_urb_setup()
1844 {
1845 assert(stage == MESA_SHADER_FRAGMENT);
1846 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1847
1848 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1849
1850 /* Offset all the urb_setup[] index by the actual position of the
1851 * setup regs, now that the location of the constants has been chosen.
1852 */
1853 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1854 if (inst->opcode == FS_OPCODE_LINTERP) {
1855 assert(inst->src[2].file == HW_REG);
1856 inst->src[2].fixed_hw_reg.nr += urb_start;
1857 }
1858
1859 if (inst->opcode == FS_OPCODE_CINTERP) {
1860 assert(inst->src[0].file == HW_REG);
1861 inst->src[0].fixed_hw_reg.nr += urb_start;
1862 }
1863 }
1864
1865 /* Each attribute is 4 setup channels, each of which is half a reg. */
1866 this->first_non_payload_grf =
1867 urb_start + prog_data->num_varying_inputs * 2;
1868 }
1869
1870 void
1871 fs_visitor::assign_vs_urb_setup()
1872 {
1873 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1874 int grf, count, slot, channel, attr;
1875
1876 assert(stage == MESA_SHADER_VERTEX);
1877 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1878 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1879 count++;
1880
1881 /* Each attribute is 4 regs. */
1882 this->first_non_payload_grf =
1883 payload.num_regs + prog_data->curb_read_length + count * 4;
1884
1885 unsigned vue_entries =
1886 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1887
1888 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1889 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1890
1891 assert(vs_prog_data->base.urb_read_length <= 15);
1892
1893 /* Rewrite all ATTR file references to the hw grf that they land in. */
1894 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1895 for (int i = 0; i < inst->sources; i++) {
1896 if (inst->src[i].file == ATTR) {
1897
1898 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1899 slot = count - 1;
1900 } else {
1901                /* Attributes arrive in a contiguous block, ordered by their
1902 * gl_vert_attrib value. That means we can compute the slot
1903 * number for an attribute by masking out the enabled
1904 * attributes before it and counting the bits.
1905 */
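               /* Illustrative example: if inputs_read has bits 0, 3 and 7
                * set and attr == 7, BITFIELD64_MASK(7) keeps bits 0 and 3,
                * so _mesa_bitcount_64() returns 2 and the attribute lands
                * in slot 2.
                */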
1906 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1907 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1908 BITFIELD64_MASK(attr));
1909 }
1910
1911 channel = inst->src[i].reg_offset & 3;
1912
1913 grf = payload.num_regs +
1914 prog_data->curb_read_length +
1915 slot * 4 + channel;
1916
1917 inst->src[i].file = HW_REG;
1918 inst->src[i].fixed_hw_reg =
1919 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1920 }
1921 }
1922 }
1923 }
1924
1925 /**
1926 * Split large virtual GRFs into separate components if we can.
1927 *
1928 * This is mostly duplicated with what brw_fs_vector_splitting does,
1929 * but that's really conservative because it's afraid of doing
1930 * splitting that doesn't result in real progress after the rest of
1931 * the optimization phases, which would cause infinite looping in
1932 * optimization. We can do it once here, safely. This also has the
1933 * opportunity to split interpolated values, or maybe even uniforms,
1934 * which we don't have at the IR level.
1935 *
1936 * We want to split, because virtual GRFs are what we register
1937 * allocate and spill (due to contiguousness requirements for some
1938 * instructions), and they're what we naturally generate in the
1939 * codegen process, but most virtual GRFs don't actually need to be
1940 * contiguous sets of GRFs. If we split, we'll end up with reduced
1941 * live intervals and better dead code elimination and coalescing.
1942 */
1943 void
1944 fs_visitor::split_virtual_grfs()
1945 {
1946 int num_vars = this->alloc.count;
1947
1948 /* Count the total number of registers */
1949 int reg_count = 0;
1950 int vgrf_to_reg[num_vars];
1951 for (int i = 0; i < num_vars; i++) {
1952 vgrf_to_reg[i] = reg_count;
1953 reg_count += alloc.sizes[i];
1954 }
1955
1956 /* An array of "split points". For each register slot, this indicates
1957 * if this slot can be separated from the previous slot. Every time an
1958 * instruction uses multiple elements of a register (as a source or
1959 * destination), we mark the used slots as inseparable. Then we go
1960 * through and split the registers into the smallest pieces we can.
1961 */
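   /* Worked example (illustrative): for a 4-slot VGRF whose middle two
    * slots are only ever accessed together, split_points ends up as
    * {false, true, false, true}, which splits the VGRF into pieces of
    * sizes 1, 2 and 1.
    */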
1962 bool split_points[reg_count];
1963 memset(split_points, 0, sizeof(split_points));
1964
1965 /* Mark all used registers as fully splittable */
1966 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1967 if (inst->dst.file == GRF) {
1968 int reg = vgrf_to_reg[inst->dst.reg];
1969 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1970 split_points[reg + j] = true;
1971 }
1972
1973 for (int i = 0; i < inst->sources; i++) {
1974 if (inst->src[i].file == GRF) {
1975 int reg = vgrf_to_reg[inst->src[i].reg];
1976 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1977 split_points[reg + j] = true;
1978 }
1979 }
1980 }
1981
1982 if (brw->has_pln &&
1983 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1984 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1985 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1986 * Gen6, that was the only supported interpolation mode, and since Gen6,
1987 * delta_x and delta_y are in fixed hardware registers.
1988 */
1989 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1990 split_points[vgrf_to_reg[vgrf] + 1] = false;
1991 }
1992
1993 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1994 if (inst->dst.file == GRF) {
1995 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1996 for (int j = 1; j < inst->regs_written; j++)
1997 split_points[reg + j] = false;
1998 }
1999 for (int i = 0; i < inst->sources; i++) {
2000 if (inst->src[i].file == GRF) {
2001 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2002 for (int j = 1; j < inst->regs_read(i); j++)
2003 split_points[reg + j] = false;
2004 }
2005 }
2006 }
2007
2008 int new_virtual_grf[reg_count];
2009 int new_reg_offset[reg_count];
2010
2011 int reg = 0;
2012 for (int i = 0; i < num_vars; i++) {
2013       /* As a quick sanity check, the first slot is never a split point. */
2014 assert(split_points[reg] == false);
2015
2016 /* j = 0 case */
2017 new_reg_offset[reg] = 0;
2018 reg++;
2019 int offset = 1;
2020
2021 /* j > 0 case */
2022 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2023          /* If this is a split point, allocate a new virtual GRF for the
2024           * previous 'offset' registers and reset the offset to 0.
2025 */
2026 if (split_points[reg]) {
2027 assert(offset <= MAX_VGRF_SIZE);
2028 int grf = alloc.allocate(offset);
2029 for (int k = reg - offset; k < reg; k++)
2030 new_virtual_grf[k] = grf;
2031 offset = 0;
2032 }
2033 new_reg_offset[reg] = offset;
2034 offset++;
2035 reg++;
2036 }
2037
2038 /* The last one gets the original register number */
2039 assert(offset <= MAX_VGRF_SIZE);
2040 alloc.sizes[i] = offset;
2041 for (int k = reg - offset; k < reg; k++)
2042 new_virtual_grf[k] = i;
2043 }
2044 assert(reg == reg_count);
2045
2046 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2047 if (inst->dst.file == GRF) {
2048 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2049 inst->dst.reg = new_virtual_grf[reg];
2050 inst->dst.reg_offset = new_reg_offset[reg];
2051 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2052 }
2053 for (int i = 0; i < inst->sources; i++) {
2054 if (inst->src[i].file == GRF) {
2055 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2056 inst->src[i].reg = new_virtual_grf[reg];
2057 inst->src[i].reg_offset = new_reg_offset[reg];
2058 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2059 }
2060 }
2061 }
2062 invalidate_live_intervals();
2063 }
2064
2065 /**
2066 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2067 *
2068 * During code generation, we create tons of temporary variables, many of
2069 * which get immediately killed and are never used again. Yet, in later
2070 * optimization and analysis passes, such as compute_live_intervals, we need
2071 * to loop over all the virtual GRFs. Compacting them can save a lot of
2072 * overhead.
2073 */
2074 bool
2075 fs_visitor::compact_virtual_grfs()
2076 {
2077 bool progress = false;
2078 int remap_table[this->alloc.count];
2079 memset(remap_table, -1, sizeof(remap_table));
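   /* remap_table[i] == -1 means VGRF i is unused; otherwise it will hold
    * the compacted index.  For example, with four VGRFs of which only the
    * second is dead, the table ends up as {0, -1, 1, 2}.
    */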
2080
2081 /* Mark which virtual GRFs are used. */
2082 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2083 if (inst->dst.file == GRF)
2084 remap_table[inst->dst.reg] = 0;
2085
2086 for (int i = 0; i < inst->sources; i++) {
2087 if (inst->src[i].file == GRF)
2088 remap_table[inst->src[i].reg] = 0;
2089 }
2090 }
2091
2092 /* Compact the GRF arrays. */
2093 int new_index = 0;
2094 for (unsigned i = 0; i < this->alloc.count; i++) {
2095 if (remap_table[i] == -1) {
2096 /* We just found an unused register. This means that we are
2097 * actually going to compact something.
2098 */
2099 progress = true;
2100 } else {
2101 remap_table[i] = new_index;
2102 alloc.sizes[new_index] = alloc.sizes[i];
2103 invalidate_live_intervals();
2104 ++new_index;
2105 }
2106 }
2107
2108 this->alloc.count = new_index;
2109
2110 /* Patch all the instructions to use the newly renumbered registers */
2111 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2112 if (inst->dst.file == GRF)
2113 inst->dst.reg = remap_table[inst->dst.reg];
2114
2115 for (int i = 0; i < inst->sources; i++) {
2116 if (inst->src[i].file == GRF)
2117 inst->src[i].reg = remap_table[inst->src[i].reg];
2118 }
2119 }
2120
2121 /* Patch all the references to delta_x/delta_y, since they're used in
2122 * register allocation. If they're unused, switch them to BAD_FILE so
2123 * we don't think some random VGRF is delta_x/delta_y.
2124 */
2125 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2126 if (delta_x[i].file == GRF) {
2127 if (remap_table[delta_x[i].reg] != -1) {
2128 delta_x[i].reg = remap_table[delta_x[i].reg];
2129 } else {
2130 delta_x[i].file = BAD_FILE;
2131 }
2132 }
2133 }
2134 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2135 if (delta_y[i].file == GRF) {
2136 if (remap_table[delta_y[i].reg] != -1) {
2137 delta_y[i].reg = remap_table[delta_y[i].reg];
2138 } else {
2139 delta_y[i].file = BAD_FILE;
2140 }
2141 }
2142 }
2143
2144 return progress;
2145 }
2146
2147 /*
2148 * Implements array access of uniforms by inserting a
2149 * PULL_CONSTANT_LOAD instruction.
2150 *
2151  * Unlike temporary GRF array access (which we don't support, due to
2152  * the difficulty of doing relative addressing on instruction
2153 * destinations), we could potentially do array access of uniforms
2154 * that were loaded in GRF space as push constants. In real-world
2155 * usage we've seen, though, the arrays being used are always larger
2156 * than we could load as push constants, so just always move all
2157 * uniform array access out to a pull constant buffer.
2158 */
2159 void
2160 fs_visitor::move_uniform_array_access_to_pull_constants()
2161 {
2162 if (dispatch_width != 8)
2163 return;
2164
2165 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2166 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2167
2168 /* Walk through and find array access of uniforms. Put a copy of that
2169 * uniform in the pull constant buffer.
2170 *
2171 * Note that we don't move constant-indexed accesses to arrays. No
2172 * testing has been done of the performance impact of this choice.
2173 */
2174 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2175 for (int i = 0 ; i < inst->sources; i++) {
2176 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2177 continue;
2178
2179 int uniform = inst->src[i].reg;
2180
2181 /* If this array isn't already present in the pull constant buffer,
2182 * add it.
2183 */
2184 if (pull_constant_loc[uniform] == -1) {
2185 const gl_constant_value **values = &stage_prog_data->param[uniform];
2186
2187 assert(param_size[uniform]);
2188
2189 for (int j = 0; j < param_size[uniform]; j++) {
2190 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2191
2192 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2193 values[j];
2194 }
2195 }
2196 }
2197 }
2198 }
2199
2200 /**
2201 * Assign UNIFORM file registers to either push constants or pull constants.
2202 *
2203  * We allow a fragment shader to have more than the specified minimum
2204  * maximum number of fragment shader uniform components (64).  If there
2205  * are too many of these, they would fill up all of the register space,
2206  * so this pass pushes some of them out to the pull constant buffer and
2207  * updates the program to load them from there.
2208 */
2209 void
2210 fs_visitor::assign_constant_locations()
2211 {
2212 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2213 if (dispatch_width != 8)
2214 return;
2215
2216 /* Find which UNIFORM registers are still in use. */
2217 bool is_live[uniforms];
2218 for (unsigned int i = 0; i < uniforms; i++) {
2219 is_live[i] = false;
2220 }
2221
2222 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2223 for (int i = 0; i < inst->sources; i++) {
2224 if (inst->src[i].file != UNIFORM)
2225 continue;
2226
2227 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2228 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2229 is_live[constant_nr] = true;
2230 }
2231 }
2232
2233 /* Only allow 16 registers (128 uniform components) as push constants.
2234 *
2235 * Just demote the end of the list. We could probably do better
2236 * here, demoting things that are rarely used in the program first.
2237 *
2238 * If changing this value, note the limitation about total_regs in
2239 * brw_curbe.c.
2240 */
2241 unsigned int max_push_components = 16 * 8;
2242 unsigned int num_push_constants = 0;
2243
2244 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2245
2246 for (unsigned int i = 0; i < uniforms; i++) {
2247 if (!is_live[i] || pull_constant_loc[i] != -1) {
2248 /* This UNIFORM register is either dead, or has already been demoted
2249 * to a pull const. Mark it as no longer living in the param[] array.
2250 */
2251 push_constant_loc[i] = -1;
2252 continue;
2253 }
2254
2255 if (num_push_constants < max_push_components) {
2256          /* Retain as a push constant.  Record the location in the param[]
2257 * array.
2258 */
2259 push_constant_loc[i] = num_push_constants++;
2260 } else {
2261 /* Demote to a pull constant. */
2262 push_constant_loc[i] = -1;
2263
2264 int pull_index = stage_prog_data->nr_pull_params++;
2265 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2266 pull_constant_loc[i] = pull_index;
2267 }
2268 }
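   /* At this point every live uniform has exactly one home: either
    * push_constant_loc[i] >= 0 (uploaded as a push constant) or
    * pull_constant_loc[i] >= 0 (loaded from the pull constant buffer).
    */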
2269
2270 stage_prog_data->nr_params = num_push_constants;
2271
2272 /* Up until now, the param[] array has been indexed by reg + reg_offset
2273 * of UNIFORM registers. Condense it to only contain the uniforms we
2274 * chose to upload as push constants.
2275 */
2276 for (unsigned int i = 0; i < uniforms; i++) {
2277 int remapped = push_constant_loc[i];
2278
2279 if (remapped == -1)
2280 continue;
2281
2282 assert(remapped <= (int)i);
2283 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2284 }
2285 }
2286
2287 /**
2288 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2289 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2290 */
2291 void
2292 fs_visitor::demote_pull_constants()
2293 {
2294 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2295 for (int i = 0; i < inst->sources; i++) {
2296 if (inst->src[i].file != UNIFORM)
2297 continue;
2298
2299 int pull_index;
2300 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2301 if (location >= uniforms) /* Out of bounds access */
2302 pull_index = -1;
2303 else
2304 pull_index = pull_constant_loc[location];
2305
2306 if (pull_index == -1)
2307 continue;
2308
2309          /* Set up the annotation tracking for newly generated instructions. */
2310 base_ir = inst->ir;
2311 current_annotation = inst->annotation;
2312
2313 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2314 fs_reg dst = vgrf(glsl_type::float_type);
2315
2316 /* Generate a pull load into dst. */
2317 if (inst->src[i].reladdr) {
2318 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2319 surf_index,
2320 *inst->src[i].reladdr,
2321 pull_index);
2322 inst->insert_before(block, &list);
2323 inst->src[i].reladdr = NULL;
2324 } else {
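            /* The byte offset is rounded down to a vec4 (16-byte) boundary;
             * set_smear() below then picks the matching component out of
             * the loaded vec4.
             */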
2325 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2326 fs_inst *pull =
2327 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2328 dst, surf_index, offset);
2329 inst->insert_before(block, pull);
2330 inst->src[i].set_smear(pull_index & 3);
2331 }
2332
2333 /* Rewrite the instruction to use the temporary VGRF. */
2334 inst->src[i].file = GRF;
2335 inst->src[i].reg = dst.reg;
2336 inst->src[i].reg_offset = 0;
2337 inst->src[i].width = dispatch_width;
2338 }
2339 }
2340 invalidate_live_intervals();
2341 }
2342
2343 bool
2344 fs_visitor::opt_algebraic()
2345 {
2346 bool progress = false;
2347
2348 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2349 switch (inst->opcode) {
2350 case BRW_OPCODE_MOV:
2351 if (inst->src[0].file != IMM)
2352 break;
2353
2354 if (inst->saturate) {
2355 if (inst->dst.type != inst->src[0].type)
2356 assert(!"unimplemented: saturate mixed types");
2357
2358 if (brw_saturate_immediate(inst->dst.type,
2359 &inst->src[0].fixed_hw_reg)) {
2360 inst->saturate = false;
2361 progress = true;
2362 }
2363 }
2364 break;
2365
2366 case BRW_OPCODE_MUL:
2367 if (inst->src[1].file != IMM)
2368 continue;
2369
2370 /* a * 1.0 = a */
2371 if (inst->src[1].is_one()) {
2372 inst->opcode = BRW_OPCODE_MOV;
2373 inst->src[1] = reg_undef;
2374 progress = true;
2375 break;
2376 }
2377
2378 /* a * -1.0 = -a */
2379 if (inst->src[1].is_negative_one()) {
2380 inst->opcode = BRW_OPCODE_MOV;
2381 inst->src[0].negate = !inst->src[0].negate;
2382 inst->src[1] = reg_undef;
2383 progress = true;
2384 break;
2385 }
2386
2387 /* a * 0.0 = 0.0 */
2388 if (inst->src[1].is_zero()) {
2389 inst->opcode = BRW_OPCODE_MOV;
2390 inst->src[0] = inst->src[1];
2391 inst->src[1] = reg_undef;
2392 progress = true;
2393 break;
2394 }
2395
2396 if (inst->src[0].file == IMM) {
2397 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2398 inst->opcode = BRW_OPCODE_MOV;
2399 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2400 inst->src[1] = reg_undef;
2401 progress = true;
2402 break;
2403 }
2404 break;
2405 case BRW_OPCODE_ADD:
2406 if (inst->src[1].file != IMM)
2407 continue;
2408
2409 /* a + 0.0 = a */
2410 if (inst->src[1].is_zero()) {
2411 inst->opcode = BRW_OPCODE_MOV;
2412 inst->src[1] = reg_undef;
2413 progress = true;
2414 break;
2415 }
2416
2417 if (inst->src[0].file == IMM) {
2418 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2419 inst->opcode = BRW_OPCODE_MOV;
2420 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2421 inst->src[1] = reg_undef;
2422 progress = true;
2423 break;
2424 }
2425 break;
2426 case BRW_OPCODE_OR:
2427 if (inst->src[0].equals(inst->src[1])) {
2428 inst->opcode = BRW_OPCODE_MOV;
2429 inst->src[1] = reg_undef;
2430 progress = true;
2431 break;
2432 }
2433 break;
2434 case BRW_OPCODE_LRP:
2435 if (inst->src[1].equals(inst->src[2])) {
2436 inst->opcode = BRW_OPCODE_MOV;
2437 inst->src[0] = inst->src[1];
2438 inst->src[1] = reg_undef;
2439 inst->src[2] = reg_undef;
2440 progress = true;
2441 break;
2442 }
2443 break;
2444 case BRW_OPCODE_CMP:
2445 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2446 inst->src[0].abs &&
2447 inst->src[0].negate &&
2448 inst->src[1].is_zero()) {
2449 inst->src[0].abs = false;
2450 inst->src[0].negate = false;
2451 inst->conditional_mod = BRW_CONDITIONAL_Z;
2452 progress = true;
2453 break;
2454 }
2455 break;
2456 case BRW_OPCODE_SEL:
2457 if (inst->src[0].equals(inst->src[1])) {
2458 inst->opcode = BRW_OPCODE_MOV;
2459 inst->src[1] = reg_undef;
2460 inst->predicate = BRW_PREDICATE_NONE;
2461 inst->predicate_inverse = false;
2462 progress = true;
2463 } else if (inst->saturate && inst->src[1].file == IMM) {
2464 switch (inst->conditional_mod) {
2465 case BRW_CONDITIONAL_LE:
2466 case BRW_CONDITIONAL_L:
2467 switch (inst->src[1].type) {
2468 case BRW_REGISTER_TYPE_F:
2469 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2470 inst->opcode = BRW_OPCODE_MOV;
2471 inst->src[1] = reg_undef;
2472 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2473 progress = true;
2474 }
2475 break;
2476 default:
2477 break;
2478 }
2479 break;
2480 case BRW_CONDITIONAL_GE:
2481 case BRW_CONDITIONAL_G:
2482 switch (inst->src[1].type) {
2483 case BRW_REGISTER_TYPE_F:
2484 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2485 inst->opcode = BRW_OPCODE_MOV;
2486 inst->src[1] = reg_undef;
2487 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2488 progress = true;
2489 }
2490 break;
2491 default:
2492 break;
2493 }
2494 default:
2495 break;
2496 }
2497 }
2498 break;
2499 case BRW_OPCODE_MAD:
2500 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2501 inst->opcode = BRW_OPCODE_MOV;
2502 inst->src[1] = reg_undef;
2503 inst->src[2] = reg_undef;
2504 progress = true;
2505 } else if (inst->src[0].is_zero()) {
2506 inst->opcode = BRW_OPCODE_MUL;
2507 inst->src[0] = inst->src[2];
2508 inst->src[2] = reg_undef;
2509 progress = true;
2510 } else if (inst->src[1].is_one()) {
2511 inst->opcode = BRW_OPCODE_ADD;
2512 inst->src[1] = inst->src[2];
2513 inst->src[2] = reg_undef;
2514 progress = true;
2515 } else if (inst->src[2].is_one()) {
2516 inst->opcode = BRW_OPCODE_ADD;
2517 inst->src[2] = reg_undef;
2518 progress = true;
2519 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2520 inst->opcode = BRW_OPCODE_ADD;
2521 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2522 inst->src[2] = reg_undef;
2523 progress = true;
2524 }
2525 break;
2526 case SHADER_OPCODE_RCP: {
2527 fs_inst *prev = (fs_inst *)inst->prev;
2528 if (prev->opcode == SHADER_OPCODE_SQRT) {
2529 if (inst->src[0].equals(prev->dst)) {
2530 inst->opcode = SHADER_OPCODE_RSQ;
2531 inst->src[0] = prev->src[0];
2532 progress = true;
2533 }
2534 }
2535 break;
2536 }
2537 default:
2538 break;
2539 }
2540
2541 /* Swap if src[0] is immediate. */
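      /* For commutative operations this moves the immediate into src[1],
       * so the src[1]-based simplifications above can match on a later
       * pass.
       */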
2542 if (progress && inst->is_commutative()) {
2543 if (inst->src[0].file == IMM) {
2544 fs_reg tmp = inst->src[1];
2545 inst->src[1] = inst->src[0];
2546 inst->src[0] = tmp;
2547 }
2548 }
2549 }
2550 return progress;
2551 }
2552
2553 bool
2554 fs_visitor::opt_register_renaming()
2555 {
2556 bool progress = false;
2557 int depth = 0;
2558
2559 int remap[alloc.count];
2560 memset(remap, -1, sizeof(int) * alloc.count);
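   /* remap[r] == -1: VGRF r hasn't seen a complete write yet;
    * remap[r] == r: its first complete write was seen and kept;
    * any other value: later complete writes were renamed to a fresh VGRF.
    */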
2561
2562 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2563 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2564 depth++;
2565 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2566 inst->opcode == BRW_OPCODE_WHILE) {
2567 depth--;
2568 }
2569
2570 /* Rewrite instruction sources. */
2571 for (int i = 0; i < inst->sources; i++) {
2572 if (inst->src[i].file == GRF &&
2573 remap[inst->src[i].reg] != -1 &&
2574 remap[inst->src[i].reg] != inst->src[i].reg) {
2575 inst->src[i].reg = remap[inst->src[i].reg];
2576 progress = true;
2577 }
2578 }
2579
2580 const int dst = inst->dst.reg;
2581
2582 if (depth == 0 &&
2583 inst->dst.file == GRF &&
2584 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2585 !inst->is_partial_write()) {
2586 if (remap[dst] == -1) {
2587 remap[dst] = dst;
2588 } else {
2589 remap[dst] = alloc.allocate(inst->dst.width / 8);
2590 inst->dst.reg = remap[dst];
2591 progress = true;
2592 }
2593 } else if (inst->dst.file == GRF &&
2594 remap[dst] != -1 &&
2595 remap[dst] != dst) {
2596 inst->dst.reg = remap[dst];
2597 progress = true;
2598 }
2599 }
2600
2601 if (progress) {
2602 invalidate_live_intervals();
2603
2604 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2605 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2606 delta_x[i].reg = remap[delta_x[i].reg];
2607 }
2608 }
2609 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2610 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2611 delta_y[i].reg = remap[delta_y[i].reg];
2612 }
2613 }
2614 }
2615
2616 return progress;
2617 }
2618
2619 /**
2620 * Remove redundant or useless discard jumps.
2621 *
2622 * For example, we can eliminate jumps in the following sequence:
2623 *
2624 * discard-jump (redundant with the next jump)
2625 * discard-jump (useless; jumps to the next instruction)
2626 * placeholder-halt
2627 */
2628 bool
2629 fs_visitor::opt_redundant_discard_jumps()
2630 {
2631 bool progress = false;
2632
2633 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2634
2635 fs_inst *placeholder_halt = NULL;
2636 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2637 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2638 placeholder_halt = inst;
2639 break;
2640 }
2641 }
2642
2643 if (!placeholder_halt)
2644 return false;
2645
2646 /* Delete any HALTs immediately before the placeholder halt. */
2647 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2648 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2649 prev = (fs_inst *) placeholder_halt->prev) {
2650 prev->remove(last_bblock);
2651 progress = true;
2652 }
2653
2654 if (progress)
2655 invalidate_live_intervals();
2656
2657 return progress;
2658 }
2659
2660 bool
2661 fs_visitor::compute_to_mrf()
2662 {
2663 bool progress = false;
2664 int next_ip = 0;
2665
2666 /* No MRFs on Gen >= 7. */
2667 if (brw->gen >= 7)
2668 return false;
2669
2670 calculate_live_intervals();
2671
2672 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2673 int ip = next_ip;
2674 next_ip++;
2675
2676 if (inst->opcode != BRW_OPCODE_MOV ||
2677 inst->is_partial_write() ||
2678 inst->dst.file != MRF || inst->src[0].file != GRF ||
2679 inst->dst.type != inst->src[0].type ||
2680 inst->src[0].abs || inst->src[0].negate ||
2681 !inst->src[0].is_contiguous() ||
2682 inst->src[0].subreg_offset)
2683 continue;
2684
2685 /* Work out which hardware MRF registers are written by this
2686 * instruction.
2687 */
2688 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2689 int mrf_high;
2690 if (inst->dst.reg & BRW_MRF_COMPR4) {
2691 mrf_high = mrf_low + 4;
2692 } else if (inst->exec_size == 16) {
2693 mrf_high = mrf_low + 1;
2694 } else {
2695 mrf_high = mrf_low;
2696 }
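      /* Assumption for illustration: with BRW_MRF_COMPR4 a SIMD16 write is
       * split across MRFs m and m+4 rather than m and m+1, which is why
       * mrf_high is mrf_low + 4 above.
       */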
2697
2698 /* Can't compute-to-MRF this GRF if someone else was going to
2699 * read it later.
2700 */
2701 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2702 continue;
2703
2704 /* Found a move of a GRF to a MRF. Let's see if we can go
2705 * rewrite the thing that made this GRF to write into the MRF.
2706 */
2707 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2708 if (scan_inst->dst.file == GRF &&
2709 scan_inst->dst.reg == inst->src[0].reg) {
2710 /* Found the last thing to write our reg we want to turn
2711 * into a compute-to-MRF.
2712 */
2713
2714 /* If this one instruction didn't populate all the
2715 * channels, bail. We might be able to rewrite everything
2716 * that writes that reg, but it would require smarter
2717 * tracking to delay the rewriting until complete success.
2718 */
2719 if (scan_inst->is_partial_write())
2720 break;
2721
2722 /* Things returning more than one register would need us to
2723 * understand coalescing out more than one MOV at a time.
2724 */
2725 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2726 break;
2727
2728 /* SEND instructions can't have MRF as a destination. */
2729 if (scan_inst->mlen)
2730 break;
2731
2732 if (brw->gen == 6) {
2733 /* gen6 math instructions must have the destination be
2734 * GRF, so no compute-to-MRF for them.
2735 */
2736 if (scan_inst->is_math()) {
2737 break;
2738 }
2739 }
2740
2741 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2742 /* Found the creator of our MRF's source value. */
2743 scan_inst->dst.file = MRF;
2744 scan_inst->dst.reg = inst->dst.reg;
2745 scan_inst->saturate |= inst->saturate;
2746 inst->remove(block);
2747 progress = true;
2748 }
2749 break;
2750 }
2751
2752       /* We don't handle control flow here.  Most computation of
2753        * values that end up in MRFs happens shortly before the MRF
2754        * write anyway.
2755 */
2756 if (block->start() == scan_inst)
2757 break;
2758
2759 /* You can't read from an MRF, so if someone else reads our
2760 * MRF's source GRF that we wanted to rewrite, that stops us.
2761 */
2762 bool interfered = false;
2763 for (int i = 0; i < scan_inst->sources; i++) {
2764 if (scan_inst->src[i].file == GRF &&
2765 scan_inst->src[i].reg == inst->src[0].reg &&
2766 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2767 interfered = true;
2768 }
2769 }
2770 if (interfered)
2771 break;
2772
2773 if (scan_inst->dst.file == MRF) {
2774 /* If somebody else writes our MRF here, we can't
2775 * compute-to-MRF before that.
2776 */
2777 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2778 int scan_mrf_high;
2779
2780 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2781 scan_mrf_high = scan_mrf_low + 4;
2782 } else if (scan_inst->exec_size == 16) {
2783 scan_mrf_high = scan_mrf_low + 1;
2784 } else {
2785 scan_mrf_high = scan_mrf_low;
2786 }
2787
2788 if (mrf_low == scan_mrf_low ||
2789 mrf_low == scan_mrf_high ||
2790 mrf_high == scan_mrf_low ||
2791 mrf_high == scan_mrf_high) {
2792 break;
2793 }
2794 }
2795
2796 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2797 /* Found a SEND instruction, which means that there are
2798 * live values in MRFs from base_mrf to base_mrf +
2799 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2800 * above it.
2801 */
2802 if (mrf_low >= scan_inst->base_mrf &&
2803 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2804 break;
2805 }
2806 if (mrf_high >= scan_inst->base_mrf &&
2807 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2808 break;
2809 }
2810 }
2811 }
2812 }
2813
2814 if (progress)
2815 invalidate_live_intervals();
2816
2817 return progress;
2818 }
2819
2820 /**
2821 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2822 * instructions to FS_OPCODE_REP_FB_WRITE.
2823 */
2824 void
2825 fs_visitor::emit_repclear_shader()
2826 {
2827 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2828 int base_mrf = 1;
2829 int color_mrf = base_mrf + 2;
2830
2831 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2832 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2833 mov->force_writemask_all = true;
2834
2835 fs_inst *write;
2836 if (key->nr_color_regions == 1) {
2837 write = emit(FS_OPCODE_REP_FB_WRITE);
2838 write->saturate = key->clamp_fragment_color;
2839 write->base_mrf = color_mrf;
2840 write->target = 0;
2841 write->header_present = false;
2842 write->mlen = 1;
2843 } else {
2844 assume(key->nr_color_regions > 0);
2845 for (int i = 0; i < key->nr_color_regions; ++i) {
2846 write = emit(FS_OPCODE_REP_FB_WRITE);
2847 write->saturate = key->clamp_fragment_color;
2848 write->base_mrf = base_mrf;
2849 write->target = i;
2850 write->header_present = true;
2851 write->mlen = 3;
2852 }
2853 }
2854 write->eot = true;
2855
2856 calculate_cfg();
2857
2858 assign_constant_locations();
2859 assign_curb_setup();
2860
2861 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2862 assert(mov->src[0].file == HW_REG);
2863 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2864 }
2865
2866 /**
2867 * Walks through basic blocks, looking for repeated MRF writes and
2868 * removing the later ones.
2869 */
2870 bool
2871 fs_visitor::remove_duplicate_mrf_writes()
2872 {
2873 fs_inst *last_mrf_move[16];
2874 bool progress = false;
2875
2876 /* Need to update the MRF tracking for compressed instructions. */
2877 if (dispatch_width == 16)
2878 return false;
2879
2880 memset(last_mrf_move, 0, sizeof(last_mrf_move));
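   /* last_mrf_move[m] caches the most recent full (non-partial) MOV of a
    * GRF into MRF m, or NULL once that value can no longer be trusted
    * (control flow, the MRF or its source GRF being overwritten, or
    * implied SEND writes).
    */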
2881
2882 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2883 if (inst->is_control_flow()) {
2884 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2885 }
2886
2887 if (inst->opcode == BRW_OPCODE_MOV &&
2888 inst->dst.file == MRF) {
2889 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2890 if (prev_inst && inst->equals(prev_inst)) {
2891 inst->remove(block);
2892 progress = true;
2893 continue;
2894 }
2895 }
2896
2897 /* Clear out the last-write records for MRFs that were overwritten. */
2898 if (inst->dst.file == MRF) {
2899 last_mrf_move[inst->dst.reg] = NULL;
2900 }
2901
2902 if (inst->mlen > 0 && inst->base_mrf != -1) {
2903 /* Found a SEND instruction, which will include two or fewer
2904 * implied MRF writes. We could do better here.
2905 */
2906 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2907 last_mrf_move[inst->base_mrf + i] = NULL;
2908 }
2909 }
2910
2911 /* Clear out any MRF move records whose sources got overwritten. */
2912 if (inst->dst.file == GRF) {
2913 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2914 if (last_mrf_move[i] &&
2915 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2916 last_mrf_move[i] = NULL;
2917 }
2918 }
2919 }
2920
2921 if (inst->opcode == BRW_OPCODE_MOV &&
2922 inst->dst.file == MRF &&
2923 inst->src[0].file == GRF &&
2924 !inst->is_partial_write()) {
2925 last_mrf_move[inst->dst.reg] = inst;
2926 }
2927 }
2928
2929 if (progress)
2930 invalidate_live_intervals();
2931
2932 return progress;
2933 }
2934
2935 static void
2936 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2937 {
2938 /* Clear the flag for registers that actually got read (as expected). */
2939 for (int i = 0; i < inst->sources; i++) {
2940 int grf;
2941 if (inst->src[i].file == GRF) {
2942 grf = inst->src[i].reg;
2943 } else if (inst->src[i].file == HW_REG &&
2944 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2945 grf = inst->src[i].fixed_hw_reg.nr;
2946 } else {
2947 continue;
2948 }
2949
2950 if (grf >= first_grf &&
2951 grf < first_grf + grf_len) {
2952 deps[grf - first_grf] = false;
2953 if (inst->exec_size == 16)
2954 deps[grf - first_grf + 1] = false;
2955 }
2956 }
2957 }
2958
2959 /**
2960 * Implements this workaround for the original 965:
2961 *
2962 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2963 * check for post destination dependencies on this instruction, software
2964 * must ensure that there is no destination hazard for the case of ‘write
2965 * followed by a posted write’ shown in the following example.
2966 *
2967 * 1. mov r3 0
2968 * 2. send r3.xy <rest of send instruction>
2969 * 3. mov r2 r3
2970 *
2971 * Due to no post-destination dependency check on the ‘send’, the above
2972 * code sequence could have two instructions (1 and 2) in flight at the
2973 * same time that both consider ‘r3’ as the target of their final writes.
2974 */
2975 void
2976 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2977 fs_inst *inst)
2978 {
2979 int write_len = inst->regs_written;
2980 int first_write_grf = inst->dst.reg;
2981 bool needs_dep[BRW_MAX_MRF];
2982 assert(write_len < (int)sizeof(needs_dep) - 1);
2983
2984 memset(needs_dep, false, sizeof(needs_dep));
2985 memset(needs_dep, true, write_len);
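   /* needs_dep[i] tracks whether GRF first_write_grf + i still needs a
    * dependency-resolving read inserted before inst.
    */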
2986
2987 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2988
2989 /* Walk backwards looking for writes to registers we're writing which
2990 * aren't read since being written. If we hit the start of the program,
2991 * we assume that there are no outstanding dependencies on entry to the
2992 * program.
2993 */
2994 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2995 /* If we hit control flow, assume that there *are* outstanding
2996 * dependencies, and force their cleanup before our instruction.
2997 */
2998 if (block->start() == scan_inst) {
2999 for (int i = 0; i < write_len; i++) {
3000 if (needs_dep[i]) {
3001 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3002 }
3003 }
3004 return;
3005 }
3006
3007 /* We insert our reads as late as possible on the assumption that any
3008 * instruction but a MOV that might have left us an outstanding
3009 * dependency has more latency than a MOV.
3010 */
3011 if (scan_inst->dst.file == GRF) {
3012 for (int i = 0; i < scan_inst->regs_written; i++) {
3013 int reg = scan_inst->dst.reg + i;
3014
3015 if (reg >= first_write_grf &&
3016 reg < first_write_grf + write_len &&
3017 needs_dep[reg - first_write_grf]) {
3018 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3019 needs_dep[reg - first_write_grf] = false;
3020 if (scan_inst->exec_size == 16)
3021 needs_dep[reg - first_write_grf + 1] = false;
3022 }
3023 }
3024 }
3025
3026 /* Clear the flag for registers that actually got read (as expected). */
3027 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3028
3029 /* Continue the loop only if we haven't resolved all the dependencies */
3030 int i;
3031 for (i = 0; i < write_len; i++) {
3032 if (needs_dep[i])
3033 break;
3034 }
3035 if (i == write_len)
3036 return;
3037 }
3038 }
3039
3040 /**
3041 * Implements this workaround for the original 965:
3042 *
3043 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3044 * used as a destination register until after it has been sourced by an
3045  *    instruction with a different destination register."
3046 */
3047 void
3048 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3049 {
3050 int write_len = inst->regs_written;
3051 int first_write_grf = inst->dst.reg;
3052 bool needs_dep[BRW_MAX_MRF];
3053 assert(write_len < (int)sizeof(needs_dep) - 1);
3054
3055 memset(needs_dep, false, sizeof(needs_dep));
3056 memset(needs_dep, true, write_len);
3057 /* Walk forwards looking for writes to registers we're writing which aren't
3058 * read before being written.
3059 */
3060 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3061 /* If we hit control flow, force resolve all remaining dependencies. */
3062 if (block->end() == scan_inst) {
3063 for (int i = 0; i < write_len; i++) {
3064 if (needs_dep[i])
3065 scan_inst->insert_before(block,
3066 DEP_RESOLVE_MOV(first_write_grf + i));
3067 }
3068 return;
3069 }
3070
3071 /* Clear the flag for registers that actually got read (as expected). */
3072 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3073
3074 /* We insert our reads as late as possible since they're reading the
3075 * result of a SEND, which has massive latency.
3076 */
3077 if (scan_inst->dst.file == GRF &&
3078 scan_inst->dst.reg >= first_write_grf &&
3079 scan_inst->dst.reg < first_write_grf + write_len &&
3080 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3081 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3082 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3083 }
3084
3085 /* Continue the loop only if we haven't resolved all the dependencies */
3086 int i;
3087 for (i = 0; i < write_len; i++) {
3088 if (needs_dep[i])
3089 break;
3090 }
3091 if (i == write_len)
3092 return;
3093 }
3094 }
3095
3096 void
3097 fs_visitor::insert_gen4_send_dependency_workarounds()
3098 {
3099 if (brw->gen != 4 || brw->is_g4x)
3100 return;
3101
3102 bool progress = false;
3103
3104 /* Note that we're done with register allocation, so GRF fs_regs always
3105 * have a .reg_offset of 0.
3106 */
3107
3108 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3109 if (inst->mlen != 0 && inst->dst.file == GRF) {
3110 insert_gen4_pre_send_dependency_workarounds(block, inst);
3111 insert_gen4_post_send_dependency_workarounds(block, inst);
3112 progress = true;
3113 }
3114 }
3115
3116 if (progress)
3117 invalidate_live_intervals();
3118 }
3119
3120 /**
3121 * Turns the generic expression-style uniform pull constant load instruction
3122 * into a hardware-specific series of instructions for loading a pull
3123 * constant.
3124 *
3125 * The expression style allows the CSE pass before this to optimize out
3126 * repeated loads from the same offset, and gives the pre-register-allocation
3127 * scheduling full flexibility, while the conversion to native instructions
3128 * allows the post-register-allocation scheduler the best information
3129 * possible.
3130 *
3131 * Note that execution masking for setting up pull constant loads is special:
3132 * the channels that need to be written are unrelated to the current execution
3133 * mask, since a later instruction will use one of the result channels as a
3134 * source operand for all 8 or 16 of its channels.
3135 */
3136 void
3137 fs_visitor::lower_uniform_pull_constant_loads()
3138 {
3139 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3140 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3141 continue;
3142
3143 if (brw->gen >= 7) {
3144 /* The offset arg before was a vec4-aligned byte offset. We need to
3145 * turn it into a dword offset.
3146 */
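         /* For example, a byte offset of 32 becomes a dword offset of 8. */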
3147 fs_reg const_offset_reg = inst->src[1];
3148 assert(const_offset_reg.file == IMM &&
3149 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3150 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3151 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3152
3153 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3154 * Reserve space for the register.
3155 */
3156 if (brw->gen >= 9) {
3157 payload.reg_offset++;
3158 alloc.sizes[payload.reg] = 2;
3159 }
3160
3161 /* This is actually going to be a MOV, but since only the first dword
3162 * is accessed, we have a special opcode to do just that one. Note
3163 * that this needs to be an operation that will be considered a def
3164 * by live variable analysis, or register allocation will explode.
3165 */
3166 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3167 8, payload, const_offset_reg);
3168 setup->force_writemask_all = true;
3169
3170 setup->ir = inst->ir;
3171 setup->annotation = inst->annotation;
3172 inst->insert_before(block, setup);
3173
3174 /* Similarly, this will only populate the first 4 channels of the
3175 * result register (since we only use smear values from 0-3), but we
3176 * don't tell the optimizer.
3177 */
3178 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3179 inst->src[1] = payload;
3180
3181 invalidate_live_intervals();
3182 } else {
3183 /* Before register allocation, we didn't tell the scheduler about the
3184 * MRF we use. We know it's safe to use this MRF because nothing
3185 * else does except for register spill/unspill, which generates and
3186 * uses its MRF within a single IR instruction.
3187 */
3188 inst->base_mrf = 14;
3189 inst->mlen = 1;
3190 }
3191 }
3192 }
3193
3194 bool
3195 fs_visitor::lower_load_payload()
3196 {
3197 bool progress = false;
3198
3199 int vgrf_to_reg[alloc.count];
3200 int reg_count = 0;
3201 for (unsigned i = 0; i < alloc.count; ++i) {
3202 vgrf_to_reg[i] = reg_count;
3203 reg_count += alloc.sizes[i];
3204 }
3205
3206 struct {
3207 bool written:1; /* Whether this register has ever been written */
3208 bool force_writemask_all:1;
3209 bool force_sechalf:1;
3210 } metadata[reg_count];
3211 memset(metadata, 0, sizeof(metadata));
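   /* metadata[] records, per physical VGRF slot, whether it has been
    * written and with which execution controls, so the MOVs emitted below
    * can inherit matching force_sechalf / force_writemask_all flags from
    * their sources.
    */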
3212
3213 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3214 if (inst->dst.file == GRF) {
3215 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3216 bool force_sechalf = inst->force_sechalf &&
3217 !inst->force_writemask_all;
3218 bool toggle_sechalf = inst->dst.width == 16 &&
3219 type_sz(inst->dst.type) == 4 &&
3220 !inst->force_writemask_all;
3221 for (int i = 0; i < inst->regs_written; ++i) {
3222 metadata[dst_reg + i].written = true;
3223 metadata[dst_reg + i].force_sechalf = force_sechalf;
3224 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3225 force_sechalf = (toggle_sechalf != force_sechalf);
3226 }
3227 }
3228
3229 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3230 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3231 fs_reg dst = inst->dst;
3232
3233 for (int i = 0; i < inst->sources; i++) {
3234 dst.width = inst->src[i].effective_width;
3235 dst.type = inst->src[i].type;
3236
3237 if (inst->src[i].file == BAD_FILE) {
3238 /* Do nothing but otherwise increment as normal */
3239 } else if (dst.file == MRF &&
3240 dst.width == 8 &&
3241 brw->has_compr4 &&
3242 i + 4 < inst->sources &&
3243 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3244 fs_reg compr4_dst = dst;
3245 compr4_dst.reg += BRW_MRF_COMPR4;
3246 compr4_dst.width = 16;
3247 fs_reg compr4_src = inst->src[i];
3248 compr4_src.width = 16;
3249 fs_inst *mov = MOV(compr4_dst, compr4_src);
3250 mov->force_writemask_all = true;
3251 inst->insert_before(block, mov);
3252 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3253 inst->src[i + 4].file = BAD_FILE;
3254 } else {
3255 fs_inst *mov = MOV(dst, inst->src[i]);
3256 if (inst->src[i].file == GRF) {
3257 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3258 inst->src[i].reg_offset;
3259 mov->force_sechalf = metadata[src_reg].force_sechalf;
3260 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3261 } else {
3262 /* We don't have any useful metadata for immediates or
3263 * uniforms. Assume that any of the channels of the
3264 * destination may be used.
3265 */
3266 assert(inst->src[i].file == IMM ||
3267 inst->src[i].file == UNIFORM);
3268 mov->force_writemask_all = true;
3269 }
3270
3271 if (dst.file == GRF) {
3272 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3273 const bool force_writemask = mov->force_writemask_all;
3274 metadata[dst_reg].force_writemask_all = force_writemask;
3275 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3276 if (dst.width * type_sz(dst.type) > 32) {
3277 assert(!mov->force_sechalf);
3278 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3279 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3280 }
3281 }
3282
3283 inst->insert_before(block, mov);
3284 }
3285
3286 dst = offset(dst, 1);
3287 }
3288
3289 inst->remove(block);
3290 progress = true;
3291 }
3292 }
3293
3294 if (progress)
3295 invalidate_live_intervals();
3296
3297 return progress;
3298 }
3299
3300 void
3301 fs_visitor::dump_instructions()
3302 {
3303 dump_instructions(NULL);
3304 }
3305
3306 void
3307 fs_visitor::dump_instructions(const char *name)
3308 {
3309 FILE *file = stderr;
3310 if (name && geteuid() != 0) {
3311 file = fopen(name, "w");
3312 if (!file)
3313 file = stderr;
3314 }
3315
3316 if (cfg) {
3317 calculate_register_pressure();
3318 int ip = 0, max_pressure = 0;
3319 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3320 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3321 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3322 dump_instruction(inst, file);
3323 ip++;
3324 }
3325 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3326 } else {
3327 int ip = 0;
3328 foreach_in_list(backend_instruction, inst, &instructions) {
3329 fprintf(file, "%4d: ", ip++);
3330 dump_instruction(inst, file);
3331 }
3332 }
3333
3334 if (file != stderr) {
3335 fclose(file);
3336 }
3337 }
3338
3339 void
3340 fs_visitor::dump_instruction(backend_instruction *be_inst)
3341 {
3342 dump_instruction(be_inst, stderr);
3343 }
3344
3345 void
3346 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3347 {
3348 fs_inst *inst = (fs_inst *)be_inst;
3349
3350 if (inst->predicate) {
3351 fprintf(file, "(%cf0.%d) ",
3352 inst->predicate_inverse ? '-' : '+',
3353 inst->flag_subreg);
3354 }
3355
3356 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3357 if (inst->saturate)
3358 fprintf(file, ".sat");
3359 if (inst->conditional_mod) {
3360 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3361 if (!inst->predicate &&
3362 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3363 inst->opcode != BRW_OPCODE_IF &&
3364 inst->opcode != BRW_OPCODE_WHILE))) {
3365 fprintf(file, ".f0.%d", inst->flag_subreg);
3366 }
3367 }
3368 fprintf(file, "(%d) ", inst->exec_size);
3369
3370
3371 switch (inst->dst.file) {
3372 case GRF:
3373 fprintf(file, "vgrf%d", inst->dst.reg);
3374 if (inst->dst.width != dispatch_width)
3375 fprintf(file, "@%d", inst->dst.width);
3376 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3377 inst->dst.subreg_offset)
3378 fprintf(file, "+%d.%d",
3379 inst->dst.reg_offset, inst->dst.subreg_offset);
3380 break;
3381 case MRF:
3382 fprintf(file, "m%d", inst->dst.reg);
3383 break;
3384 case BAD_FILE:
3385 fprintf(file, "(null)");
3386 break;
3387 case UNIFORM:
3388 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3389 break;
3390 case ATTR:
3391 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3392 break;
3393 case HW_REG:
3394 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3395 switch (inst->dst.fixed_hw_reg.nr) {
3396 case BRW_ARF_NULL:
3397 fprintf(file, "null");
3398 break;
3399 case BRW_ARF_ADDRESS:
3400 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3401 break;
3402 case BRW_ARF_ACCUMULATOR:
3403 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3404 break;
3405 case BRW_ARF_FLAG:
3406 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3407 inst->dst.fixed_hw_reg.subnr);
3408 break;
3409 default:
3410 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3411 inst->dst.fixed_hw_reg.subnr);
3412 break;
3413 }
3414 } else {
3415 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3416 }
3417 if (inst->dst.fixed_hw_reg.subnr)
3418 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3419 break;
3420 default:
3421 fprintf(file, "???");
3422 break;
3423 }
3424 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3425
3426 for (int i = 0; i < inst->sources; i++) {
3427 if (inst->src[i].negate)
3428 fprintf(file, "-");
3429 if (inst->src[i].abs)
3430 fprintf(file, "|");
3431 switch (inst->src[i].file) {
3432 case GRF:
3433 fprintf(file, "vgrf%d", inst->src[i].reg);
3434 if (inst->src[i].width != dispatch_width)
3435 fprintf(file, "@%d", inst->src[i].width);
3436 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3437 inst->src[i].subreg_offset)
3438 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3439 inst->src[i].subreg_offset);
3440 break;
3441 case MRF:
3442 fprintf(file, "***m%d***", inst->src[i].reg);
3443 break;
3444 case ATTR:
3445 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3446 break;
3447 case UNIFORM:
3448 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3449 if (inst->src[i].reladdr) {
3450 fprintf(file, "+reladdr");
3451 } else if (inst->src[i].subreg_offset) {
3452 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3453 inst->src[i].subreg_offset);
3454 }
3455 break;
3456 case BAD_FILE:
3457 fprintf(file, "(null)");
3458 break;
3459 case IMM:
3460 switch (inst->src[i].type) {
3461 case BRW_REGISTER_TYPE_F:
3462 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3463 break;
3464 case BRW_REGISTER_TYPE_W:
3465 case BRW_REGISTER_TYPE_D:
3466 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3467 break;
3468 case BRW_REGISTER_TYPE_UW:
3469 case BRW_REGISTER_TYPE_UD:
3470 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3471 break;
3472 case BRW_REGISTER_TYPE_VF:
3473 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3474 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3475 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3476 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3477 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3478 break;
3479 default:
3480 fprintf(file, "???");
3481 break;
3482 }
3483 break;
3484 case HW_REG:
3485 if (inst->src[i].fixed_hw_reg.negate)
3486 fprintf(file, "-");
3487 if (inst->src[i].fixed_hw_reg.abs)
3488 fprintf(file, "|");
3489 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3490 switch (inst->src[i].fixed_hw_reg.nr) {
3491 case BRW_ARF_NULL:
3492 fprintf(file, "null");
3493 break;
3494 case BRW_ARF_ADDRESS:
3495 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3496 break;
3497 case BRW_ARF_ACCUMULATOR:
3498 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3499 break;
3500 case BRW_ARF_FLAG:
3501 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3502 inst->src[i].fixed_hw_reg.subnr);
3503 break;
3504 default:
3505 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3506 inst->src[i].fixed_hw_reg.subnr);
3507 break;
3508 }
3509 } else {
3510 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3511 }
3512 if (inst->src[i].fixed_hw_reg.subnr)
3513 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3514 if (inst->src[i].fixed_hw_reg.abs)
3515 fprintf(file, "|");
3516 break;
3517 default:
3518 fprintf(file, "???");
3519 break;
3520 }
3521 if (inst->src[i].abs)
3522 fprintf(file, "|");
3523
3524 if (inst->src[i].file != IMM) {
3525 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3526 }
3527
3528 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3529 fprintf(file, ", ");
3530 }
3531
3532 fprintf(file, " ");
3533
3534 if (dispatch_width == 16 && inst->exec_size == 8) {
3535 if (inst->force_sechalf)
3536 fprintf(file, "2ndhalf ");
3537 else
3538 fprintf(file, "1sthalf ");
3539 }
3540
3541 fprintf(file, "\n");
3542 }
3543
3544 /**
3545 * Possibly returns an instruction that set up @param reg.
3546 *
3547 * Sometimes we want to take the result of some expression/variable
3548 * dereference tree and rewrite the instruction generating the result
3549 * of the tree. When processing the tree, we know that the
3550 * instructions generated are all writing temporaries that are dead
3551 * outside of this tree. So, if we have some instructions that write
3552 * a temporary, we're free to point that temp write somewhere else.
3553 *
3554  * Note that this doesn't guarantee that the returned instruction only
3555  * generated reg -- it might be the size=4 destination of a texture instruction.
3556 */
3557 fs_inst *
3558 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3559 fs_inst *end,
3560 const fs_reg &reg)
3561 {
3562 if (end == start ||
3563 end->is_partial_write() ||
3564 reg.reladdr ||
3565 !reg.equals(end->dst)) {
3566 return NULL;
3567 } else {
3568 return end;
3569 }
3570 }
3571
3572 void
3573 fs_visitor::setup_payload_gen6()
3574 {
3575 bool uses_depth =
3576 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3577 unsigned barycentric_interp_modes =
3578 (stage == MESA_SHADER_FRAGMENT) ?
3579 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3580
3581 assert(brw->gen >= 6);
3582
3583 /* R0-1: masks, pixel X/Y coordinates. */
3584 payload.num_regs = 2;
3585    /* R2: only for 32-pixel dispatch. */
3586
3587 /* R3-26: barycentric interpolation coordinates. These appear in the
3588 * same order that they appear in the brw_wm_barycentric_interp_mode
3589 * enum. Each set of coordinates occupies 2 registers if dispatch width
3590 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3591 * appear if they were enabled using the "Barycentric Interpolation
3592 * Mode" bits in WM_STATE.
3593 */
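   /* For example, with two barycentric modes enabled in SIMD16 this loop
    * advances payload.num_regs by 8 (4 registers per mode).
    */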
3594 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3595 if (barycentric_interp_modes & (1 << i)) {
3596 payload.barycentric_coord_reg[i] = payload.num_regs;
3597 payload.num_regs += 2;
3598 if (dispatch_width == 16) {
3599 payload.num_regs += 2;
3600 }
3601 }
3602 }
3603
3604 /* R27: interpolated depth if uses source depth */
3605 if (uses_depth) {
3606 payload.source_depth_reg = payload.num_regs;
3607 payload.num_regs++;
3608 if (dispatch_width == 16) {
3609 /* R28: interpolated depth if not SIMD8. */
3610 payload.num_regs++;
3611 }
3612 }
3613 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3614 if (uses_depth) {
3615 payload.source_w_reg = payload.num_regs;
3616 payload.num_regs++;
3617 if (dispatch_width == 16) {
3618 /* R30: interpolated W if not SIMD8. */
3619 payload.num_regs++;
3620 }
3621 }
3622
3623 if (stage == MESA_SHADER_FRAGMENT) {
3624 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3625 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3626 prog_data->uses_pos_offset = key->compute_pos_offset;
3627 /* R31: MSAA position offsets. */
3628 if (prog_data->uses_pos_offset) {
3629 payload.sample_pos_reg = payload.num_regs;
3630 payload.num_regs++;
3631 }
3632 }
3633
3634 /* R32: MSAA input coverage mask */
3635 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3636 assert(brw->gen >= 7);
3637 payload.sample_mask_in_reg = payload.num_regs;
3638 payload.num_regs++;
3639 if (dispatch_width == 16) {
3640 /* R33: input coverage mask if not SIMD8. */
3641 payload.num_regs++;
3642 }
3643 }
3644
3645 /* R34-: bary for 32-pixel. */
3646 /* R58-59: interp W for 32-pixel. */
3647
3648 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3649 source_depth_to_render_target = true;
3650 }
3651 }
3652
3653 void
3654 fs_visitor::setup_vs_payload()
3655 {
3656 /* R0: thread header, R1: urb handles */
3657 payload.num_regs = 2;
3658 }
3659
3660 void
3661 fs_visitor::assign_binding_table_offsets()
3662 {
3663 assert(stage == MESA_SHADER_FRAGMENT);
3664 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3665 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3666 uint32_t next_binding_table_offset = 0;
3667
3668 /* If there are no color regions, we still perform an FB write to a null
3669 * renderbuffer, which we place at surface index 0.
3670 */
3671 prog_data->binding_table.render_target_start = next_binding_table_offset;
3672 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3673
3674 assign_common_binding_table_offsets(next_binding_table_offset);
3675 }
3676
3677 void
3678 fs_visitor::calculate_register_pressure()
3679 {
3680 invalidate_live_intervals();
3681 calculate_live_intervals();
3682
3683 unsigned num_instructions = 0;
3684 foreach_block(block, cfg)
3685 num_instructions += block->instructions.length();
3686
3687 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3688
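   /* regs_live_at_ip[ip] accumulates the total VGRF size (in registers)
    * live at instruction ip; e.g. two live 2-register VGRFs give a
    * pressure of 4.
    */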
3689 for (unsigned reg = 0; reg < alloc.count; reg++) {
3690 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3691 regs_live_at_ip[ip] += alloc.sizes[reg];
3692 }
3693 }
3694
3695 void
3696 fs_visitor::optimize()
3697 {
3698 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3699
3700 split_virtual_grfs();
3701
3702 move_uniform_array_access_to_pull_constants();
3703 assign_constant_locations();
3704 demote_pull_constants();
3705
3706 #define OPT(pass, args...) ({ \
3707 pass_num++; \
3708 bool this_progress = pass(args); \
3709 \
3710 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3711 char filename[64]; \
3712 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3713 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3714 \
3715 backend_visitor::dump_instructions(filename); \
3716 } \
3717 \
3718 progress = progress || this_progress; \
3719 this_progress; \
3720 })
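/* Each OPT() invocation bumps pass_num and, when DEBUG_OPTIMIZER is set in
 * INTEL_DEBUG and the pass made progress, dumps the instruction list to a
 * file named from the format above, e.g. something like
 * "fs8-0003-01-03-opt_cse" (stage, dispatch width, program name, iteration,
 * pass number). The macro evaluates to this_progress, so it can also gate
 * follow-up work, as in the lower_load_payload block below.
 */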
3721
3722 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3723 char filename[64];
3724 snprintf(filename, 64, "%s%d-%04d-00-start",
3725 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3726
3727 backend_visitor::dump_instructions(filename);
3728 }
3729
3730 bool progress;
3731 int iteration = 0;
3732 int pass_num = 0;
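/* Iterate to a fixed point: one pass can expose work for another (e.g. copy
 * propagation can leave a MOV with an unread result that dead code
 * elimination then removes, which may in turn unlock further CSE), so the
 * whole list is re-run until an iteration reports no progress.
 */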
3733 do {
3734 progress = false;
3735 pass_num = 0;
3736 iteration++;
3737
3738 OPT(remove_duplicate_mrf_writes);
3739
3740 OPT(opt_algebraic);
3741 OPT(opt_cse);
3742 OPT(opt_copy_propagate);
3743 OPT(opt_peephole_predicated_break);
3744 OPT(opt_cmod_propagation);
3745 OPT(dead_code_eliminate);
3746 OPT(opt_peephole_sel);
3747 OPT(dead_control_flow_eliminate, this);
3748 OPT(opt_register_renaming);
3749 OPT(opt_redundant_discard_jumps);
3750 OPT(opt_saturate_propagation);
3751 OPT(register_coalesce);
3752 OPT(compute_to_mrf);
3753
3754 OPT(compact_virtual_grfs);
3755 } while (progress);
3756
3757 pass_num = 0;
3758
3759 if (OPT(lower_load_payload)) {
3760 split_virtual_grfs();
3761 OPT(register_coalesce);
3762 OPT(compute_to_mrf);
3763 OPT(dead_code_eliminate);
3764 }
3765
3766 OPT(opt_combine_constants);
3767
3768 lower_uniform_pull_constant_loads();
3769 }
3770
3771 /**
3772 * Three-source instructions must have a GRF/MRF destination register.
3773 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3774 */
3775 void
3776 fs_visitor::fixup_3src_null_dest()
3777 {
3778 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3779 if (inst->is_3src() && inst->dst.is_null()) {
3780 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3781 inst->dst.type);
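/* dispatch_width / 8 allocates one GRF at SIMD8 and two at SIMD16: just
 * enough for a full-width scratch destination that nothing ever reads.
 */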
3782 }
3783 }
3784 }
3785
3786 void
3787 fs_visitor::allocate_registers()
3788 {
3789 bool allocated_without_spills;
3790
3791 static const enum instruction_scheduler_mode pre_modes[] = {
3792 SCHEDULE_PRE,
3793 SCHEDULE_PRE_NON_LIFO,
3794 SCHEDULE_PRE_LIFO,
3795 };
3796
3797 /* Try each scheduling heuristic to see if it lets register allocation
3798 * succeed without spilling. The heuristics are ordered by decreasing
3799 * expected performance but increasing likelihood of allocating successfully.
3800 */
3801 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3802 schedule_instructions(pre_modes[i]);
3803
3804 if (0) {
3805 assign_regs_trivial();
3806 allocated_without_spills = true;
3807 } else {
3808 allocated_without_spills = assign_regs(false);
3809 }
3810 if (allocated_without_spills)
3811 break;
3812 }
3813
3814 if (!allocated_without_spills) {
3815 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3816 "Vertex" : "Fragment";
3817
3818 /* We assume that any spilling is worse than just dropping back to
3819 * SIMD8. In practice there is probably some intermediate point where
3820 * SIMD16 with a couple of spills is still better.
3821 */
3822 if (dispatch_width == 16) {
3823 fail("Failure to register allocate. Reduce number of "
3824 "live scalar values to avoid this.");
3825 } else {
3826 perf_debug("%s shader triggered register spilling. "
3827 "Try reducing the number of live scalar values to "
3828 "improve performance.\n", stage_name);
3829 }
3830
3831 /* Since we're out of heuristics, just go spill registers until we
3832 * get an allocation.
3833 */
3834 while (!assign_regs(true)) {
3835 if (failed)
3836 break;
3837 }
3838 }
3839
3840 /* This must come after all optimization and register allocation, since
3841 * it inserts dead code that happens to have side effects, and it does
3842 * so based on the actual physical registers in use.
3843 */
3844 insert_gen4_send_dependency_workarounds();
3845
3846 if (failed)
3847 return;
3848
3849 if (!allocated_without_spills)
3850 schedule_instructions(SCHEDULE_POST);
3851
3852 if (last_scratch > 0)
3853 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3854 }
3855
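/* Helper for boolean environment toggles such as INTEL_USE_NIR below:
 * "1", "true" and "yes" (the latter two case-insensitively) enable, "0",
 * "false" and "no" disable, and anything else, or an unset variable, falls
 * back to default_value.
 */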
3856 static bool
3857 env_var_as_boolean(const char *var_name, bool default_value)
3858 {
3859 const char *str = getenv(var_name);
3860 if (str == NULL)
3861 return default_value;
3862
3863 if (strcmp(str, "1") == 0 ||
3864 strcasecmp(str, "true") == 0 ||
3865 strcasecmp(str, "yes") == 0) {
3866 return true;
3867 } else if (strcmp(str, "0") == 0 ||
3868 strcasecmp(str, "false") == 0 ||
3869 strcasecmp(str, "no") == 0) {
3870 return false;
3871 } else {
3872 return default_value;
3873 }
3874 }
3875
3876 bool
3877 fs_visitor::run_vs()
3878 {
3879 assert(stage == MESA_SHADER_VERTEX);
3880
3881 assign_common_binding_table_offsets(0);
3882 setup_vs_payload();
3883
3884 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3885 emit_shader_time_begin();
3886
3887 if (env_var_as_boolean("INTEL_USE_NIR", false)) {
3888 emit_nir_code();
3889 } else {
3890 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3891 base_ir = ir;
3892 this->result = reg_undef;
3893 ir->accept(this);
3894 }
3895 base_ir = NULL;
3896 }
3897
3898 if (failed)
3899 return false;
3900
3901 emit_urb_writes();
3902
3903 calculate_cfg();
3904
3905 optimize();
3906
3907 assign_curb_setup();
3908 assign_vs_urb_setup();
3909
3910 fixup_3src_null_dest();
3911 allocate_registers();
3912
3913 return !failed;
3914 }
3915
3916 bool
3917 fs_visitor::run_fs()
3918 {
3919 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3920 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3921
3922 assert(stage == MESA_SHADER_FRAGMENT);
3923
3924 sanity_param_count = prog->Parameters->NumParameters;
3925
3926 assign_binding_table_offsets();
3927
3928 if (brw->gen >= 6)
3929 setup_payload_gen6();
3930 else
3931 setup_payload_gen4();
3932
3933 if (0) {
3934 emit_dummy_fs();
3935 } else if (brw->use_rep_send && dispatch_width == 16) {
3936 emit_repclear_shader();
3937 } else {
3938 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3939 emit_shader_time_begin();
3940
3941 calculate_urb_setup();
3942 if (prog->InputsRead > 0) {
3943 if (brw->gen < 6)
3944 emit_interpolation_setup_gen4();
3945 else
3946 emit_interpolation_setup_gen6();
3947 }
3948
3949 /* We handle discards by keeping track of the still-live pixels in f0.1.
3950 * Initialize it with the dispatched pixels.
3951 */
3952 if (wm_prog_data->uses_kill) {
3953 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3954 discard_init->flag_subreg = 1;
3955 }
3956
3957 /* Generate FS IR for main(). (The visitor only descends into
3958 * functions called "main".)
3959 */
3960 if (shader) {
3961 if (env_var_as_boolean("INTEL_USE_NIR", false)) {
3962 emit_nir_code();
3963 } else {
3964 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3965 base_ir = ir;
3966 this->result = reg_undef;
3967 ir->accept(this);
3968 }
3969 }
3970 } else {
3971 emit_fragment_program_code();
3972 }
3973 base_ir = NULL;
3974 if (failed)
3975 return false;
3976
3977 emit(FS_OPCODE_PLACEHOLDER_HALT);
3978
3979 if (wm_key->alpha_test_func)
3980 emit_alpha_test();
3981
3982 emit_fb_writes();
3983
3984 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3985 emit_shader_time_end();
3986
3987 calculate_cfg();
3988
3989 optimize();
3990
3991 assign_curb_setup();
3992 assign_urb_setup();
3993
3994 fixup_3src_null_dest();
3995 allocate_registers();
3996
3997 if (failed)
3998 return false;
3999 }
4000
4001 if (dispatch_width == 8)
4002 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4003 else
4004 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4005
4006 /* If any state parameters were appended, then ParameterValues could have
4007 * been realloced, in which case the driver uniform storage set up by
4008 * _mesa_associate_uniform_storage() would point to freed memory. Make
4009 * sure that didn't happen.
4010 */
4011 assert(sanity_param_count == prog->Parameters->NumParameters);
4012
4013 return !failed;
4014 }
4015
4016 const unsigned *
4017 brw_wm_fs_emit(struct brw_context *brw,
4018 void *mem_ctx,
4019 const struct brw_wm_prog_key *key,
4020 struct brw_wm_prog_data *prog_data,
4021 struct gl_fragment_program *fp,
4022 struct gl_shader_program *prog,
4023 unsigned *final_assembly_size)
4024 {
4025 bool start_busy = false;
4026 double start_time = 0;
4027
4028 if (unlikely(brw->perf_debug)) {
4029 start_busy = (brw->batch.last_bo &&
4030 drm_intel_bo_busy(brw->batch.last_bo));
4031 start_time = get_time();
4032 }
4033
4034 struct brw_shader *shader = NULL;
4035 if (prog)
4036 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4037
4038 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4039 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4040
4041 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4042 */
4043 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4044 if (!v.run_fs()) {
4045 if (prog) {
4046 prog->LinkStatus = false;
4047 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4048 }
4049
4050 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4051 v.fail_msg);
4052
4053 return NULL;
4054 }
4055
4056 cfg_t *simd16_cfg = NULL;
4057 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4058 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
4059 brw->use_rep_send)) {
4060 if (!v.simd16_unsupported) {
4061 /* Try a SIMD16 compile */
4062 v2.import_uniforms(&v);
4063 if (!v2.run_fs()) {
4064 perf_debug("SIMD16 shader failed to compile, falling back to "
4065 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4066 } else {
4067 simd16_cfg = v2.cfg;
4068 }
4069 } else {
4070 perf_debug("SIMD16 shader unsupported, falling back to "
4071 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4072 }
4073 }
4074
4075 cfg_t *simd8_cfg;
4076 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4077 if (no_simd8 && simd16_cfg) {
4078 simd8_cfg = NULL;
4079 prog_data->no_8 = true;
4080 } else {
4081 simd8_cfg = v.cfg;
4082 prog_data->no_8 = false;
4083 }
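/* For example, with DEBUG_NO8 set in INTEL_DEBUG (or brw->no_simd8) and a
 * successful SIMD16 compile, only the 16-wide program is emitted and
 * prog_data->no_8 records that, so later state setup can skip the missing
 * 8-wide variant.
 */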
4084
4085 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4086 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4087
4088 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4089 char *name;
4090 if (prog)
4091 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4092 prog->Label ? prog->Label : "unnamed",
4093 prog->Name);
4094 else
4095 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4096
4097 g.enable_debug(name);
4098 }
4099
4100 if (simd8_cfg)
4101 g.generate_code(simd8_cfg, 8);
4102 if (simd16_cfg)
4103 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4104
4105 if (unlikely(brw->perf_debug) && shader) {
4106 if (shader->compiled_once)
4107 brw_wm_debug_recompile(brw, prog, key);
4108 shader->compiled_once = true;
4109
4110 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4111 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4112 (get_time() - start_time) * 1000);
4113 }
4114 }
4115
4116 return g.get_assembly(final_assembly_size);
4117 }
4118
4119 extern "C" bool
4120 brw_fs_precompile(struct gl_context *ctx,
4121 struct gl_shader_program *shader_prog,
4122 struct gl_program *prog)
4123 {
4124 struct brw_context *brw = brw_context(ctx);
4125 struct brw_wm_prog_key key;
4126
4127 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4128 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4129 bool program_uses_dfdy = fp->UsesDFdy;
4130
4131 memset(&key, 0, sizeof(key));
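/* No draw-time state is available here, so fill the key with the most
 * likely values; if the real state differs when the program is actually
 * used, a different variant is compiled then (and brw_wm_debug_recompile
 * can report why).
 */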
4132
4133 if (brw->gen < 6) {
4134 if (fp->UsesKill)
4135 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4136
4137 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4138 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4139
4140 /* Just assume depth testing. */
4141 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4142 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4143 }
4144
4145 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4146 BRW_FS_VARYING_INPUT_MASK) > 16)
4147 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4148
4149 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4150 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4151 for (unsigned i = 0; i < sampler_count; i++) {
4152 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4153 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4154 key.tex.swizzles[i] =
4155 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4156 } else {
4157 /* Color sampler: assume no swizzling. */
4158 key.tex.swizzles[i] = SWIZZLE_XYZW;
4159 }
4160 }
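/* E.g. a shadow sampler on a pre-Haswell part gets swizzle XXX1, so the
 * depth-comparison result lands in RGB with alpha forced to 1, matching the
 * default DEPTH_TEXTURE_MODE behaviour.
 */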
4161
4162 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4163 key.drawable_height = ctx->DrawBuffer->Height;
4164 }
4165
4166 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4167 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4168 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4169
4170 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4171 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4172 key.nr_color_regions > 1;
4173 }
4174
4175 key.program_string_id = bfp->id;
4176
4177 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4178 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4179
4180 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
4181
4182 brw->wm.base.prog_offset = old_prog_offset;
4183 brw->wm.prog_data = old_prog_data;
4184
4185 return success;
4186 }