1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
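/* Added worked example (not from the original source): an 8-wide float
 * destination with stride 1 covers 8 * 1 * 4 = 32 bytes, i.e. exactly one
 * 32-byte register, while a 16-wide float destination covers two.
 */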
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
379
380 return inst;
381 }
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396 * We break down the const_offset to a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402 */
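/* Added illustration of the split described above: with const_offset == 6,
 * the ADD below adds 4 (== 6 & ~3) to the varying offset, and the final MOV
 * picks out register offset (6 & 3) * scale of the vec4 result.
 */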
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (brw->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (brw->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (brw->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (brw->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
458 /* The caller always wants uncompressed to emit the minimal extra
459 * dependencies, and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return (reg.file == dst.file &&
491 reg.reg == dst.reg &&
492 reg.reg_offset >= dst.reg_offset &&
493 reg.reg_offset < dst.reg_offset + regs_written);
494 }
495
496 bool
497 fs_inst::is_send_from_grf() const
498 {
499 switch (opcode) {
500 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
501 case SHADER_OPCODE_SHADER_TIME_ADD:
502 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
503 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
504 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
505 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
506 case SHADER_OPCODE_UNTYPED_ATOMIC:
507 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
508 case SHADER_OPCODE_URB_WRITE_SIMD8:
509 return true;
510 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
511 return src[1].file == GRF;
512 case FS_OPCODE_FB_WRITE:
513 return src[0].file == GRF;
514 default:
515 if (is_tex())
516 return src[0].file == GRF;
517
518 return false;
519 }
520 }
521
522 bool
523 fs_inst::can_do_source_mods(struct brw_context *brw)
524 {
525 if (brw->gen == 6 && is_math())
526 return false;
527
528 if (is_send_from_grf())
529 return false;
530
531 if (!backend_instruction::can_do_source_mods())
532 return false;
533
534 return true;
535 }
536
537 void
538 fs_reg::init()
539 {
540 memset(this, 0, sizeof(*this));
541 stride = 1;
542 }
543
544 /** Generic unset register constructor. */
545 fs_reg::fs_reg()
546 {
547 init();
548 this->file = BAD_FILE;
549 }
550
551 /** Immediate value constructor. */
552 fs_reg::fs_reg(float f)
553 {
554 init();
555 this->file = IMM;
556 this->type = BRW_REGISTER_TYPE_F;
557 this->fixed_hw_reg.dw1.f = f;
558 this->width = 1;
559 }
560
561 /** Immediate value constructor. */
562 fs_reg::fs_reg(int32_t i)
563 {
564 init();
565 this->file = IMM;
566 this->type = BRW_REGISTER_TYPE_D;
567 this->fixed_hw_reg.dw1.d = i;
568 this->width = 1;
569 }
570
571 /** Immediate value constructor. */
572 fs_reg::fs_reg(uint32_t u)
573 {
574 init();
575 this->file = IMM;
576 this->type = BRW_REGISTER_TYPE_UD;
577 this->fixed_hw_reg.dw1.ud = u;
578 this->width = 1;
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf[4])
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
588 }
589
590 /** Vector float immediate value constructor. */
591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
592 {
593 init();
594 this->file = IMM;
595 this->type = BRW_REGISTER_TYPE_VF;
596 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
597 (vf1 << 8) |
598 (vf2 << 16) |
599 (vf3 << 24);
600 }
601
602 /** Fixed brw_reg. */
603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
604 {
605 init();
606 this->file = HW_REG;
607 this->fixed_hw_reg = fixed_hw_reg;
608 this->type = fixed_hw_reg.type;
609 this->width = 1 << fixed_hw_reg.width;
610 }
611
612 bool
613 fs_reg::equals(const fs_reg &r) const
614 {
615 return (file == r.file &&
616 reg == r.reg &&
617 reg_offset == r.reg_offset &&
618 subreg_offset == r.subreg_offset &&
619 type == r.type &&
620 negate == r.negate &&
621 abs == r.abs &&
622 !reladdr && !r.reladdr &&
623 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
624 width == r.width &&
625 stride == r.stride);
626 }
627
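/* Added note: smearing points every execution channel at one sub-register.
 * With stride forced to 0, all channels read the same type_sz-sized element;
 * get_timestamp() and emit_shader_time_end() below use this to pick out a
 * single dword of a register.
 */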
628 fs_reg &
629 fs_reg::set_smear(unsigned subreg)
630 {
631 assert(file != HW_REG && file != IMM);
632 subreg_offset = subreg * type_sz(type);
633 stride = 0;
634 return *this;
635 }
636
637 bool
638 fs_reg::is_contiguous() const
639 {
640 return stride == 1;
641 }
642
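/* Added note: returns the size of a GLSL type counted in scalar slots;
 * fs_visitor::vgrf() later multiplies this count by dispatch_width / 8 when
 * allocating virtual GRFs.
 */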
643 int
644 fs_visitor::type_size(const struct glsl_type *type)
645 {
646 unsigned int size, i;
647
648 switch (type->base_type) {
649 case GLSL_TYPE_UINT:
650 case GLSL_TYPE_INT:
651 case GLSL_TYPE_FLOAT:
652 case GLSL_TYPE_BOOL:
653 return type->components();
654 case GLSL_TYPE_ARRAY:
655 return type_size(type->fields.array) * type->length;
656 case GLSL_TYPE_STRUCT:
657 size = 0;
658 for (i = 0; i < type->length; i++) {
659 size += type_size(type->fields.structure[i].type);
660 }
661 return size;
662 case GLSL_TYPE_SAMPLER:
663 /* Samplers take up no register space, since they're baked in at
664 * link time.
665 */
666 return 0;
667 case GLSL_TYPE_ATOMIC_UINT:
668 return 0;
669 case GLSL_TYPE_IMAGE:
670 case GLSL_TYPE_VOID:
671 case GLSL_TYPE_ERROR:
672 case GLSL_TYPE_INTERFACE:
673 case GLSL_TYPE_DOUBLE:
674 unreachable("not reached");
675 }
676
677 return 0;
678 }
679
680 /**
681 * Create a MOV to read the timestamp register.
682 *
683 * The caller is responsible for emitting the MOV. The return value is
684 * the destination of the MOV, with extra parameters set.
685 */
686 fs_reg
687 fs_visitor::get_timestamp(fs_inst **out_mov)
688 {
689 assert(brw->gen >= 7);
690
691 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
692 BRW_ARF_TIMESTAMP,
693 0),
694 BRW_REGISTER_TYPE_UD));
695
696 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
697
698 fs_inst *mov = MOV(dst, ts);
699 /* We want to read the 3 fields we care about even if it's not enabled in
700 * the dispatch.
701 */
702 mov->force_writemask_all = true;
703
704 /* The caller wants the low 32 bits of the timestamp. Since it's running
705 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
706 * which is plenty of time for our purposes. It is identical across the
707 * EUs, but since it's tracking GPU core speed it will increment at a
708 * varying rate as render P-states change.
709 *
710 * The caller could also check if render P-states have changed (or anything
711 * else that might disrupt timing) by setting smear to 2 and checking if
712 * that field is != 0.
713 */
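/* Added arithmetic behind the rollover estimate above: the low 32 bits wrap
 * after 2^32 cycles, and 2^32 / ~1.2e9 Hz is roughly 3.6 seconds.
 */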
714 dst.set_smear(0);
715
716 *out_mov = mov;
717 return dst;
718 }
719
720 void
721 fs_visitor::emit_shader_time_begin()
722 {
723 current_annotation = "shader time start";
724 fs_inst *mov;
725 shader_start_time = get_timestamp(&mov);
726 emit(mov);
727 }
728
729 void
730 fs_visitor::emit_shader_time_end()
731 {
732 current_annotation = "shader time end";
733
734 enum shader_time_shader_type type, written_type, reset_type;
735 switch (stage) {
736 case MESA_SHADER_VERTEX:
737 type = ST_VS;
738 written_type = ST_VS_WRITTEN;
739 reset_type = ST_VS_RESET;
740 break;
741 case MESA_SHADER_GEOMETRY:
742 type = ST_GS;
743 written_type = ST_GS_WRITTEN;
744 reset_type = ST_GS_RESET;
745 break;
746 case MESA_SHADER_FRAGMENT:
747 if (dispatch_width == 8) {
748 type = ST_FS8;
749 written_type = ST_FS8_WRITTEN;
750 reset_type = ST_FS8_RESET;
751 } else {
752 assert(dispatch_width == 16);
753 type = ST_FS16;
754 written_type = ST_FS16_WRITTEN;
755 reset_type = ST_FS16_RESET;
756 }
757 break;
758 default:
759 unreachable("fs_visitor::emit_shader_time_end missing code");
760 }
761
762 fs_inst *tm_read;
763 fs_reg shader_end_time = get_timestamp(&tm_read);
764 emit(tm_read);
765
766 /* Check that there weren't any timestamp reset events (assuming these
767 * were the only two timestamp reads that happened).
768 */
769 fs_reg reset = shader_end_time;
770 reset.set_smear(2);
771 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
772 test->conditional_mod = BRW_CONDITIONAL_Z;
773 test->force_writemask_all = true;
774 emit(IF(BRW_PREDICATE_NORMAL));
775
776 fs_reg start = shader_start_time;
777 start.negate = true;
778 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
779 diff.set_smear(0);
780 fs_inst *add = ADD(diff, start, shader_end_time);
781 add->force_writemask_all = true;
782 emit(add);
783
784 /* If there were no instructions between the two timestamp gets, the diff
785 * is 2 cycles. Remove that overhead, so I can forget about that when
786 * trying to determine the time taken for single instructions.
787 */
788 add = ADD(diff, diff, fs_reg(-2u));
789 add->force_writemask_all = true;
790 emit(add);
791
792 emit(SHADER_TIME_ADD(type, diff));
793 emit(SHADER_TIME_ADD(written_type, fs_reg(1u)));
794 emit(BRW_OPCODE_ELSE);
795 emit(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
796 emit(BRW_OPCODE_ENDIF);
797 }
798
799 fs_inst *
800 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
801 {
802 int shader_time_index =
803 brw_get_shader_time_index(brw, shader_prog, prog, type);
804 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
805
806 fs_reg payload;
807 if (dispatch_width == 8)
808 payload = vgrf(glsl_type::uvec2_type);
809 else
810 payload = vgrf(glsl_type::uint_type);
811
812 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
813 fs_reg(), payload, offset, value);
814 }
815
816 void
817 fs_visitor::vfail(const char *format, va_list va)
818 {
819 char *msg;
820
821 if (failed)
822 return;
823
824 failed = true;
825
826 msg = ralloc_vasprintf(mem_ctx, format, va);
827 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
828
829 this->fail_msg = msg;
830
831 if (debug_enabled) {
832 fprintf(stderr, "%s", msg);
833 }
834 }
835
836 void
837 fs_visitor::fail(const char *format, ...)
838 {
839 va_list va;
840
841 va_start(va, format);
842 vfail(format, va);
843 va_end(va);
844 }
845
846 /**
847 * Mark this program as impossible to compile in SIMD16 mode.
848 *
849 * During the SIMD8 compile (which happens first), we can detect and flag
850 * things that are unsupported in SIMD16 mode, so the compiler can skip
851 * the SIMD16 compile altogether.
852 *
853 * During a SIMD16 compile (if one happens anyway), this just calls fail().
854 */
855 void
856 fs_visitor::no16(const char *format, ...)
857 {
858 va_list va;
859
860 va_start(va, format);
861
862 if (dispatch_width == 16) {
863 vfail(format, va);
864 } else {
865 simd16_unsupported = true;
866
867 if (brw->perf_debug) {
868 if (no16_msg)
869 ralloc_vasprintf_append(&no16_msg, format, va);
870 else
871 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
872 }
873 }
874
875 va_end(va);
876 }
877
878 fs_inst *
879 fs_visitor::emit(enum opcode opcode)
880 {
881 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
882 }
883
884 fs_inst *
885 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
886 {
887 return emit(new(mem_ctx) fs_inst(opcode, dst));
888 }
889
890 fs_inst *
891 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
892 {
893 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
894 }
895
896 fs_inst *
897 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
898 const fs_reg &src1)
899 {
900 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
901 }
902
903 fs_inst *
904 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
905 const fs_reg &src1, const fs_reg &src2)
906 {
907 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
908 }
909
910 fs_inst *
911 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
912 fs_reg src[], int sources)
913 {
914 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
915 }
916
917 /**
918 * Returns true if the instruction has a flag that means it won't
919 * update an entire destination register.
920 *
921 * For example, dead code elimination and live variable analysis want to know
922 * when a write to a variable screens off any preceding values that were in
923 * it.
924 */
925 bool
926 fs_inst::is_partial_write() const
927 {
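/* Added example for the width check below: an 8-wide write of a 16-bit type
 * covers only 8 * 2 = 16 bytes, half a register, so it counts as a partial
 * write.
 */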
928 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
929 (this->dst.width * type_sz(this->dst.type)) < 32 ||
930 !this->dst.is_contiguous());
931 }
932
933 int
934 fs_inst::regs_read(int arg) const
935 {
936 if (is_tex() && arg == 0 && src[0].file == GRF) {
937 return mlen;
938 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
939 return mlen;
940 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
941 return mlen;
942 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
943 return mlen;
944 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
945 return mlen;
946 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
947 return mlen;
948 }
949
950 switch (src[arg].file) {
951 case BAD_FILE:
952 case UNIFORM:
953 case IMM:
954 return 1;
955 case GRF:
956 case HW_REG:
957 if (src[arg].stride == 0) {
958 return 1;
959 } else {
960 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
961 return (size + 31) / 32;
962 }
963 case MRF:
964 unreachable("MRF registers are not allowed as sources");
965 default:
966 unreachable("Invalid register file");
967 }
968 }
969
970 bool
971 fs_inst::reads_flag() const
972 {
973 return predicate;
974 }
975
976 bool
977 fs_inst::writes_flag() const
978 {
979 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
980 opcode != BRW_OPCODE_IF &&
981 opcode != BRW_OPCODE_WHILE)) ||
982 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
983 }
984
985 /**
986 * Returns how many MRFs an FS opcode will write over.
987 *
988 * Note that this is not the 0 or 1 implied writes in an actual gen
989 * instruction -- the FS opcodes often generate MOVs in addition.
990 */
991 int
992 fs_visitor::implied_mrf_writes(fs_inst *inst)
993 {
994 if (inst->mlen == 0)
995 return 0;
996
997 if (inst->base_mrf == -1)
998 return 0;
999
1000 switch (inst->opcode) {
1001 case SHADER_OPCODE_RCP:
1002 case SHADER_OPCODE_RSQ:
1003 case SHADER_OPCODE_SQRT:
1004 case SHADER_OPCODE_EXP2:
1005 case SHADER_OPCODE_LOG2:
1006 case SHADER_OPCODE_SIN:
1007 case SHADER_OPCODE_COS:
1008 return 1 * dispatch_width / 8;
1009 case SHADER_OPCODE_POW:
1010 case SHADER_OPCODE_INT_QUOTIENT:
1011 case SHADER_OPCODE_INT_REMAINDER:
1012 return 2 * dispatch_width / 8;
1013 case SHADER_OPCODE_TEX:
1014 case FS_OPCODE_TXB:
1015 case SHADER_OPCODE_TXD:
1016 case SHADER_OPCODE_TXF:
1017 case SHADER_OPCODE_TXF_CMS:
1018 case SHADER_OPCODE_TXF_MCS:
1019 case SHADER_OPCODE_TG4:
1020 case SHADER_OPCODE_TG4_OFFSET:
1021 case SHADER_OPCODE_TXL:
1022 case SHADER_OPCODE_TXS:
1023 case SHADER_OPCODE_LOD:
1024 return 1;
1025 case FS_OPCODE_FB_WRITE:
1026 return 2;
1027 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1028 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1029 return 1;
1030 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1031 return inst->mlen;
1032 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1033 return 2;
1034 case SHADER_OPCODE_UNTYPED_ATOMIC:
1035 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1036 case SHADER_OPCODE_URB_WRITE_SIMD8:
1037 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1038 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1039 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1040 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1041 return 0;
1042 default:
1043 unreachable("not reached");
1044 }
1045 }
1046
1047 fs_reg
1048 fs_visitor::vgrf(const glsl_type *const type)
1049 {
1050 int reg_width = dispatch_width / 8;
1051 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1052 brw_type_for_base_type(type), dispatch_width);
1053 }
1054
1055 fs_reg
1056 fs_visitor::vgrf(int num_components)
1057 {
1058 int reg_width = dispatch_width / 8;
1059 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1060 BRW_REGISTER_TYPE_F, dispatch_width);
1061 }
1062
1063 /** Fixed HW reg constructor. */
1064 fs_reg::fs_reg(enum register_file file, int reg)
1065 {
1066 init();
1067 this->file = file;
1068 this->reg = reg;
1069 this->type = BRW_REGISTER_TYPE_F;
1070
1071 switch (file) {
1072 case UNIFORM:
1073 this->width = 1;
1074 break;
1075 default:
1076 this->width = 8;
1077 }
1078 }
1079
1080 /** Fixed HW reg constructor. */
1081 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1082 {
1083 init();
1084 this->file = file;
1085 this->reg = reg;
1086 this->type = type;
1087
1088 switch (file) {
1089 case UNIFORM:
1090 this->width = 1;
1091 break;
1092 default:
1093 this->width = 8;
1094 }
1095 }
1096
1097 /** Fixed HW reg constructor. */
1098 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1099 uint8_t width)
1100 {
1101 init();
1102 this->file = file;
1103 this->reg = reg;
1104 this->type = type;
1105 this->width = width;
1106 }
1107
1108 fs_reg *
1109 fs_visitor::variable_storage(ir_variable *var)
1110 {
1111 return (fs_reg *)hash_table_find(this->variable_ht, var);
1112 }
1113
1114 void
1115 import_uniforms_callback(const void *key,
1116 void *data,
1117 void *closure)
1118 {
1119 struct hash_table *dst_ht = (struct hash_table *)closure;
1120 const fs_reg *reg = (const fs_reg *)data;
1121
1122 if (reg->file != UNIFORM)
1123 return;
1124
1125 hash_table_insert(dst_ht, data, key);
1126 }
1127
1128 /* For SIMD16, we need to follow the uniform setup done for the SIMD8
1129 * dispatch. This brings in those uniform definitions.
1130 */
1131 void
1132 fs_visitor::import_uniforms(fs_visitor *v)
1133 {
1134 hash_table_call_foreach(v->variable_ht,
1135 import_uniforms_callback,
1136 variable_ht);
1137 this->push_constant_loc = v->push_constant_loc;
1138 this->pull_constant_loc = v->pull_constant_loc;
1139 this->uniforms = v->uniforms;
1140 this->param_size = v->param_size;
1141 }
1142
1143 /* Our support for uniforms is piggy-backed on the struct
1144 * gl_fragment_program, because that's where the values actually
1145 * get stored, rather than in some global gl_shader_program uniform
1146 * store.
1147 */
1148 void
1149 fs_visitor::setup_uniform_values(ir_variable *ir)
1150 {
1151 int namelen = strlen(ir->name);
1152
1153 /* The data for our (non-builtin) uniforms is stored in a series of
1154 * gl_uniform_driver_storage structs for each subcomponent that
1155 * glGetUniformLocation() could name. We know it's been set up in the same
1156 * order we'd walk the type, so walk the list of storage and find anything
1157 * with our name, or the prefix of a component that starts with our name.
1158 */
1159 unsigned params_before = uniforms;
1160 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1161 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1162
1163 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1164 (storage->name[namelen] != 0 &&
1165 storage->name[namelen] != '.' &&
1166 storage->name[namelen] != '[')) {
1167 continue;
1168 }
1169
1170 unsigned slots = storage->type->component_slots();
1171 if (storage->array_elements)
1172 slots *= storage->array_elements;
1173
1174 for (unsigned i = 0; i < slots; i++) {
1175 stage_prog_data->param[uniforms++] = &storage->storage[i];
1176 }
1177 }
1178
1179 /* Make sure we actually initialized the right amount of stuff here. */
1180 assert(params_before + ir->type->component_slots() == uniforms);
1181 (void)params_before;
1182 }
1183
1184
1185 /* Our support for builtin uniforms is even scarier than non-builtin.
1186 * It sits on top of the PROG_STATE_VAR parameters that are
1187 * automatically updated from GL context state.
1188 */
1189 void
1190 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1191 {
1192 const ir_state_slot *const slots = ir->get_state_slots();
1193 assert(slots != NULL);
1194
1195 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1196 /* This state reference has already been set up by ir_to_mesa, but we'll
1197 * get the same index back here.
1198 */
1199 int index = _mesa_add_state_reference(this->prog->Parameters,
1200 (gl_state_index *)slots[i].tokens);
1201
1202 /* Add each of the unique swizzles of the element as a parameter.
1203 * This'll end up matching the expected layout of the
1204 * array/matrix/structure we're trying to fill in.
1205 */
1206 int last_swiz = -1;
1207 for (unsigned int j = 0; j < 4; j++) {
1208 int swiz = GET_SWZ(slots[i].swizzle, j);
1209 if (swiz == last_swiz)
1210 break;
1211 last_swiz = swiz;
1212
1213 stage_prog_data->param[uniforms++] =
1214 &prog->Parameters->ParameterValues[index][swiz];
1215 }
1216 }
1217 }
1218
1219 fs_reg *
1220 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1221 bool origin_upper_left)
1222 {
1223 assert(stage == MESA_SHADER_FRAGMENT);
1224 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1225 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1226 fs_reg wpos = *reg;
1227 bool flip = !origin_upper_left ^ key->render_to_fbo;
1228
1229 /* gl_FragCoord.x */
1230 if (pixel_center_integer) {
1231 emit(MOV(wpos, this->pixel_x));
1232 } else {
1233 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1234 }
1235 wpos = offset(wpos, 1);
1236
1237 /* gl_FragCoord.y */
1238 if (!flip && pixel_center_integer) {
1239 emit(MOV(wpos, this->pixel_y));
1240 } else {
1241 fs_reg pixel_y = this->pixel_y;
1242 float offset = (pixel_center_integer ? 0.0 : 0.5);
1243
1244 if (flip) {
1245 pixel_y.negate = true;
1246 offset += key->drawable_height - 1.0;
1247 }
1248
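/* Added note: in the flipped case pixel_y carries a negate modifier, so the
 * ADD below evaluates to (drawable_height - 1 + the 0.0/0.5 center offset)
 * - pixel_y, i.e. a mirrored Y coordinate.
 */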
1249 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1250 }
1251 wpos = offset(wpos, 1);
1252
1253 /* gl_FragCoord.z */
1254 if (brw->gen >= 6) {
1255 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1256 } else {
1257 emit(FS_OPCODE_LINTERP, wpos,
1258 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1259 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1260 interp_reg(VARYING_SLOT_POS, 2));
1261 }
1262 wpos = offset(wpos, 1);
1263
1264 /* gl_FragCoord.w: Already set up in emit_interpolation */
1265 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1266
1267 return reg;
1268 }
1269
1270 fs_inst *
1271 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1272 glsl_interp_qualifier interpolation_mode,
1273 bool is_centroid, bool is_sample)
1274 {
1275 brw_wm_barycentric_interp_mode barycoord_mode;
1276 if (brw->gen >= 6) {
1277 if (is_centroid) {
1278 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1279 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1280 else
1281 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1282 } else if (is_sample) {
1283 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1284 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1285 else
1286 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1287 } else {
1288 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1289 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1290 else
1291 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1292 }
1293 } else {
1294 /* On Ironlake and below, there is only one interpolation mode.
1295 * Centroid interpolation doesn't mean anything on this hardware --
1296 * there is no multisampling.
1297 */
1298 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1299 }
1300 return emit(FS_OPCODE_LINTERP, attr,
1301 this->delta_x[barycoord_mode],
1302 this->delta_y[barycoord_mode], interp);
1303 }
1304
1305 void
1306 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1307 const glsl_type *type,
1308 glsl_interp_qualifier interpolation_mode,
1309 int location, bool mod_centroid,
1310 bool mod_sample)
1311 {
1312 attr.type = brw_type_for_base_type(type->get_scalar_type());
1313
1314 assert(stage == MESA_SHADER_FRAGMENT);
1315 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1316 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1317
1318 unsigned int array_elements;
1319
1320 if (type->is_array()) {
1321 array_elements = type->length;
1322 if (array_elements == 0) {
1323 fail("dereferenced array '%s' has length 0\n", name);
1324 }
1325 type = type->fields.array;
1326 } else {
1327 array_elements = 1;
1328 }
1329
1330 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1331 bool is_gl_Color =
1332 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1333 if (key->flat_shade && is_gl_Color) {
1334 interpolation_mode = INTERP_QUALIFIER_FLAT;
1335 } else {
1336 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1337 }
1338 }
1339
1340 for (unsigned int i = 0; i < array_elements; i++) {
1341 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1342 if (prog_data->urb_setup[location] == -1) {
1343 /* If there's no incoming setup data for this slot, don't
1344 * emit interpolation for it.
1345 */
1346 attr = offset(attr, type->vector_elements);
1347 location++;
1348 continue;
1349 }
1350
1351 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1352 /* Constant interpolation (flat shading) case. The SF has
1353 * handed us defined values in only the constant offset
1354 * field of the setup reg.
1355 */
1356 for (unsigned int k = 0; k < type->vector_elements; k++) {
1357 struct brw_reg interp = interp_reg(location, k);
1358 interp = suboffset(interp, 3);
1359 interp.type = attr.type;
1360 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1361 attr = offset(attr, 1);
1362 }
1363 } else {
1364 /* Smooth/noperspective interpolation case. */
1365 for (unsigned int k = 0; k < type->vector_elements; k++) {
1366 struct brw_reg interp = interp_reg(location, k);
1367 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1368 /* Get the pixel/sample mask into f0 so that we know
1369 * which pixels are lit. Then, for each channel that is
1370 * unlit, replace the centroid data with non-centroid
1371 * data.
1372 */
1373 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1374
1375 fs_inst *inst;
1376 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1377 false, false);
1378 inst->predicate = BRW_PREDICATE_NORMAL;
1379 inst->predicate_inverse = true;
1380 if (brw->has_pln)
1381 inst->no_dd_clear = true;
1382
1383 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1384 mod_centroid && !key->persample_shading,
1385 mod_sample || key->persample_shading);
1386 inst->predicate = BRW_PREDICATE_NORMAL;
1387 inst->predicate_inverse = false;
1388 if (brw->has_pln)
1389 inst->no_dd_check = true;
1390
1391 } else {
1392 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1393 mod_centroid && !key->persample_shading,
1394 mod_sample || key->persample_shading);
1395 }
1396 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1397 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1398 }
1399 attr = offset(attr, 1);
1400 }
1401
1402 }
1403 location++;
1404 }
1405 }
1406 }
1407
1408 fs_reg *
1409 fs_visitor::emit_frontfacing_interpolation()
1410 {
1411 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1412
1413 if (brw->gen >= 6) {
1414 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1415 * a boolean result from this (~0/true or 0/false).
1416 *
1417 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1418 * this task in only one instruction:
1419 * - a negation source modifier will flip the bit; and
1420 * - a W -> D type conversion will sign extend the bit into the high
1421 * word of the destination.
1422 *
1423 * An ASR 15 fills the low word of the destination.
1424 */
1425 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1426 g0.negate = true;
1427
1428 emit(ASR(*reg, g0, fs_reg(15)));
1429 } else {
1430 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1431 * a boolean result from this (1/true or 0/false).
1432 *
1433 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1434 * the negation source modifier to flip it. Unfortunately the SHR
1435 * instruction only operates on UD (or D with an abs source modifier)
1436 * sources without negation.
1437 *
1438 * Instead, use ASR (which will give ~0/true or 0/false).
1439 */
1440 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1441 g1_6.negate = true;
1442
1443 emit(ASR(*reg, g1_6, fs_reg(31)));
1444 }
1445
1446 return reg;
1447 }
1448
1449 void
1450 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1451 {
1452 assert(stage == MESA_SHADER_FRAGMENT);
1453 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1454 assert(dst.type == BRW_REGISTER_TYPE_F);
1455
1456 if (key->compute_pos_offset) {
1457 /* Convert int_sample_pos to floating point */
1458 emit(MOV(dst, int_sample_pos));
1459 /* Scale to the range [0, 1] */
1460 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1461 }
1462 else {
1463 /* From ARB_sample_shading specification:
1464 * "When rendering to a non-multisample buffer, or if multisample
1465 * rasterization is disabled, gl_SamplePosition will always be
1466 * (0.5, 0.5).
1467 */
1468 emit(MOV(dst, fs_reg(0.5f)));
1469 }
1470 }
1471
1472 fs_reg *
1473 fs_visitor::emit_samplepos_setup()
1474 {
1475 assert(brw->gen >= 6);
1476
1477 this->current_annotation = "compute sample position";
1478 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1479 fs_reg pos = *reg;
1480 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1481 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1482
1483 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1484 * mode will be enabled.
1485 *
1486 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1487 * R31.1:0 Position Offset X/Y for Slot[3:0]
1488 * R31.3:2 Position Offset X/Y for Slot[7:4]
1489 * .....
1490 *
1491 * The X, Y sample positions come in as bytes in the thread payload. So, read
1492 * the positions using vstride=16, width=8, hstride=2.
1493 */
1494 struct brw_reg sample_pos_reg =
1495 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1496 BRW_REGISTER_TYPE_B), 16, 8, 2);
1497
1498 if (dispatch_width == 8) {
1499 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1500 } else {
1501 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1502 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1503 ->force_sechalf = true;
1504 }
1505 /* Compute gl_SamplePosition.x */
1506 compute_sample_position(pos, int_sample_x);
1507 pos = offset(pos, 1);
1508 if (dispatch_width == 8) {
1509 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1510 } else {
1511 emit(MOV(half(int_sample_y, 0),
1512 fs_reg(suboffset(sample_pos_reg, 1))));
1513 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1514 ->force_sechalf = true;
1515 }
1516 /* Compute gl_SamplePosition.y */
1517 compute_sample_position(pos, int_sample_y);
1518 return reg;
1519 }
1520
1521 fs_reg *
1522 fs_visitor::emit_sampleid_setup()
1523 {
1524 assert(stage == MESA_SHADER_FRAGMENT);
1525 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1526 assert(brw->gen >= 6);
1527
1528 this->current_annotation = "compute sample id";
1529 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1530
1531 if (key->compute_sample_id) {
1532 fs_reg t1 = vgrf(glsl_type::int_type);
1533 fs_reg t2 = vgrf(glsl_type::int_type);
1534 t2.type = BRW_REGISTER_TYPE_UW;
1535
1536 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1537 * 8x multisampling, subspan 0 will represent sample N (where N
1538 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1539 * 7. We can find the value of N by looking at R0.0 bits 7:6
1540 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1541 * (since samples are always delivered in pairs). That is, we
1542 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1543 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1544 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1545 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1546 * populating a temporary variable with the sequence (0, 1, 2, 3),
1547 * and then reading from it using vstride=1, width=4, hstride=0.
1548 * These computations hold good for 4x multisampling as well.
1549 *
1550 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1551 * the first four slots are sample 0 of subspan 0; the next four
1552 * are sample 1 of subspan 0; the third group is sample 0 of
1553 * subspan 1, and finally sample 1 of subspan 1.
1554 */
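/* Added arithmetic check of the SSPI math above: if R0.0 bits 7:6 hold
 * SSPI == 2, then R0.0 & 0xc0 == 0x80 and 0x80 >> 5 == 4, i.e. subspan 0
 * represents sample 4.
 */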
1555 fs_inst *inst;
1556 inst = emit(BRW_OPCODE_AND, t1,
1557 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1558 fs_reg(0xc0));
1559 inst->force_writemask_all = true;
1560 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1561 inst->force_writemask_all = true;
1562 /* This works for both SIMD8 and SIMD16 */
1563 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1564 inst->force_writemask_all = true;
1565 /* This special instruction takes care of setting vstride=1,
1566 * width=4, hstride=0 of t2 during an ADD instruction.
1567 */
1568 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1569 } else {
1570 /* As per GL_ARB_sample_shading specification:
1571 * "When rendering to a non-multisample buffer, or if multisample
1572 * rasterization is disabled, gl_SampleID will always be zero."
1573 */
1574 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1575 }
1576
1577 return reg;
1578 }
1579
1580 void
1581 fs_visitor::resolve_source_modifiers(fs_reg *src)
1582 {
1583 if (!src->abs && !src->negate)
1584 return;
1585
1586 fs_reg temp = retype(vgrf(1), src->type);
1587 emit(MOV(temp, *src));
1588 *src = temp;
1589 }
1590
1591 fs_reg
1592 fs_visitor::fix_math_operand(fs_reg src)
1593 {
1594 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1595 * might be able to do better by doing execsize = 1 math and then
1596 * expanding that result out, but we would need to be careful with
1597 * masking.
1598 *
1599 * The hardware ignores source modifiers (negate and abs) on math
1600 * instructions, so we also move to a temp to set those up.
1601 */
1602 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1603 !src.abs && !src.negate)
1604 return src;
1605
1606 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1607 * operands to math
1608 */
1609 if (brw->gen >= 7 && src.file != IMM)
1610 return src;
1611
1612 fs_reg expanded = vgrf(glsl_type::float_type);
1613 expanded.type = src.type;
1614 emit(BRW_OPCODE_MOV, expanded, src);
1615 return expanded;
1616 }
1617
1618 fs_inst *
1619 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1620 {
1621 switch (opcode) {
1622 case SHADER_OPCODE_RCP:
1623 case SHADER_OPCODE_RSQ:
1624 case SHADER_OPCODE_SQRT:
1625 case SHADER_OPCODE_EXP2:
1626 case SHADER_OPCODE_LOG2:
1627 case SHADER_OPCODE_SIN:
1628 case SHADER_OPCODE_COS:
1629 break;
1630 default:
1631 unreachable("not reached: bad math opcode");
1632 }
1633
1634 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1635 * might be able to do better by doing execsize = 1 math and then
1636 * expanding that result out, but we would need to be careful with
1637 * masking.
1638 *
1639 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1640 * instructions, so we also move to a temp to set those up.
1641 */
1642 if (brw->gen == 6 || brw->gen == 7)
1643 src = fix_math_operand(src);
1644
1645 fs_inst *inst = emit(opcode, dst, src);
1646
1647 if (brw->gen < 6) {
1648 inst->base_mrf = 2;
1649 inst->mlen = dispatch_width / 8;
1650 }
1651
1652 return inst;
1653 }
1654
1655 fs_inst *
1656 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1657 {
1658 int base_mrf = 2;
1659 fs_inst *inst;
1660
1661 if (brw->gen >= 8) {
1662 inst = emit(opcode, dst, src0, src1);
1663 } else if (brw->gen >= 6) {
1664 src0 = fix_math_operand(src0);
1665 src1 = fix_math_operand(src1);
1666
1667 inst = emit(opcode, dst, src0, src1);
1668 } else {
1669 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1670 * "Message Payload":
1671 *
1672 * "Operand0[7]. For the INT DIV functions, this operand is the
1673 * denominator."
1674 * ...
1675 * "Operand1[7]. For the INT DIV functions, this operand is the
1676 * numerator."
1677 */
1678 bool is_int_div = opcode != SHADER_OPCODE_POW;
1679 fs_reg &op0 = is_int_div ? src1 : src0;
1680 fs_reg &op1 = is_int_div ? src0 : src1;
1681
1682 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1683 inst = emit(opcode, dst, op0, reg_null_f);
1684
1685 inst->base_mrf = base_mrf;
1686 inst->mlen = 2 * dispatch_width / 8;
1687 }
1688 return inst;
1689 }
1690
1691 void
1692 fs_visitor::assign_curb_setup()
1693 {
1694 if (dispatch_width == 8) {
1695 prog_data->dispatch_grf_start_reg = payload.num_regs;
1696 } else {
1697 assert(stage == MESA_SHADER_FRAGMENT);
1698 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1699 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1700 }
1701
1702 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1703
1704 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1705 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1706 for (unsigned int i = 0; i < inst->sources; i++) {
1707 if (inst->src[i].file == UNIFORM) {
1708 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1709 int constant_nr;
1710 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1711 constant_nr = push_constant_loc[uniform_nr];
1712 } else {
1713 /* Section 5.11 of the OpenGL 4.1 spec says:
1714 * "Out-of-bounds reads return undefined values, which include
1715 * values from other variables of the active program or zero."
1716 * Just return the first push constant.
1717 */
1718 constant_nr = 0;
1719 }
1720
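/* Added note: eight dword-sized push constants fit in one 32-byte GRF, so
 * constant_nr maps to GRF (payload.num_regs + constant_nr / 8), subregister
 * constant_nr % 8.
 */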
1721 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1722 constant_nr / 8,
1723 constant_nr % 8);
1724
1725 inst->src[i].file = HW_REG;
1726 inst->src[i].fixed_hw_reg = byte_offset(
1727 retype(brw_reg, inst->src[i].type),
1728 inst->src[i].subreg_offset);
1729 }
1730 }
1731 }
1732 }
1733
1734 void
1735 fs_visitor::calculate_urb_setup()
1736 {
1737 assert(stage == MESA_SHADER_FRAGMENT);
1738 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1739 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1740
1741 memset(prog_data->urb_setup, -1,
1742 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1743
1744 int urb_next = 0;
1745 /* Figure out where each of the incoming setup attributes lands. */
1746 if (brw->gen >= 6) {
1747 if (_mesa_bitcount_64(prog->InputsRead &
1748 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1749 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1750 * first 16 varying inputs, so we can put them wherever we want.
1751 * Just put them in order.
1752 *
1753 * This is useful because it means that (a) inputs not used by the
1754 * fragment shader won't take up valuable register space, and (b) we
1755 * won't have to recompile the fragment shader if it gets paired with
1756 * a different vertex (or geometry) shader.
1757 */
1758 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1759 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1760 BITFIELD64_BIT(i)) {
1761 prog_data->urb_setup[i] = urb_next++;
1762 }
1763 }
1764 } else {
1765 /* We have enough input varyings that the SF/SBE pipeline stage can't
1766 * arbitrarily rearrange them to suit our whim; we have to put them
1767 * in an order that matches the output of the previous pipeline stage
1768 * (geometry or vertex shader).
1769 */
1770 struct brw_vue_map prev_stage_vue_map;
1771 brw_compute_vue_map(brw, &prev_stage_vue_map,
1772 key->input_slots_valid);
1773 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1774 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1775 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1776 slot++) {
1777 int varying = prev_stage_vue_map.slot_to_varying[slot];
1778 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1779 * unused.
1780 */
1781 if (varying != BRW_VARYING_SLOT_COUNT &&
1782 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1783 BITFIELD64_BIT(varying))) {
1784 prog_data->urb_setup[varying] = slot - first_slot;
1785 }
1786 }
1787 urb_next = prev_stage_vue_map.num_slots - first_slot;
1788 }
1789 } else {
1790 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1791 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1792 /* Point size is packed into the header, not as a general attribute */
1793 if (i == VARYING_SLOT_PSIZ)
1794 continue;
1795
1796 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1797 /* The back color slot is skipped when the front color is
1798 * also written to. In addition, some slots can be
1799 * written in the vertex shader and not read in the
1800 * fragment shader. So the register number must always be
1801 * incremented, mapped or not.
1802 */
1803 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1804 prog_data->urb_setup[i] = urb_next;
1805 urb_next++;
1806 }
1807 }
1808
1809 /*
1810 * It's an FS-only attribute, and we did interpolation for this attribute
1811 * in the SF thread. So, count it here, too.
1812 *
1813 * See compile_sf_prog() for more info.
1814 */
1815 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1816 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1817 }
1818
1819 prog_data->num_varying_inputs = urb_next;
1820 }
1821
1822 void
1823 fs_visitor::assign_urb_setup()
1824 {
1825 assert(stage == MESA_SHADER_FRAGMENT);
1826 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1827
1828 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1829
1830 /* Offset all the urb_setup[] indices by the actual position of the
1831 * setup regs, now that the location of the constants has been chosen.
1832 */
1833 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1834 if (inst->opcode == FS_OPCODE_LINTERP) {
1835 assert(inst->src[2].file == HW_REG);
1836 inst->src[2].fixed_hw_reg.nr += urb_start;
1837 }
1838
1839 if (inst->opcode == FS_OPCODE_CINTERP) {
1840 assert(inst->src[0].file == HW_REG);
1841 inst->src[0].fixed_hw_reg.nr += urb_start;
1842 }
1843 }
1844
1845 /* Each attribute is 4 setup channels, each of which is half a reg. */
1846 this->first_non_payload_grf =
1847 urb_start + prog_data->num_varying_inputs * 2;
1848 }
1849
1850 void
1851 fs_visitor::assign_vs_urb_setup()
1852 {
1853 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1854 int grf, count, slot, channel, attr;
1855
1856 assert(stage == MESA_SHADER_VERTEX);
1857 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1858 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1859 count++;
1860
1861 /* Each attribute is 4 regs. */
1862 this->first_non_payload_grf =
1863 payload.num_regs + prog_data->curb_read_length + count * 4;
1864
1865 unsigned vue_entries =
1866 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1867
1868 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1869 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1870
1871 assert(vs_prog_data->base.urb_read_length <= 15);
1872
1873 /* Rewrite all ATTR file references to the hw grf that they land in. */
1874 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1875 for (int i = 0; i < inst->sources; i++) {
1876 if (inst->src[i].file == ATTR) {
1877
1878 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1879 slot = count - 1;
1880 } else {
1881 /* Attributes come in as a contiguous block, ordered by their
1882 * gl_vert_attrib value. That means we can compute the slot
1883 * number for an attribute by masking out the enabled
1884 * attributes before it and counting the bits.
1885 */
1886 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1887 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1888 BITFIELD64_MASK(attr));
1889 }
1890
1891 channel = inst->src[i].reg_offset & 3;
1892
1893 grf = payload.num_regs +
1894 prog_data->curb_read_length +
1895 slot * 4 + channel;
1896
1897 inst->src[i].file = HW_REG;
1898 inst->src[i].fixed_hw_reg =
1899 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1900 }
1901 }
1902 }
1903 }
1904
1905 /**
1906 * Split large virtual GRFs into separate components if we can.
1907 *
1908  * This largely duplicates what brw_fs_vector_splitting does,
1909 * but that's really conservative because it's afraid of doing
1910 * splitting that doesn't result in real progress after the rest of
1911 * the optimization phases, which would cause infinite looping in
1912 * optimization. We can do it once here, safely. This also has the
1913 * opportunity to split interpolated values, or maybe even uniforms,
1914 * which we don't have at the IR level.
1915 *
1916 * We want to split, because virtual GRFs are what we register
1917 * allocate and spill (due to contiguousness requirements for some
1918 * instructions), and they're what we naturally generate in the
1919 * codegen process, but most virtual GRFs don't actually need to be
1920 * contiguous sets of GRFs. If we split, we'll end up with reduced
1921 * live intervals and better dead code elimination and coalescing.
1922 */
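/* Illustrative example (hypothetical): a size-4 VGRF whose registers are only
 * ever written and read one at a time keeps all of its split points set, so it
 * becomes four independent size-1 VGRFs.  If some instruction instead writes
 * registers 0-1 of it as a pair, that boundary is marked inseparable and the
 * result is one size-2 VGRF plus two size-1 VGRFs.
 */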
1923 void
1924 fs_visitor::split_virtual_grfs()
1925 {
1926 int num_vars = this->alloc.count;
1927
1928 /* Count the total number of registers */
1929 int reg_count = 0;
1930 int vgrf_to_reg[num_vars];
1931 for (int i = 0; i < num_vars; i++) {
1932 vgrf_to_reg[i] = reg_count;
1933 reg_count += alloc.sizes[i];
1934 }
1935
1936 /* An array of "split points". For each register slot, this indicates
1937 * if this slot can be separated from the previous slot. Every time an
1938 * instruction uses multiple elements of a register (as a source or
1939 * destination), we mark the used slots as inseparable. Then we go
1940 * through and split the registers into the smallest pieces we can.
1941 */
1942 bool split_points[reg_count];
1943 memset(split_points, 0, sizeof(split_points));
1944
1945 /* Mark all used registers as fully splittable */
1946 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1947 if (inst->dst.file == GRF) {
1948 int reg = vgrf_to_reg[inst->dst.reg];
1949 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1950 split_points[reg + j] = true;
1951 }
1952
1953 for (int i = 0; i < inst->sources; i++) {
1954 if (inst->src[i].file == GRF) {
1955 int reg = vgrf_to_reg[inst->src[i].reg];
1956 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1957 split_points[reg + j] = true;
1958 }
1959 }
1960 }
1961
1962 if (brw->has_pln &&
1963 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1964 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1965 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1966 * Gen6, that was the only supported interpolation mode, and since Gen6,
1967 * delta_x and delta_y are in fixed hardware registers.
1968 */
1969 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1970 split_points[vgrf_to_reg[vgrf] + 1] = false;
1971 }
1972
1973 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1974 if (inst->dst.file == GRF) {
1975 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1976 for (int j = 1; j < inst->regs_written; j++)
1977 split_points[reg + j] = false;
1978 }
1979 for (int i = 0; i < inst->sources; i++) {
1980 if (inst->src[i].file == GRF) {
1981 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1982 for (int j = 1; j < inst->regs_read(i); j++)
1983 split_points[reg + j] = false;
1984 }
1985 }
1986 }
1987
1988 int new_virtual_grf[reg_count];
1989 int new_reg_offset[reg_count];
1990
1991 int reg = 0;
1992 for (int i = 0; i < num_vars; i++) {
1993       /* The first slot of a VGRF should never be a split point; sanity check. */
1994 assert(split_points[reg] == false);
1995
1996 /* j = 0 case */
1997 new_reg_offset[reg] = 0;
1998 reg++;
1999 int offset = 1;
2000
2001 /* j > 0 case */
2002 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2003          /* If this is a split point, allocate a new virtual GRF covering the
2004           * previous "offset" registers, then reset the offset to 0.
2005 */
2006 if (split_points[reg]) {
2007 assert(offset <= MAX_VGRF_SIZE);
2008 int grf = alloc.allocate(offset);
2009 for (int k = reg - offset; k < reg; k++)
2010 new_virtual_grf[k] = grf;
2011 offset = 0;
2012 }
2013 new_reg_offset[reg] = offset;
2014 offset++;
2015 reg++;
2016 }
2017
2018 /* The last one gets the original register number */
2019 assert(offset <= MAX_VGRF_SIZE);
2020 alloc.sizes[i] = offset;
2021 for (int k = reg - offset; k < reg; k++)
2022 new_virtual_grf[k] = i;
2023 }
2024 assert(reg == reg_count);
2025
2026 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2027 if (inst->dst.file == GRF) {
2028 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2029 inst->dst.reg = new_virtual_grf[reg];
2030 inst->dst.reg_offset = new_reg_offset[reg];
2031 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2032 }
2033 for (int i = 0; i < inst->sources; i++) {
2034 if (inst->src[i].file == GRF) {
2035 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2036 inst->src[i].reg = new_virtual_grf[reg];
2037 inst->src[i].reg_offset = new_reg_offset[reg];
2038 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2039 }
2040 }
2041 }
2042 invalidate_live_intervals();
2043 }
2044
2045 /**
2046 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2047 *
2048 * During code generation, we create tons of temporary variables, many of
2049 * which get immediately killed and are never used again. Yet, in later
2050 * optimization and analysis passes, such as compute_live_intervals, we need
2051 * to loop over all the virtual GRFs. Compacting them can save a lot of
2052 * overhead.
2053 */
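/* Illustrative example (hypothetical): with four VGRFs where only vgrf0 and
 * vgrf2 are still referenced, remap_table ends up as {0, -1, 1, -1}; vgrf2 is
 * renumbered to vgrf1, alloc.count drops from 4 to 2, and every instruction
 * (plus delta_x/delta_y) is patched below to use the new numbers.
 */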
2054 bool
2055 fs_visitor::compact_virtual_grfs()
2056 {
2057 bool progress = false;
2058 int remap_table[this->alloc.count];
2059 memset(remap_table, -1, sizeof(remap_table));
2060
2061 /* Mark which virtual GRFs are used. */
2062 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2063 if (inst->dst.file == GRF)
2064 remap_table[inst->dst.reg] = 0;
2065
2066 for (int i = 0; i < inst->sources; i++) {
2067 if (inst->src[i].file == GRF)
2068 remap_table[inst->src[i].reg] = 0;
2069 }
2070 }
2071
2072 /* Compact the GRF arrays. */
2073 int new_index = 0;
2074 for (unsigned i = 0; i < this->alloc.count; i++) {
2075 if (remap_table[i] == -1) {
2076 /* We just found an unused register. This means that we are
2077 * actually going to compact something.
2078 */
2079 progress = true;
2080 } else {
2081 remap_table[i] = new_index;
2082 alloc.sizes[new_index] = alloc.sizes[i];
2083 invalidate_live_intervals();
2084 ++new_index;
2085 }
2086 }
2087
2088 this->alloc.count = new_index;
2089
2090 /* Patch all the instructions to use the newly renumbered registers */
2091 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2092 if (inst->dst.file == GRF)
2093 inst->dst.reg = remap_table[inst->dst.reg];
2094
2095 for (int i = 0; i < inst->sources; i++) {
2096 if (inst->src[i].file == GRF)
2097 inst->src[i].reg = remap_table[inst->src[i].reg];
2098 }
2099 }
2100
2101 /* Patch all the references to delta_x/delta_y, since they're used in
2102 * register allocation. If they're unused, switch them to BAD_FILE so
2103 * we don't think some random VGRF is delta_x/delta_y.
2104 */
2105 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2106 if (delta_x[i].file == GRF) {
2107 if (remap_table[delta_x[i].reg] != -1) {
2108 delta_x[i].reg = remap_table[delta_x[i].reg];
2109 } else {
2110 delta_x[i].file = BAD_FILE;
2111 }
2112 }
2113 }
2114 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2115 if (delta_y[i].file == GRF) {
2116 if (remap_table[delta_y[i].reg] != -1) {
2117 delta_y[i].reg = remap_table[delta_y[i].reg];
2118 } else {
2119 delta_y[i].file = BAD_FILE;
2120 }
2121 }
2122 }
2123
2124 return progress;
2125 }
2126
2127 /*
2128 * Implements array access of uniforms by inserting a
2129 * PULL_CONSTANT_LOAD instruction.
2130 *
2131  * Unlike temporary GRF array access (which we don't support, due to
2132 * the difficulty of doing relative addressing on instruction
2133 * destinations), we could potentially do array access of uniforms
2134 * that were loaded in GRF space as push constants. In real-world
2135 * usage we've seen, though, the arrays being used are always larger
2136 * than we could load as push constants, so just always move all
2137 * uniform array access out to a pull constant buffer.
2138 */
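/* Illustrative sketch (hypothetical GLSL): for "uniform vec4 kernel[32];"
 * accessed as kernel[i], every UNIFORM source carrying a reladdr causes all
 * param_size[] entries of that array to be copied into pull_param[], and
 * pull_constant_loc[] records where each element now lives so that
 * demote_pull_constants() can later rewrite the access into a
 * VARYING_PULL_CONSTANT_LOAD.
 */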
2139 void
2140 fs_visitor::move_uniform_array_access_to_pull_constants()
2141 {
2142 if (dispatch_width != 8)
2143 return;
2144
2145 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2146 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2147
2148 /* Walk through and find array access of uniforms. Put a copy of that
2149 * uniform in the pull constant buffer.
2150 *
2151 * Note that we don't move constant-indexed accesses to arrays. No
2152 * testing has been done of the performance impact of this choice.
2153 */
2154 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2155 for (int i = 0 ; i < inst->sources; i++) {
2156 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2157 continue;
2158
2159 int uniform = inst->src[i].reg;
2160
2161 /* If this array isn't already present in the pull constant buffer,
2162 * add it.
2163 */
2164 if (pull_constant_loc[uniform] == -1) {
2165 const gl_constant_value **values = &stage_prog_data->param[uniform];
2166
2167 assert(param_size[uniform]);
2168
2169 for (int j = 0; j < param_size[uniform]; j++) {
2170 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2171
2172 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2173 values[j];
2174 }
2175 }
2176 }
2177 }
2178 }
2179
2180 /**
2181 * Assign UNIFORM file registers to either push constants or pull constants.
2182 *
2183  * We allow a fragment shader to use more than the GL-specified minimum
2184  * value for the maximum number of fragment shader uniform components (64).
2185  * If there are too many of these, they'd fill up all of the register
2186  * space. So this pass pushes some of them out to the pull constant buffer
2187  * and updates the program to load them from there.
2188 */
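/* Illustrative example (hypothetical): a shader with 200 live uniform
 * components keeps the first 128 of them (16 GRFs) as push constants in
 * param[], while the remaining 72 get pull_constant_loc[] entries and are
 * appended to pull_param[] for loading through the pull constant buffer.
 */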
2189 void
2190 fs_visitor::assign_constant_locations()
2191 {
2192 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2193 if (dispatch_width != 8)
2194 return;
2195
2196 /* Find which UNIFORM registers are still in use. */
2197 bool is_live[uniforms];
2198 for (unsigned int i = 0; i < uniforms; i++) {
2199 is_live[i] = false;
2200 }
2201
2202 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2203 for (int i = 0; i < inst->sources; i++) {
2204 if (inst->src[i].file != UNIFORM)
2205 continue;
2206
2207 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2208 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2209 is_live[constant_nr] = true;
2210 }
2211 }
2212
2213 /* Only allow 16 registers (128 uniform components) as push constants.
2214 *
2215 * Just demote the end of the list. We could probably do better
2216 * here, demoting things that are rarely used in the program first.
2217 *
2218 * If changing this value, note the limitation about total_regs in
2219 * brw_curbe.c.
2220 */
2221 unsigned int max_push_components = 16 * 8;
2222 unsigned int num_push_constants = 0;
2223
2224 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2225
2226 for (unsigned int i = 0; i < uniforms; i++) {
2227 if (!is_live[i] || pull_constant_loc[i] != -1) {
2228 /* This UNIFORM register is either dead, or has already been demoted
2229 * to a pull const. Mark it as no longer living in the param[] array.
2230 */
2231 push_constant_loc[i] = -1;
2232 continue;
2233 }
2234
2235 if (num_push_constants < max_push_components) {
2236 /* Retain as a push constant. Record the location in the params[]
2237 * array.
2238 */
2239 push_constant_loc[i] = num_push_constants++;
2240 } else {
2241 /* Demote to a pull constant. */
2242 push_constant_loc[i] = -1;
2243
2244 int pull_index = stage_prog_data->nr_pull_params++;
2245 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2246 pull_constant_loc[i] = pull_index;
2247 }
2248 }
2249
2250 stage_prog_data->nr_params = num_push_constants;
2251
2252 /* Up until now, the param[] array has been indexed by reg + reg_offset
2253 * of UNIFORM registers. Condense it to only contain the uniforms we
2254 * chose to upload as push constants.
2255 */
2256 for (unsigned int i = 0; i < uniforms; i++) {
2257 int remapped = push_constant_loc[i];
2258
2259 if (remapped == -1)
2260 continue;
2261
2262 assert(remapped <= (int)i);
2263 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2264 }
2265 }
2266
2267 /**
2268 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2269 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2270 */
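/* Illustrative sketch (hypothetical): a constant-indexed access to a demoted
 * uniform becomes a UNIFORM_PULL_CONSTANT_LOAD of the aligned vec4 containing
 * it plus set_smear() to pick the right component, while an indirect
 * (reladdr) access becomes a VARYING_PULL_CONSTANT_LOAD sequence; in both
 * cases the UNIFORM source is then rewritten to read the new temporary VGRF.
 */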
2271 void
2272 fs_visitor::demote_pull_constants()
2273 {
2274 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2275 for (int i = 0; i < inst->sources; i++) {
2276 if (inst->src[i].file != UNIFORM)
2277 continue;
2278
2279 int pull_index = pull_constant_loc[inst->src[i].reg +
2280 inst->src[i].reg_offset];
2281 if (pull_index == -1)
2282 continue;
2283
2284          /* Set up the annotation tracking for newly generated instructions. */
2285 base_ir = inst->ir;
2286 current_annotation = inst->annotation;
2287
2288 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2289 fs_reg dst = vgrf(glsl_type::float_type);
2290
2291 /* Generate a pull load into dst. */
2292 if (inst->src[i].reladdr) {
2293 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2294 surf_index,
2295 *inst->src[i].reladdr,
2296 pull_index);
2297 inst->insert_before(block, &list);
2298 inst->src[i].reladdr = NULL;
2299 } else {
2300 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2301 fs_inst *pull =
2302 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2303 dst, surf_index, offset);
2304 inst->insert_before(block, pull);
2305 inst->src[i].set_smear(pull_index & 3);
2306 }
2307
2308 /* Rewrite the instruction to use the temporary VGRF. */
2309 inst->src[i].file = GRF;
2310 inst->src[i].reg = dst.reg;
2311 inst->src[i].reg_offset = 0;
2312 inst->src[i].width = dispatch_width;
2313 }
2314 }
2315 invalidate_live_intervals();
2316 }
2317
2318 bool
2319 fs_visitor::opt_algebraic()
2320 {
2321 bool progress = false;
2322
2323 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2324 switch (inst->opcode) {
2325 case BRW_OPCODE_MOV:
2326 if (inst->src[0].file != IMM)
2327 break;
2328
2329 if (inst->saturate) {
2330 if (inst->dst.type != inst->src[0].type)
2331 assert(!"unimplemented: saturate mixed types");
2332
2333 if (brw_saturate_immediate(inst->dst.type,
2334 &inst->src[0].fixed_hw_reg)) {
2335 inst->saturate = false;
2336 progress = true;
2337 }
2338 }
2339 break;
2340
2341 case BRW_OPCODE_MUL:
2342 if (inst->src[1].file != IMM)
2343 continue;
2344
2345 /* a * 1.0 = a */
2346 if (inst->src[1].is_one()) {
2347 inst->opcode = BRW_OPCODE_MOV;
2348 inst->src[1] = reg_undef;
2349 progress = true;
2350 break;
2351 }
2352
2353 /* a * -1.0 = -a */
2354 if (inst->src[1].is_negative_one()) {
2355 inst->opcode = BRW_OPCODE_MOV;
2356 inst->src[0].negate = !inst->src[0].negate;
2357 inst->src[1] = reg_undef;
2358 progress = true;
2359 break;
2360 }
2361
2362 /* a * 0.0 = 0.0 */
2363 if (inst->src[1].is_zero()) {
2364 inst->opcode = BRW_OPCODE_MOV;
2365 inst->src[0] = inst->src[1];
2366 inst->src[1] = reg_undef;
2367 progress = true;
2368 break;
2369 }
2370
2371 if (inst->src[0].file == IMM) {
2372 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2373 inst->opcode = BRW_OPCODE_MOV;
2374 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2375 inst->src[1] = reg_undef;
2376 progress = true;
2377 break;
2378 }
2379 break;
2380 case BRW_OPCODE_ADD:
2381 if (inst->src[1].file != IMM)
2382 continue;
2383
2384 /* a + 0.0 = a */
2385 if (inst->src[1].is_zero()) {
2386 inst->opcode = BRW_OPCODE_MOV;
2387 inst->src[1] = reg_undef;
2388 progress = true;
2389 break;
2390 }
2391
2392 if (inst->src[0].file == IMM) {
2393 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2394 inst->opcode = BRW_OPCODE_MOV;
2395 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2396 inst->src[1] = reg_undef;
2397 progress = true;
2398 break;
2399 }
2400 break;
2401 case BRW_OPCODE_OR:
2402 if (inst->src[0].equals(inst->src[1])) {
2403 inst->opcode = BRW_OPCODE_MOV;
2404 inst->src[1] = reg_undef;
2405 progress = true;
2406 break;
2407 }
2408 break;
2409 case BRW_OPCODE_LRP:
2410 if (inst->src[1].equals(inst->src[2])) {
2411 inst->opcode = BRW_OPCODE_MOV;
2412 inst->src[0] = inst->src[1];
2413 inst->src[1] = reg_undef;
2414 inst->src[2] = reg_undef;
2415 progress = true;
2416 break;
2417 }
2418 break;
2419 case BRW_OPCODE_CMP:
2420 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2421 inst->src[0].abs &&
2422 inst->src[0].negate &&
2423 inst->src[1].is_zero()) {
2424 inst->src[0].abs = false;
2425 inst->src[0].negate = false;
2426 inst->conditional_mod = BRW_CONDITIONAL_Z;
2427 progress = true;
2428 break;
2429 }
2430 break;
2431 case BRW_OPCODE_SEL:
2432 if (inst->src[0].equals(inst->src[1])) {
2433 inst->opcode = BRW_OPCODE_MOV;
2434 inst->src[1] = reg_undef;
2435 inst->predicate = BRW_PREDICATE_NONE;
2436 inst->predicate_inverse = false;
2437 progress = true;
2438 } else if (inst->saturate && inst->src[1].file == IMM) {
2439 switch (inst->conditional_mod) {
2440 case BRW_CONDITIONAL_LE:
2441 case BRW_CONDITIONAL_L:
2442 switch (inst->src[1].type) {
2443 case BRW_REGISTER_TYPE_F:
2444 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2445 inst->opcode = BRW_OPCODE_MOV;
2446 inst->src[1] = reg_undef;
2447 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2448 progress = true;
2449 }
2450 break;
2451 default:
2452 break;
2453 }
2454 break;
2455 case BRW_CONDITIONAL_GE:
2456 case BRW_CONDITIONAL_G:
2457 switch (inst->src[1].type) {
2458 case BRW_REGISTER_TYPE_F:
2459 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2460 inst->opcode = BRW_OPCODE_MOV;
2461 inst->src[1] = reg_undef;
2462 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2463 progress = true;
2464 }
2465 break;
2466 default:
2467 break;
2468             }
                 break;
2469          default:
2470 break;
2471 }
2472 }
2473 break;
2474 case BRW_OPCODE_MAD:
2475 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2476 inst->opcode = BRW_OPCODE_MOV;
2477 inst->src[1] = reg_undef;
2478 inst->src[2] = reg_undef;
2479 progress = true;
2480       } else if (inst->src[0].is_zero()) {
2481          inst->opcode = BRW_OPCODE_MUL;
2482          inst->src[0] = inst->src[2];
2483          inst->src[2] = reg_undef;
               progress = true;
2484 } else if (inst->src[1].is_one()) {
2485 inst->opcode = BRW_OPCODE_ADD;
2486 inst->src[1] = inst->src[2];
2487 inst->src[2] = reg_undef;
2488 progress = true;
2489 } else if (inst->src[2].is_one()) {
2490 inst->opcode = BRW_OPCODE_ADD;
2491 inst->src[2] = reg_undef;
2492 progress = true;
2493 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2494 inst->opcode = BRW_OPCODE_ADD;
2495 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2496 inst->src[2] = reg_undef;
2497 progress = true;
2498 }
2499 break;
2500 case SHADER_OPCODE_RCP: {
2501 fs_inst *prev = (fs_inst *)inst->prev;
2502 if (prev->opcode == SHADER_OPCODE_SQRT) {
2503 if (inst->src[0].equals(prev->dst)) {
2504 inst->opcode = SHADER_OPCODE_RSQ;
2505 inst->src[0] = prev->src[0];
2506 progress = true;
2507 }
2508 }
2509 break;
2510 }
2511 default:
2512 break;
2513 }
2514 }
2515
2516 return progress;
2517 }
2518
2519 bool
2520 fs_visitor::opt_register_renaming()
2521 {
2522 bool progress = false;
2523 int depth = 0;
2524
2525 int remap[alloc.count];
2526 memset(remap, -1, sizeof(int) * alloc.count);
2527
2528 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2529 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2530 depth++;
2531 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2532 inst->opcode == BRW_OPCODE_WHILE) {
2533 depth--;
2534 }
2535
2536 /* Rewrite instruction sources. */
2537 for (int i = 0; i < inst->sources; i++) {
2538 if (inst->src[i].file == GRF &&
2539 remap[inst->src[i].reg] != -1 &&
2540 remap[inst->src[i].reg] != inst->src[i].reg) {
2541 inst->src[i].reg = remap[inst->src[i].reg];
2542 progress = true;
2543 }
2544 }
2545
2546 const int dst = inst->dst.reg;
2547
2548 if (depth == 0 &&
2549 inst->dst.file == GRF &&
2550 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2551 !inst->is_partial_write()) {
2552 if (remap[dst] == -1) {
2553 remap[dst] = dst;
2554 } else {
2555 remap[dst] = alloc.allocate(inst->dst.width / 8);
2556 inst->dst.reg = remap[dst];
2557 progress = true;
2558 }
2559 } else if (inst->dst.file == GRF &&
2560 remap[dst] != -1 &&
2561 remap[dst] != dst) {
2562 inst->dst.reg = remap[dst];
2563 progress = true;
2564 }
2565 }
2566
2567 if (progress) {
2568 invalidate_live_intervals();
2569
2570 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2571 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2572 delta_x[i].reg = remap[delta_x[i].reg];
2573 }
2574 }
2575 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2576 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2577 delta_y[i].reg = remap[delta_y[i].reg];
2578 }
2579 }
2580 }
2581
2582 return progress;
2583 }
2584
2585 /**
2586 * Remove redundant or useless discard jumps.
2587 *
2588 * For example, we can eliminate jumps in the following sequence:
2589 *
2590 * discard-jump (redundant with the next jump)
2591 * discard-jump (useless; jumps to the next instruction)
2592 * placeholder-halt
2593 */
2594 bool
2595 fs_visitor::opt_redundant_discard_jumps()
2596 {
2597 bool progress = false;
2598
2599 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2600
2601 fs_inst *placeholder_halt = NULL;
2602 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2603 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2604 placeholder_halt = inst;
2605 break;
2606 }
2607 }
2608
2609 if (!placeholder_halt)
2610 return false;
2611
2612 /* Delete any HALTs immediately before the placeholder halt. */
2613 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2614 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2615 prev = (fs_inst *) placeholder_halt->prev) {
2616 prev->remove(last_bblock);
2617 progress = true;
2618 }
2619
2620 if (progress)
2621 invalidate_live_intervals();
2622
2623 return progress;
2624 }
2625
2626 bool
2627 fs_visitor::compute_to_mrf()
2628 {
2629 bool progress = false;
2630 int next_ip = 0;
2631
2632 /* No MRFs on Gen >= 7. */
2633 if (brw->gen >= 7)
2634 return false;
2635
2636 calculate_live_intervals();
2637
2638 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2639 int ip = next_ip;
2640 next_ip++;
2641
2642 if (inst->opcode != BRW_OPCODE_MOV ||
2643 inst->is_partial_write() ||
2644 inst->dst.file != MRF || inst->src[0].file != GRF ||
2645 inst->dst.type != inst->src[0].type ||
2646 inst->src[0].abs || inst->src[0].negate ||
2647 !inst->src[0].is_contiguous() ||
2648 inst->src[0].subreg_offset)
2649 continue;
2650
2651 /* Work out which hardware MRF registers are written by this
2652 * instruction.
2653 */
2654 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2655 int mrf_high;
2656 if (inst->dst.reg & BRW_MRF_COMPR4) {
2657 mrf_high = mrf_low + 4;
2658 } else if (inst->exec_size == 16) {
2659 mrf_high = mrf_low + 1;
2660 } else {
2661 mrf_high = mrf_low;
2662 }
2663
2664 /* Can't compute-to-MRF this GRF if someone else was going to
2665 * read it later.
2666 */
2667 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2668 continue;
2669
2670 /* Found a move of a GRF to a MRF. Let's see if we can go
2671 * rewrite the thing that made this GRF to write into the MRF.
2672 */
2673 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2674 if (scan_inst->dst.file == GRF &&
2675 scan_inst->dst.reg == inst->src[0].reg) {
2676 /* Found the last thing to write our reg we want to turn
2677 * into a compute-to-MRF.
2678 */
2679
2680 /* If this one instruction didn't populate all the
2681 * channels, bail. We might be able to rewrite everything
2682 * that writes that reg, but it would require smarter
2683 * tracking to delay the rewriting until complete success.
2684 */
2685 if (scan_inst->is_partial_write())
2686 break;
2687
2688             /* Instructions that write more than one register would require
2689              * us to coalesce out more than one MOV at a time.
2690 */
2691 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2692 break;
2693
2694 /* SEND instructions can't have MRF as a destination. */
2695 if (scan_inst->mlen)
2696 break;
2697
2698 if (brw->gen == 6) {
2699 /* gen6 math instructions must have the destination be
2700 * GRF, so no compute-to-MRF for them.
2701 */
2702 if (scan_inst->is_math()) {
2703 break;
2704 }
2705 }
2706
2707 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2708 /* Found the creator of our MRF's source value. */
2709 scan_inst->dst.file = MRF;
2710 scan_inst->dst.reg = inst->dst.reg;
2711 scan_inst->saturate |= inst->saturate;
2712 inst->remove(block);
2713 progress = true;
2714 }
2715 break;
2716 }
2717
2718 /* We don't handle control flow here. Most computation of
2719 * values that end up in MRFs are shortly before the MRF
2720 * write anyway.
2721 */
2722 if (block->start() == scan_inst)
2723 break;
2724
2725 /* You can't read from an MRF, so if someone else reads our
2726 * MRF's source GRF that we wanted to rewrite, that stops us.
2727 */
2728 bool interfered = false;
2729 for (int i = 0; i < scan_inst->sources; i++) {
2730 if (scan_inst->src[i].file == GRF &&
2731 scan_inst->src[i].reg == inst->src[0].reg &&
2732 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2733 interfered = true;
2734 }
2735 }
2736 if (interfered)
2737 break;
2738
2739 if (scan_inst->dst.file == MRF) {
2740 /* If somebody else writes our MRF here, we can't
2741 * compute-to-MRF before that.
2742 */
2743 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2744 int scan_mrf_high;
2745
2746 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2747 scan_mrf_high = scan_mrf_low + 4;
2748 } else if (scan_inst->exec_size == 16) {
2749 scan_mrf_high = scan_mrf_low + 1;
2750 } else {
2751 scan_mrf_high = scan_mrf_low;
2752 }
2753
2754 if (mrf_low == scan_mrf_low ||
2755 mrf_low == scan_mrf_high ||
2756 mrf_high == scan_mrf_low ||
2757 mrf_high == scan_mrf_high) {
2758 break;
2759 }
2760 }
2761
2762 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2763 /* Found a SEND instruction, which means that there are
2764 * live values in MRFs from base_mrf to base_mrf +
2765 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2766 * above it.
2767 */
2768 if (mrf_low >= scan_inst->base_mrf &&
2769 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2770 break;
2771 }
2772 if (mrf_high >= scan_inst->base_mrf &&
2773 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2774 break;
2775 }
2776 }
2777 }
2778 }
2779
2780 if (progress)
2781 invalidate_live_intervals();
2782
2783 return progress;
2784 }
2785
2786 /**
2787  * Emit a replicated-data clear shader: a single MOV of the clear color
2788  * into the message payload, followed by one FS_OPCODE_REP_FB_WRITE per
2789  * color region.
2789 */
2790 void
2791 fs_visitor::emit_repclear_shader()
2792 {
2793 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2794 int base_mrf = 1;
2795 int color_mrf = base_mrf + 2;
2796
2797 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2798 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2799 mov->force_writemask_all = true;
2800
2801 fs_inst *write;
2802 if (key->nr_color_regions == 1) {
2803 write = emit(FS_OPCODE_REP_FB_WRITE);
2804 write->saturate = key->clamp_fragment_color;
2805 write->base_mrf = color_mrf;
2806 write->target = 0;
2807 write->header_present = false;
2808 write->mlen = 1;
2809 } else {
2810 assume(key->nr_color_regions > 0);
2811 for (int i = 0; i < key->nr_color_regions; ++i) {
2812 write = emit(FS_OPCODE_REP_FB_WRITE);
2813 write->saturate = key->clamp_fragment_color;
2814 write->base_mrf = base_mrf;
2815 write->target = i;
2816 write->header_present = true;
2817 write->mlen = 3;
2818 }
2819 }
2820 write->eot = true;
2821
2822 calculate_cfg();
2823
2824 assign_constant_locations();
2825 assign_curb_setup();
2826
2827 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2828 assert(mov->src[0].file == HW_REG);
2829 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2830 }
2831
2832 /**
2833 * Walks through basic blocks, looking for repeated MRF writes and
2834 * removing the later ones.
2835 */
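/* Illustrative example (hypothetical IR):
 *
 *    mov m2, vgrf5
 *    ...                <- no control flow, no writes to vgrf5 or m2, no
 *                          SEND touching m2
 *    mov m2, vgrf5      <- removed; m2 already holds this value
 */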
2836 bool
2837 fs_visitor::remove_duplicate_mrf_writes()
2838 {
2839 fs_inst *last_mrf_move[16];
2840 bool progress = false;
2841
2842 /* Need to update the MRF tracking for compressed instructions. */
2843 if (dispatch_width == 16)
2844 return false;
2845
2846 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2847
2848 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2849 if (inst->is_control_flow()) {
2850 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2851 }
2852
2853 if (inst->opcode == BRW_OPCODE_MOV &&
2854 inst->dst.file == MRF) {
2855 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2856 if (prev_inst && inst->equals(prev_inst)) {
2857 inst->remove(block);
2858 progress = true;
2859 continue;
2860 }
2861 }
2862
2863 /* Clear out the last-write records for MRFs that were overwritten. */
2864 if (inst->dst.file == MRF) {
2865 last_mrf_move[inst->dst.reg] = NULL;
2866 }
2867
2868 if (inst->mlen > 0 && inst->base_mrf != -1) {
2869 /* Found a SEND instruction, which will include two or fewer
2870 * implied MRF writes. We could do better here.
2871 */
2872 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2873 last_mrf_move[inst->base_mrf + i] = NULL;
2874 }
2875 }
2876
2877 /* Clear out any MRF move records whose sources got overwritten. */
2878 if (inst->dst.file == GRF) {
2879 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2880 if (last_mrf_move[i] &&
2881 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2882 last_mrf_move[i] = NULL;
2883 }
2884 }
2885 }
2886
2887 if (inst->opcode == BRW_OPCODE_MOV &&
2888 inst->dst.file == MRF &&
2889 inst->src[0].file == GRF &&
2890 !inst->is_partial_write()) {
2891 last_mrf_move[inst->dst.reg] = inst;
2892 }
2893 }
2894
2895 if (progress)
2896 invalidate_live_intervals();
2897
2898 return progress;
2899 }
2900
2901 static void
2902 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2903 {
2904 /* Clear the flag for registers that actually got read (as expected). */
2905 for (int i = 0; i < inst->sources; i++) {
2906 int grf;
2907 if (inst->src[i].file == GRF) {
2908 grf = inst->src[i].reg;
2909 } else if (inst->src[i].file == HW_REG &&
2910 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2911 grf = inst->src[i].fixed_hw_reg.nr;
2912 } else {
2913 continue;
2914 }
2915
2916 if (grf >= first_grf &&
2917 grf < first_grf + grf_len) {
2918 deps[grf - first_grf] = false;
2919 if (inst->exec_size == 16)
2920 deps[grf - first_grf + 1] = false;
2921 }
2922 }
2923 }
2924
2925 /**
2926 * Implements this workaround for the original 965:
2927 *
2928 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2929 * check for post destination dependencies on this instruction, software
2930 * must ensure that there is no destination hazard for the case of ‘write
2931 * followed by a posted write’ shown in the following example.
2932 *
2933 * 1. mov r3 0
2934 * 2. send r3.xy <rest of send instruction>
2935 * 3. mov r2 r3
2936 *
2937 * Due to no post-destination dependency check on the ‘send’, the above
2938 * code sequence could have two instructions (1 and 2) in flight at the
2939 * same time that both consider ‘r3’ as the target of their final writes.
2940 */
2941 void
2942 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2943 fs_inst *inst)
2944 {
2945 int write_len = inst->regs_written;
2946 int first_write_grf = inst->dst.reg;
2947 bool needs_dep[BRW_MAX_MRF];
2948 assert(write_len < (int)sizeof(needs_dep) - 1);
2949
2950 memset(needs_dep, false, sizeof(needs_dep));
2951 memset(needs_dep, true, write_len);
2952
2953 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2954
2955 /* Walk backwards looking for writes to registers we're writing which
2956 * aren't read since being written. If we hit the start of the program,
2957 * we assume that there are no outstanding dependencies on entry to the
2958 * program.
2959 */
2960 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2961 /* If we hit control flow, assume that there *are* outstanding
2962 * dependencies, and force their cleanup before our instruction.
2963 */
2964 if (block->start() == scan_inst) {
2965 for (int i = 0; i < write_len; i++) {
2966 if (needs_dep[i]) {
2967 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2968 }
2969 }
2970 return;
2971 }
2972
2973 /* We insert our reads as late as possible on the assumption that any
2974 * instruction but a MOV that might have left us an outstanding
2975 * dependency has more latency than a MOV.
2976 */
2977 if (scan_inst->dst.file == GRF) {
2978 for (int i = 0; i < scan_inst->regs_written; i++) {
2979 int reg = scan_inst->dst.reg + i;
2980
2981 if (reg >= first_write_grf &&
2982 reg < first_write_grf + write_len &&
2983 needs_dep[reg - first_write_grf]) {
2984 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2985 needs_dep[reg - first_write_grf] = false;
2986 if (scan_inst->exec_size == 16)
2987 needs_dep[reg - first_write_grf + 1] = false;
2988 }
2989 }
2990 }
2991
2992 /* Clear the flag for registers that actually got read (as expected). */
2993 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2994
2995 /* Continue the loop only if we haven't resolved all the dependencies */
2996 int i;
2997 for (i = 0; i < write_len; i++) {
2998 if (needs_dep[i])
2999 break;
3000 }
3001 if (i == write_len)
3002 return;
3003 }
3004 }
3005
3006 /**
3007 * Implements this workaround for the original 965:
3008 *
3009 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3010 * used as a destination register until after it has been sourced by an
3011 * instruction with a different destination register.
3012 */
3013 void
3014 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3015 {
3016 int write_len = inst->regs_written;
3017 int first_write_grf = inst->dst.reg;
3018 bool needs_dep[BRW_MAX_MRF];
3019 assert(write_len < (int)sizeof(needs_dep) - 1);
3020
3021 memset(needs_dep, false, sizeof(needs_dep));
3022 memset(needs_dep, true, write_len);
3023 /* Walk forwards looking for writes to registers we're writing which aren't
3024 * read before being written.
3025 */
3026 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3027 /* If we hit control flow, force resolve all remaining dependencies. */
3028 if (block->end() == scan_inst) {
3029 for (int i = 0; i < write_len; i++) {
3030 if (needs_dep[i])
3031 scan_inst->insert_before(block,
3032 DEP_RESOLVE_MOV(first_write_grf + i));
3033 }
3034 return;
3035 }
3036
3037 /* Clear the flag for registers that actually got read (as expected). */
3038 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3039
3040 /* We insert our reads as late as possible since they're reading the
3041 * result of a SEND, which has massive latency.
3042 */
3043 if (scan_inst->dst.file == GRF &&
3044 scan_inst->dst.reg >= first_write_grf &&
3045 scan_inst->dst.reg < first_write_grf + write_len &&
3046 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3047 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3048 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3049 }
3050
3051 /* Continue the loop only if we haven't resolved all the dependencies */
3052 int i;
3053 for (i = 0; i < write_len; i++) {
3054 if (needs_dep[i])
3055 break;
3056 }
3057 if (i == write_len)
3058 return;
3059 }
3060 }
3061
3062 void
3063 fs_visitor::insert_gen4_send_dependency_workarounds()
3064 {
3065 if (brw->gen != 4 || brw->is_g4x)
3066 return;
3067
3068 bool progress = false;
3069
3070 /* Note that we're done with register allocation, so GRF fs_regs always
3071 * have a .reg_offset of 0.
3072 */
3073
3074 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3075 if (inst->mlen != 0 && inst->dst.file == GRF) {
3076 insert_gen4_pre_send_dependency_workarounds(block, inst);
3077 insert_gen4_post_send_dependency_workarounds(block, inst);
3078 progress = true;
3079 }
3080 }
3081
3082 if (progress)
3083 invalidate_live_intervals();
3084 }
3085
3086 /**
3087 * Turns the generic expression-style uniform pull constant load instruction
3088 * into a hardware-specific series of instructions for loading a pull
3089 * constant.
3090 *
3091 * The expression style allows the CSE pass before this to optimize out
3092 * repeated loads from the same offset, and gives the pre-register-allocation
3093 * scheduling full flexibility, while the conversion to native instructions
3094 * allows the post-register-allocation scheduler the best information
3095 * possible.
3096 *
3097 * Note that execution masking for setting up pull constant loads is special:
3098 * the channels that need to be written are unrelated to the current execution
3099 * mask, since a later instruction will use one of the result channels as a
3100 * source operand for all 8 or 16 of its channels.
3101 */
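/* Illustrative sketch (hypothetical values) of the lowering below: on Gen7+,
 * a UNIFORM_PULL_CONSTANT_LOAD with a byte offset of 48 becomes an
 * FS_OPCODE_SET_SIMD4X2_OFFSET writing the dword offset 12 into a freshly
 * allocated payload GRF, followed by UNIFORM_PULL_CONSTANT_LOAD_GEN7 sourcing
 * that payload.  On Gen4-6 the instruction simply gets base_mrf = 14 and
 * mlen = 1.
 */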
3102 void
3103 fs_visitor::lower_uniform_pull_constant_loads()
3104 {
3105 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3106 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3107 continue;
3108
3109 if (brw->gen >= 7) {
3110 /* The offset arg before was a vec4-aligned byte offset. We need to
3111 * turn it into a dword offset.
3112 */
3113 fs_reg const_offset_reg = inst->src[1];
3114 assert(const_offset_reg.file == IMM &&
3115 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3116 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3117 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3118
3119 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3120 * Reserve space for the register.
3121 */
3122 if (brw->gen >= 9) {
3123 payload.reg_offset++;
3124 alloc.sizes[payload.reg] = 2;
3125 }
3126
3127 /* This is actually going to be a MOV, but since only the first dword
3128 * is accessed, we have a special opcode to do just that one. Note
3129 * that this needs to be an operation that will be considered a def
3130 * by live variable analysis, or register allocation will explode.
3131 */
3132 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3133 8, payload, const_offset_reg);
3134 setup->force_writemask_all = true;
3135
3136 setup->ir = inst->ir;
3137 setup->annotation = inst->annotation;
3138 inst->insert_before(block, setup);
3139
3140 /* Similarly, this will only populate the first 4 channels of the
3141 * result register (since we only use smear values from 0-3), but we
3142 * don't tell the optimizer.
3143 */
3144 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3145 inst->src[1] = payload;
3146
3147 invalidate_live_intervals();
3148 } else {
3149 /* Before register allocation, we didn't tell the scheduler about the
3150 * MRF we use. We know it's safe to use this MRF because nothing
3151 * else does except for register spill/unspill, which generates and
3152 * uses its MRF within a single IR instruction.
3153 */
3154 inst->base_mrf = 14;
3155 inst->mlen = 1;
3156 }
3157 }
3158 }
3159
3160 bool
3161 fs_visitor::lower_load_payload()
3162 {
3163 bool progress = false;
3164
3165 int vgrf_to_reg[alloc.count];
3166 int reg_count = 0;
3167 for (unsigned i = 0; i < alloc.count; ++i) {
3168 vgrf_to_reg[i] = reg_count;
3169 reg_count += alloc.sizes[i];
3170 }
3171
3172 struct {
3173 bool written:1; /* Whether this register has ever been written */
3174 bool force_writemask_all:1;
3175 bool force_sechalf:1;
3176 } metadata[reg_count];
3177 memset(metadata, 0, sizeof(metadata));
3178
3179 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3180 if (inst->dst.file == GRF) {
3181 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3182 bool force_sechalf = inst->force_sechalf &&
3183 !inst->force_writemask_all;
3184 bool toggle_sechalf = inst->dst.width == 16 &&
3185 type_sz(inst->dst.type) == 4 &&
3186 !inst->force_writemask_all;
3187 for (int i = 0; i < inst->regs_written; ++i) {
3188 metadata[dst_reg + i].written = true;
3189 metadata[dst_reg + i].force_sechalf = force_sechalf;
3190 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3191 force_sechalf = (toggle_sechalf != force_sechalf);
3192 }
3193 }
3194
3195 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3196 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3197 fs_reg dst = inst->dst;
3198
3199 for (int i = 0; i < inst->sources; i++) {
3200 dst.width = inst->src[i].effective_width;
3201 dst.type = inst->src[i].type;
3202
3203 if (inst->src[i].file == BAD_FILE) {
3204                /* Emit nothing, but still advance the destination offset below. */
3205 } else if (dst.file == MRF &&
3206 dst.width == 8 &&
3207 brw->has_compr4 &&
3208 i + 4 < inst->sources &&
3209 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3210 fs_reg compr4_dst = dst;
3211 compr4_dst.reg += BRW_MRF_COMPR4;
3212 compr4_dst.width = 16;
3213 fs_reg compr4_src = inst->src[i];
3214 compr4_src.width = 16;
3215 fs_inst *mov = MOV(compr4_dst, compr4_src);
3216 mov->force_writemask_all = true;
3217 inst->insert_before(block, mov);
3218 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3219 inst->src[i + 4].file = BAD_FILE;
3220 } else {
3221 fs_inst *mov = MOV(dst, inst->src[i]);
3222 if (inst->src[i].file == GRF) {
3223 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3224 inst->src[i].reg_offset;
3225 mov->force_sechalf = metadata[src_reg].force_sechalf;
3226 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3227 } else {
3228 /* We don't have any useful metadata for immediates or
3229 * uniforms. Assume that any of the channels of the
3230 * destination may be used.
3231 */
3232 assert(inst->src[i].file == IMM ||
3233 inst->src[i].file == UNIFORM);
3234 mov->force_writemask_all = true;
3235 }
3236
3237 if (dst.file == GRF) {
3238 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3239 const bool force_writemask = mov->force_writemask_all;
3240 metadata[dst_reg].force_writemask_all = force_writemask;
3241 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3242 if (dst.width * type_sz(dst.type) > 32) {
3243 assert(!mov->force_sechalf);
3244 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3245 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3246 }
3247 }
3248
3249 inst->insert_before(block, mov);
3250 }
3251
3252 dst = offset(dst, 1);
3253 }
3254
3255 inst->remove(block);
3256 progress = true;
3257 }
3258 }
3259
3260 if (progress)
3261 invalidate_live_intervals();
3262
3263 return progress;
3264 }
3265
3266 void
3267 fs_visitor::dump_instructions()
3268 {
3269 dump_instructions(NULL);
3270 }
3271
3272 void
3273 fs_visitor::dump_instructions(const char *name)
3274 {
3275 FILE *file = stderr;
3276 if (name && geteuid() != 0) {
3277 file = fopen(name, "w");
3278 if (!file)
3279 file = stderr;
3280 }
3281
3282 if (cfg) {
3283 calculate_register_pressure();
3284 int ip = 0, max_pressure = 0;
3285 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3286 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3287 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3288 dump_instruction(inst, file);
3289 ip++;
3290 }
3291 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3292 } else {
3293 int ip = 0;
3294 foreach_in_list(backend_instruction, inst, &instructions) {
3295 fprintf(file, "%4d: ", ip++);
3296 dump_instruction(inst, file);
3297 }
3298 }
3299
3300 if (file != stderr) {
3301 fclose(file);
3302 }
3303 }
3304
3305 void
3306 fs_visitor::dump_instruction(backend_instruction *be_inst)
3307 {
3308 dump_instruction(be_inst, stderr);
3309 }
3310
3311 void
3312 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3313 {
3314 fs_inst *inst = (fs_inst *)be_inst;
3315
3316 if (inst->predicate) {
3317 fprintf(file, "(%cf0.%d) ",
3318 inst->predicate_inverse ? '-' : '+',
3319 inst->flag_subreg);
3320 }
3321
3322 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3323 if (inst->saturate)
3324 fprintf(file, ".sat");
3325 if (inst->conditional_mod) {
3326 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3327 if (!inst->predicate &&
3328 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3329 inst->opcode != BRW_OPCODE_IF &&
3330 inst->opcode != BRW_OPCODE_WHILE))) {
3331 fprintf(file, ".f0.%d", inst->flag_subreg);
3332 }
3333 }
3334 fprintf(file, "(%d) ", inst->exec_size);
3335
3336
3337 switch (inst->dst.file) {
3338 case GRF:
3339 fprintf(file, "vgrf%d", inst->dst.reg);
3340 if (inst->dst.width != dispatch_width)
3341 fprintf(file, "@%d", inst->dst.width);
3342 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3343 inst->dst.subreg_offset)
3344 fprintf(file, "+%d.%d",
3345 inst->dst.reg_offset, inst->dst.subreg_offset);
3346 break;
3347 case MRF:
3348 fprintf(file, "m%d", inst->dst.reg);
3349 break;
3350 case BAD_FILE:
3351 fprintf(file, "(null)");
3352 break;
3353 case UNIFORM:
3354 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3355 break;
3356 case ATTR:
3357 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3358 break;
3359 case HW_REG:
3360 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3361 switch (inst->dst.fixed_hw_reg.nr) {
3362 case BRW_ARF_NULL:
3363 fprintf(file, "null");
3364 break;
3365 case BRW_ARF_ADDRESS:
3366 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3367 break;
3368 case BRW_ARF_ACCUMULATOR:
3369 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3370 break;
3371 case BRW_ARF_FLAG:
3372 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3373 inst->dst.fixed_hw_reg.subnr);
3374 break;
3375 default:
3376 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3377 inst->dst.fixed_hw_reg.subnr);
3378 break;
3379 }
3380 } else {
3381 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3382 }
3383 if (inst->dst.fixed_hw_reg.subnr)
3384 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3385 break;
3386 default:
3387 fprintf(file, "???");
3388 break;
3389 }
3390 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3391
3392 for (int i = 0; i < inst->sources; i++) {
3393 if (inst->src[i].negate)
3394 fprintf(file, "-");
3395 if (inst->src[i].abs)
3396 fprintf(file, "|");
3397 switch (inst->src[i].file) {
3398 case GRF:
3399 fprintf(file, "vgrf%d", inst->src[i].reg);
3400 if (inst->src[i].width != dispatch_width)
3401 fprintf(file, "@%d", inst->src[i].width);
3402 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3403 inst->src[i].subreg_offset)
3404 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3405 inst->src[i].subreg_offset);
3406 break;
3407 case MRF:
3408 fprintf(file, "***m%d***", inst->src[i].reg);
3409 break;
3410 case ATTR:
3411 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3412 break;
3413 case UNIFORM:
3414 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3415 if (inst->src[i].reladdr) {
3416 fprintf(file, "+reladdr");
3417 } else if (inst->src[i].subreg_offset) {
3418 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3419 inst->src[i].subreg_offset);
3420 }
3421 break;
3422 case BAD_FILE:
3423 fprintf(file, "(null)");
3424 break;
3425 case IMM:
3426 switch (inst->src[i].type) {
3427 case BRW_REGISTER_TYPE_F:
3428 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3429 break;
3430 case BRW_REGISTER_TYPE_W:
3431 case BRW_REGISTER_TYPE_D:
3432 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3433 break;
3434 case BRW_REGISTER_TYPE_UW:
3435 case BRW_REGISTER_TYPE_UD:
3436 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3437 break;
3438 case BRW_REGISTER_TYPE_VF:
3439 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3440 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3441 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3442 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3443 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3444 break;
3445 default:
3446 fprintf(file, "???");
3447 break;
3448 }
3449 break;
3450 case HW_REG:
3451 if (inst->src[i].fixed_hw_reg.negate)
3452 fprintf(file, "-");
3453 if (inst->src[i].fixed_hw_reg.abs)
3454 fprintf(file, "|");
3455 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3456 switch (inst->src[i].fixed_hw_reg.nr) {
3457 case BRW_ARF_NULL:
3458 fprintf(file, "null");
3459 break;
3460 case BRW_ARF_ADDRESS:
3461 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3462 break;
3463 case BRW_ARF_ACCUMULATOR:
3464 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3465 break;
3466 case BRW_ARF_FLAG:
3467 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3468 inst->src[i].fixed_hw_reg.subnr);
3469 break;
3470 default:
3471 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3472 inst->src[i].fixed_hw_reg.subnr);
3473 break;
3474 }
3475 } else {
3476 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3477 }
3478 if (inst->src[i].fixed_hw_reg.subnr)
3479 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3480 if (inst->src[i].fixed_hw_reg.abs)
3481 fprintf(file, "|");
3482 break;
3483 default:
3484 fprintf(file, "???");
3485 break;
3486 }
3487 if (inst->src[i].abs)
3488 fprintf(file, "|");
3489
3490 if (inst->src[i].file != IMM) {
3491 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3492 }
3493
3494 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3495 fprintf(file, ", ");
3496 }
3497
3498 fprintf(file, " ");
3499
3500 if (dispatch_width == 16 && inst->exec_size == 8) {
3501 if (inst->force_sechalf)
3502 fprintf(file, "2ndhalf ");
3503 else
3504 fprintf(file, "1sthalf ");
3505 }
3506
3507 fprintf(file, "\n");
3508 }
3509
3510 /**
3511 * Possibly returns an instruction that set up @param reg.
3512 *
3513 * Sometimes we want to take the result of some expression/variable
3514 * dereference tree and rewrite the instruction generating the result
3515 * of the tree. When processing the tree, we know that the
3516 * instructions generated are all writing temporaries that are dead
3517 * outside of this tree. So, if we have some instructions that write
3518 * a temporary, we're free to point that temp write somewhere else.
3519 *
3520  * Note that this doesn't guarantee that the returned instruction wrote
3521  * only reg -- it might be the size=4 destination of a texture instruction.
3522 */
3523 fs_inst *
3524 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3525 fs_inst *end,
3526 const fs_reg &reg)
3527 {
3528 if (end == start ||
3529 end->is_partial_write() ||
3530 reg.reladdr ||
3531 !reg.equals(end->dst)) {
3532 return NULL;
3533 } else {
3534 return end;
3535 }
3536 }
3537
3538 void
3539 fs_visitor::setup_payload_gen6()
3540 {
3541 bool uses_depth =
3542 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3543 unsigned barycentric_interp_modes =
3544 (stage == MESA_SHADER_FRAGMENT) ?
3545 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3546
3547 assert(brw->gen >= 6);
3548
3549 /* R0-1: masks, pixel X/Y coordinates. */
3550 payload.num_regs = 2;
3551    /* R2: only for 32-pixel dispatch. */
3552
3553 /* R3-26: barycentric interpolation coordinates. These appear in the
3554 * same order that they appear in the brw_wm_barycentric_interp_mode
3555 * enum. Each set of coordinates occupies 2 registers if dispatch width
3556 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3557 * appear if they were enabled using the "Barycentric Interpolation
3558 * Mode" bits in WM_STATE.
3559 */
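   /* Illustrative example (hypothetical): a SIMD16 shader with two
    * barycentric modes enabled reserves 2 + 2 registers per mode here, so
    * payload.num_regs grows from 2 to 10 before the depth/W/sample-position
    * registers below are counted.
    */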
3560 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3561 if (barycentric_interp_modes & (1 << i)) {
3562 payload.barycentric_coord_reg[i] = payload.num_regs;
3563 payload.num_regs += 2;
3564 if (dispatch_width == 16) {
3565 payload.num_regs += 2;
3566 }
3567 }
3568 }
3569
3570 /* R27: interpolated depth if uses source depth */
3571 if (uses_depth) {
3572 payload.source_depth_reg = payload.num_regs;
3573 payload.num_regs++;
3574 if (dispatch_width == 16) {
3575 /* R28: interpolated depth if not SIMD8. */
3576 payload.num_regs++;
3577 }
3578 }
3579 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3580 if (uses_depth) {
3581 payload.source_w_reg = payload.num_regs;
3582 payload.num_regs++;
3583 if (dispatch_width == 16) {
3584 /* R30: interpolated W if not SIMD8. */
3585 payload.num_regs++;
3586 }
3587 }
3588
3589 if (stage == MESA_SHADER_FRAGMENT) {
3590 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3591 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3592 prog_data->uses_pos_offset = key->compute_pos_offset;
3593 /* R31: MSAA position offsets. */
3594 if (prog_data->uses_pos_offset) {
3595 payload.sample_pos_reg = payload.num_regs;
3596 payload.num_regs++;
3597 }
3598 }
3599
3600 /* R32: MSAA input coverage mask */
3601 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3602 assert(brw->gen >= 7);
3603 payload.sample_mask_in_reg = payload.num_regs;
3604 payload.num_regs++;
3605 if (dispatch_width == 16) {
3606 /* R33: input coverage mask if not SIMD8. */
3607 payload.num_regs++;
3608 }
3609 }
3610
3611 /* R34-: bary for 32-pixel. */
3612 /* R58-59: interp W for 32-pixel. */
3613
3614 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3615 source_depth_to_render_target = true;
3616 }
3617 }
3618
3619 void
3620 fs_visitor::setup_vs_payload()
3621 {
3622 /* R0: thread header, R1: urb handles */
3623 payload.num_regs = 2;
3624 }
3625
3626 void
3627 fs_visitor::assign_binding_table_offsets()
3628 {
3629 assert(stage == MESA_SHADER_FRAGMENT);
3630 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3631 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3632 uint32_t next_binding_table_offset = 0;
3633
3634 /* If there are no color regions, we still perform an FB write to a null
3635 * renderbuffer, which we place at surface index 0.
3636 */
3637 prog_data->binding_table.render_target_start = next_binding_table_offset;
3638 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3639
3640 assign_common_binding_table_offsets(next_binding_table_offset);
3641 }
3642
3643 void
3644 fs_visitor::calculate_register_pressure()
3645 {
3646 invalidate_live_intervals();
3647 calculate_live_intervals();
3648
3649 unsigned num_instructions = 0;
3650 foreach_block(block, cfg)
3651 num_instructions += block->instructions.length();
3652
3653 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3654
3655 for (unsigned reg = 0; reg < alloc.count; reg++) {
3656 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3657 regs_live_at_ip[ip] += alloc.sizes[reg];
3658 }
3659 }
3660
3661 void
3662 fs_visitor::optimize()
3663 {
3664 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3665
3666 split_virtual_grfs();
3667
3668 move_uniform_array_access_to_pull_constants();
3669 assign_constant_locations();
3670 demote_pull_constants();
3671
3672 #define OPT(pass, args...) ({ \
3673 pass_num++; \
3674 bool this_progress = pass(args); \
3675 \
3676 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3677 char filename[64]; \
3678 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3679 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3680 \
3681 backend_visitor::dump_instructions(filename); \
3682 } \
3683 \
3684 progress = progress || this_progress; \
3685 this_progress; \
3686 })
3687
3688 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3689 char filename[64];
3690 snprintf(filename, 64, "%s%d-%04d-00-start",
3691 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3692
3693 backend_visitor::dump_instructions(filename);
3694 }
3695
3696 bool progress;
3697 int iteration = 0;
3698 int pass_num = 0;
3699 do {
3700 progress = false;
3701 pass_num = 0;
3702 iteration++;
3703
3704 OPT(remove_duplicate_mrf_writes);
3705
3706 OPT(opt_algebraic);
3707 OPT(opt_cse);
3708 OPT(opt_copy_propagate);
3709 OPT(opt_peephole_predicated_break);
3710 OPT(opt_cmod_propagation);
3711 OPT(dead_code_eliminate);
3712 OPT(opt_peephole_sel);
3713 OPT(dead_control_flow_eliminate, this);
3714 OPT(opt_register_renaming);
3715 OPT(opt_redundant_discard_jumps);
3716 OPT(opt_saturate_propagation);
3717 OPT(register_coalesce);
3718 OPT(compute_to_mrf);
3719
3720 OPT(compact_virtual_grfs);
3721 } while (progress);
3722
3723 pass_num = 0;
3724
3725 if (OPT(lower_load_payload)) {
3726 split_virtual_grfs();
3727 OPT(register_coalesce);
3728 OPT(compute_to_mrf);
3729 OPT(dead_code_eliminate);
3730 }
3731
3732 OPT(opt_combine_constants);
3733
3734 lower_uniform_pull_constant_loads();
3735 }
3736
3737 /**
3738 * Three-source instructions must have a GRF/MRF destination register.
3739 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3740 */
3741 void
3742 fs_visitor::fixup_3src_null_dest()
3743 {
3744 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3745 if (inst->is_3src() && inst->dst.is_null()) {
3746 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3747 inst->dst.type);
3748 }
3749 }
3750 }
3751
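/**
 * Schedule and register-allocate the program.
 *
 * Each pre-RA scheduling heuristic is tried in turn until one allocates
 * without spilling; if none does, a SIMD16 compile is failed outright, while
 * a SIMD8 compile retries allocation with spilling enabled and then runs a
 * post-RA scheduling pass on the result.
 */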
3752 void
3753 fs_visitor::allocate_registers()
3754 {
3755 bool allocated_without_spills;
3756
3757 static const enum instruction_scheduler_mode pre_modes[] = {
3758 SCHEDULE_PRE,
3759 SCHEDULE_PRE_NON_LIFO,
3760 SCHEDULE_PRE_LIFO,
3761 };
3762
3763 /* Try each scheduling heuristic to see if it can successfully register
3764 * allocate without spilling.  They should be ordered from best expected
3765 * performance to highest likelihood of allocating successfully.
3766 */
3767 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3768 schedule_instructions(pre_modes[i]);
3769
3770 if (0) {
3771 assign_regs_trivial();
3772 allocated_without_spills = true;
3773 } else {
3774 allocated_without_spills = assign_regs(false);
3775 }
3776 if (allocated_without_spills)
3777 break;
3778 }
3779
3780 if (!allocated_without_spills) {
3781 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3782 "Vertex" : "Fragment";
3783
3784 /* We assume that any spilling is worse than just dropping back to
3785 * SIMD8.  In practice there is probably some intermediate point where
3786 * SIMD16 with a couple of spills is still a win.
3787 */
3788 if (dispatch_width == 16) {
3789 fail("Failure to register allocate.  Reduce the number of "
3790 "live scalar values to avoid this.");
3791 } else {
3792 perf_debug("%s shader triggered register spilling. "
3793 "Try reducing the number of live scalar values to "
3794 "improve performance.\n", stage_name);
3795 }
3796
3797 /* Since we're out of heuristics, just go spill registers until we
3798 * get an allocation.
3799 */
3800 while (!assign_regs(true)) {
3801 if (failed)
3802 break;
3803 }
3804 }
3805
3806 /* This must come after all optimization and register allocation, since
3807 * it inserts dead code that happens to have side effects, and it does
3808 * so based on the actual physical registers in use.
3809 */
3810 insert_gen4_send_dependency_workarounds();
3811
3812 if (failed)
3813 return;
3814
3815 if (!allocated_without_spills)
3816 schedule_instructions(SCHEDULE_POST);
3817
3818 if (last_scratch > 0)
3819 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3820 }
3821
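/**
 * Generate, optimize and register-allocate a vertex shader.
 *
 * Returns false if compilation failed at any point; fail_msg describes why.
 */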
3822 bool
3823 fs_visitor::run_vs()
3824 {
3825 assert(stage == MESA_SHADER_VERTEX);
3826
3827 assign_common_binding_table_offsets(0);
3828 setup_vs_payload();
3829
3830 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3831 emit_shader_time_begin();
3832
3833 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3834 base_ir = ir;
3835 this->result = reg_undef;
3836 ir->accept(this);
3837 }
3838 base_ir = NULL;
3839 if (failed)
3840 return false;
3841
3842 emit_urb_writes();
3843
3844 calculate_cfg();
3845
3846 optimize();
3847
3848 assign_curb_setup();
3849 assign_vs_urb_setup();
3850
3851 fixup_3src_null_dest();
3852 allocate_registers();
3853
3854 return !failed;
3855 }
3856
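/**
 * Generate, optimize and register-allocate a fragment shader at this
 * visitor's dispatch_width (8 or 16).
 *
 * The replicated-data clear shader takes a separate fast path; otherwise the
 * GLSL IR (or NIR, or Mesa IR fragment-program code) is visited, FB writes
 * are emitted, and the result is optimized and register-allocated.
 * Per-width register usage is recorded in reg_blocks / reg_blocks_16.
 */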
3857 bool
3858 fs_visitor::run_fs()
3859 {
3860 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3861 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3862
3863 assert(stage == MESA_SHADER_FRAGMENT);
3864
3865 sanity_param_count = prog->Parameters->NumParameters;
3866
3867 assign_binding_table_offsets();
3868
3869 if (brw->gen >= 6)
3870 setup_payload_gen6();
3871 else
3872 setup_payload_gen4();
3873
3874 if (0) {
3875 emit_dummy_fs();
3876 } else if (brw->use_rep_send && dispatch_width == 16) {
3877 emit_repclear_shader();
3878 } else {
3879 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3880 emit_shader_time_begin();
3881
3882 calculate_urb_setup();
3883 if (prog->InputsRead > 0) {
3884 if (brw->gen < 6)
3885 emit_interpolation_setup_gen4();
3886 else
3887 emit_interpolation_setup_gen6();
3888 }
3889
3890 /* We handle discards by keeping track of the still-live pixels in f0.1.
3891 * Initialize it with the dispatched pixels.
3892 */
3893 if (wm_prog_data->uses_kill) {
3894 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3895 discard_init->flag_subreg = 1;
3896 }
3897
3898 /* Generate FS IR for main().  (The visitor only descends into
3899 * functions named "main".)
3900 */
3901 if (shader) {
3902 if (getenv("INTEL_USE_NIR") != NULL) {
3903 emit_nir_code();
3904 } else {
3905 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3906 base_ir = ir;
3907 this->result = reg_undef;
3908 ir->accept(this);
3909 }
3910 }
3911 } else {
3912 emit_fragment_program_code();
3913 }
3914 base_ir = NULL;
3915 if (failed)
3916 return false;
3917
3918 emit(FS_OPCODE_PLACEHOLDER_HALT);
3919
3920 if (wm_key->alpha_test_func)
3921 emit_alpha_test();
3922
3923 emit_fb_writes();
3924
3925 calculate_cfg();
3926
3927 optimize();
3928
3929 assign_curb_setup();
3930 assign_urb_setup();
3931
3932 fixup_3src_null_dest();
3933 allocate_registers();
3934
3935 if (failed)
3936 return false;
3937 }
3938
3939 if (dispatch_width == 8)
3940 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3941 else
3942 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3943
3944 /* If any state parameters were appended, then ParameterValues could have
3945 * been realloced, in which case the driver uniform storage set up by
3946 * _mesa_associate_uniform_storage() would point to freed memory. Make
3947 * sure that didn't happen.
3948 */
3949 assert(sanity_param_count == prog->Parameters->NumParameters);
3950
3951 return !failed;
3952 }
3953
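/**
 * Compile a fragment shader to native code.
 *
 * A SIMD8 compile is always attempted first (failure here fails the whole
 * compile); on gen5+ a SIMD16 compile is also tried when the shader and the
 * DEBUG_NO16 / rep-send settings allow it, and when it succeeds its code is
 * emitted after the SIMD8 program with the offset stored in
 * prog_data->prog_offset_16.
 */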
3954 const unsigned *
3955 brw_wm_fs_emit(struct brw_context *brw,
3956 void *mem_ctx,
3957 const struct brw_wm_prog_key *key,
3958 struct brw_wm_prog_data *prog_data,
3959 struct gl_fragment_program *fp,
3960 struct gl_shader_program *prog,
3961 unsigned *final_assembly_size)
3962 {
3963 bool start_busy = false;
3964 double start_time = 0;
3965
3966 if (unlikely(brw->perf_debug)) {
3967 start_busy = (brw->batch.last_bo &&
3968 drm_intel_bo_busy(brw->batch.last_bo));
3969 start_time = get_time();
3970 }
3971
3972 struct brw_shader *shader = NULL;
3973 if (prog)
3974 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3975
3976 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3977 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3978
3979 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3980 */
3981 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3982 if (!v.run_fs()) {
3983 if (prog) {
3984 prog->LinkStatus = false;
3985 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3986 }
3987
3988 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3989 v.fail_msg);
3990
3991 return NULL;
3992 }
3993
3994 cfg_t *simd16_cfg = NULL;
3995 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3996 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3997 brw->use_rep_send)) {
3998 if (!v.simd16_unsupported) {
3999 /* Try a SIMD16 compile */
4000 v2.import_uniforms(&v);
4001 if (!v2.run_fs()) {
4002 perf_debug("SIMD16 shader failed to compile, falling back to "
4003 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4004 } else {
4005 simd16_cfg = v2.cfg;
4006 }
4007 } else {
4008 perf_debug("SIMD16 shader unsupported, falling back to "
4009 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4010 }
4011 }
4012
4013 cfg_t *simd8_cfg;
4014 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4015 if (no_simd8 && simd16_cfg) {
4016 simd8_cfg = NULL;
4017 prog_data->no_8 = true;
4018 } else {
4019 simd8_cfg = v.cfg;
4020 prog_data->no_8 = false;
4021 }
4022
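/* Code generation emits the surviving SIMD8 program first (at offset 0) and
 * the SIMD16 program after it, recording the latter's start in
 * prog_offset_16.  When SIMD8 is disabled (DEBUG_NO8 or brw->no_simd8) and a
 * SIMD16 program exists, the SIMD8 CFG is dropped and prog_data->no_8 lets
 * later state setup know that only the SIMD16 program is present.
 */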
4023 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4024 &fp->Base, v.runtime_check_aads_emit, "FS");
4025
4026 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4027 char *name;
4028 if (prog)
4029 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4030 prog->Label ? prog->Label : "unnamed",
4031 prog->Name);
4032 else
4033 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4034
4035 g.enable_debug(name);
4036 }
4037
4038 if (simd8_cfg)
4039 g.generate_code(simd8_cfg, 8);
4040 if (simd16_cfg)
4041 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4042
4043 if (unlikely(brw->perf_debug) && shader) {
4044 if (shader->compiled_once)
4045 brw_wm_debug_recompile(brw, prog, key);
4046 shader->compiled_once = true;
4047
4048 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4049 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4050 (get_time() - start_time) * 1000);
4051 }
4052 }
4053
4054 return g.get_assembly(final_assembly_size);
4055 }
4056
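/**
 * Precompile the fragment shader at link time.
 *
 * Builds a best-guess brw_wm_prog_key from the program alone (assuming depth
 * test/write on gen4-5, default shadow-sampler swizzles, etc.) and runs
 * do_wm_prog() with it so the likely variant is already in the program
 * cache, then restores the previous WM program state.
 */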
4057 extern "C" bool
4058 brw_fs_precompile(struct gl_context *ctx,
4059 struct gl_shader_program *shader_prog,
4060 struct gl_program *prog)
4061 {
4062 struct brw_context *brw = brw_context(ctx);
4063 struct brw_wm_prog_key key;
4064
4065 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4066 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4067 bool program_uses_dfdy = fp->UsesDFdy;
4068
4069 memset(&key, 0, sizeof(key));
4070
4071 if (brw->gen < 6) {
4072 if (fp->UsesKill)
4073 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4074
4075 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4076 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4077
4078 /* Just assume depth testing. */
4079 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4080 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4081 }
4082
4083 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4084 BRW_FS_VARYING_INPUT_MASK) > 16)
4085 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4086
4087 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4088 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4089 for (unsigned i = 0; i < sampler_count; i++) {
4090 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4091 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4092 key.tex.swizzles[i] =
4093 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4094 } else {
4095 /* Color sampler: assume no swizzling. */
4096 key.tex.swizzles[i] = SWIZZLE_XYZW;
4097 }
4098 }
4099
4100 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4101 key.drawable_height = ctx->DrawBuffer->Height;
4102 }
4103
4104 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4105 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4106 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4107
4108 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4109 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4110 key.nr_color_regions > 1;
4111 }
4112
4113 key.program_string_id = bfp->id;
4114
4115 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4116 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4117
4118 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
4119
4120 brw->wm.base.prog_offset = old_prog_offset;
4121 brw->wm.prog_data = old_prog_data;
4122
4123 return success;
4124 }