src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 #include <sys/types.h>
  32
  33 #include "util/hash_table.h"
  34 #include "main/macros.h"
  35 #include "main/shaderobj.h"
  36 #include "main/fbobject.h"
  37 #include "program/prog_parameter.h"
  38 #include "program/prog_print.h"
  39 #include "util/register_allocate.h"
  40 #include "program/hash_table.h"
  41 #include "brw_context.h"
  42 #include "brw_eu.h"
  43 #include "brw_wm.h"
  44 #include "brw_fs.h"
  45 #include "brw_cfg.h"
  46 #include "brw_dead_control_flow.h"
  47 #include "main/uniforms.h"
  48 #include "brw_fs_live_variables.h"
  49 #include "glsl/glsl_types.h"
  50 #include "program/sampler.h"
  51
  52 void
  53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
  54               const fs_reg *src, unsigned sources)
  55 {
  56    memset(this, 0, sizeof(*this));
  57
  58    this->src = new fs_reg[MAX2(sources, 3)];
  59    for (unsigned i = 0; i < sources; i++)
  60       this->src[i] = src[i];
  61
  62    this->opcode = opcode;
  63    this->dst = dst;
  64    this->sources = sources;
  65    this->exec_size = exec_size;
  66
  67    assert(dst.file != IMM && dst.file != UNIFORM);
  68
  69    /* If exec_size == 0, try to guess it from the registers.  Since all
  70     * manner of things may use hardware registers, we first try to guess
  71     * based on GRF registers.  If this fails, we will go ahead and take the
  72     * width from the destination register.
  73     */
  74    if (this->exec_size == 0) {
  75       if (dst.file == GRF) {
  76          this->exec_size = dst.width;
  77       } else {
  78          for (unsigned i = 0; i < sources; ++i) {
  79             if (src[i].file != GRF && src[i].file != ATTR)
  80                continue;
  81
  82             if (this->exec_size <= 1)
  83                this->exec_size = src[i].width;
  84             assert(src[i].width == 1 || src[i].width == this->exec_size);
  85          }
  86       }
  87
  88       if (this->exec_size == 0 && dst.file != BAD_FILE)
  89          this->exec_size = dst.width;
  90    }
  91    assert(this->exec_size != 0);
  92
  93    for (unsigned i = 0; i < sources; ++i) {
  94       switch (this->src[i].file) {
  95       case BAD_FILE:
  96          this->src[i].effective_width = 8;
  97          break;
  98       case GRF:
  99       case HW_REG:
 100       case ATTR:
 101          assert(this->src[i].width > 0);
 102          if (this->src[i].width == 1) {
 103             this->src[i].effective_width = this->exec_size;
 104          } else {
 105             this->src[i].effective_width = this->src[i].width;
 106          }
 107          break;
 108       case IMM:
 109       case UNIFORM:
 110          this->src[i].effective_width = this->exec_size;
 111          break;
 112       default:
 113          unreachable("Invalid source register file");
 114       }
 115    }
 116    this->dst.effective_width = this->exec_size;
 117
 118    this->conditional_mod = BRW_CONDITIONAL_NONE;
 119
 120    /* This will be the case for almost all instructions. */
 121    switch (dst.file) {
 122    case GRF:
 123    case HW_REG:
 124    case MRF:
 125    case ATTR:
 126       this->regs_written =
 127          DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
 128       break;
 129    case BAD_FILE:
 130       this->regs_written = 0;
 131       break;
 132    case IMM:
 133    case UNIFORM:
 134       unreachable("Invalid destination register file");
 135    default:
 136       unreachable("Invalid register file");
 137    }
 138
 139    this->writes_accumulator = false;
 140 }
 141
 142 fs_inst::fs_inst()
 143 {
 144    init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
 145 }
 146
 147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
 148 {
 149    init(opcode, exec_size, reg_undef, NULL, 0);
 150 }
 151
 152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
 153 {
 154    init(opcode, 0, dst, NULL, 0);
 155 }
 156
 157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 158                  const fs_reg &src0)
 159 {
 160    const fs_reg src[1] = { src0 };
 161    init(opcode, exec_size, dst, src, 1);
 162 }
 163
 164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
 165 {
 166    const fs_reg src[1] = { src0 };
 167    init(opcode, 0, dst, src, 1);
 168 }
 169
 170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 171                  const fs_reg &src0, const fs_reg &src1)
 172 {
 173    const fs_reg src[2] = { src0, src1 };
 174    init(opcode, exec_size, dst, src, 2);
 175 }
 176
 177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 178                  const fs_reg &src1)
 179 {
 180    const fs_reg src[2] = { src0, src1 };
 181    init(opcode, 0, dst, src, 2);
 182 }
 183
 184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 185                  const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
 186 {
 187    const fs_reg src[3] = { src0, src1, src2 };
 188    init(opcode, exec_size, dst, src, 3);
 189 }
 190
 191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 192                  const fs_reg &src1, const fs_reg &src2)
 193 {
 194    const fs_reg src[3] = { src0, src1, src2 };
 195    init(opcode, 0, dst, src, 3);
 196 }
 197
 198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
 199                  const fs_reg src[], unsigned sources)
 200 {
 201    init(opcode, 0, dst, src, sources);
 202 }
 203
 204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
 205                  const fs_reg src[], unsigned sources)
 206 {
 207    init(opcode, exec_width, dst, src, sources);
 208 }
 209
 210 fs_inst::fs_inst(const fs_inst &that)
 211 {
 212    memcpy(this, &that, sizeof(that));
 213
 214    this->src = new fs_reg[MAX2(that.sources, 3)];
 215
 216    for (unsigned i = 0; i < that.sources; i++)
 217       this->src[i] = that.src[i];
 218 }
 219
 220 fs_inst::~fs_inst()
 221 {
 222    delete[] this->src;
 223 }
 224
 225 void
 226 fs_inst::resize_sources(uint8_t num_sources)
 227 {
 228    if (this->sources != num_sources) {
 229       fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
 230
 231       for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
 232          src[i] = this->src[i];
 233
 234       delete[] this->src;
 235       this->src = src;
 236       this->sources = num_sources;
 237    }
 238 }
 239
 240 #define ALU1(op)                                                        \
 241    fs_inst *                                                            \
 242    fs_visitor::op(const fs_reg &dst, const fs_reg &src0)                \
 243    {                                                                    \
 244       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 245    }
 246
 247 #define ALU2(op)                                                        \
 248    fs_inst *                                                            \
 249    fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
 250                   const fs_reg &src1)                                   \
 251    {                                                                    \
 252       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 253    }
 254
 255 #define ALU2_ACC(op)                                                    \
 256    fs_inst *                                                            \
 257    fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
 258                   const fs_reg &src1)                                   \
 259    {                                                                    \
 260       fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
 261       inst->writes_accumulator = true;                                  \
 262       return inst;                                                      \
 263    }
 264
 265 #define ALU3(op)                                                        \
 266    fs_inst *                                                            \
 267    fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
 268                   const fs_reg &src1, const fs_reg &src2)               \
 269    {                                                                    \
 270       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
 271    }
 272
 273 ALU1(NOT)
 274 ALU1(MOV)
 275 ALU1(FRC)
 276 ALU1(RNDD)
 277 ALU1(RNDE)
 278 ALU1(RNDZ)
 279 ALU2(ADD)
 280 ALU2(MUL)
 281 ALU2_ACC(MACH)
 282 ALU2(AND)
 283 ALU2(OR)
 284 ALU2(XOR)
 285 ALU2(SHL)
 286 ALU2(SHR)
 287 ALU2(ASR)
 288 ALU3(LRP)
 289 ALU1(BFREV)
 290 ALU3(BFE)
 291 ALU2(BFI1)
 292 ALU3(BFI2)
 293 ALU1(FBH)
 294 ALU1(FBL)
 295 ALU1(CBIT)
 296 ALU3(MAD)
 297 ALU2_ACC(ADDC)
 298 ALU2_ACC(SUBB)
 299 ALU2(SEL)
 300 ALU2(MAC)
 301
 302 /** Gen4 predicated IF. */
 303 fs_inst *
 304 fs_visitor::IF(enum brw_predicate predicate)
 305 {
 306    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
 307    inst->predicate = predicate;
 308    return inst;
 309 }
 310
 311 /** Gen6 IF with embedded comparison. */
 312 fs_inst *
 313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
 314                enum brw_conditional_mod condition)
 315 {
 316    assert(brw->gen == 6);
 317    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
 318                                         reg_null_d, src0, src1);
 319    inst->conditional_mod = condition;
 320    return inst;
 321 }
 322
 323 /**
 324  * CMP: Sets the low bit of the destination channels with the result
 325  * of the comparison, while the upper bits are undefined, and updates
 326  * the flag register with the packed 16 bits of the result.
 327  */
 328 fs_inst *
 329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
 330                 enum brw_conditional_mod condition)
 331 {
 332    fs_inst *inst;
 333
 334    /* Take the instruction:
 335     *
 336     * CMP null<d> src0<f> src1<f>
 337     *
 338     * Original gen4 does type conversion to the destination type before
 339     * comparison, producing garbage results for floating point comparisons.
 340     *
 341     * The destination type doesn't matter on newer generations, so we set the
 342     * type to match src0 so we can compact the instruction.
 343     */
 344    dst.type = src0.type;
 345    if (dst.file == HW_REG)
 346       dst.fixed_hw_reg.type = dst.type;
 347
 348    resolve_ud_negate(&src0);
 349    resolve_ud_negate(&src1);
 350
 351    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 352    inst->conditional_mod = condition;
 353
 354    return inst;
 355 }
 356
 357 fs_inst *
 358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
 359 {
 360    uint8_t exec_size = dst.width;
 361    for (int i = 0; i < sources; ++i) {
 362       assert(src[i].width % dst.width == 0);
 363       if (src[i].width > exec_size)
 364          exec_size = src[i].width;
 365    }
 366
 367    fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
 368                                         dst, src, sources);
 369    inst->regs_written = 0;
 370    for (int i = 0; i < sources; ++i) {
 371       /* The LOAD_PAYLOAD instruction only really makes sense if we are
 372        * dealing with whole registers.  If this ever changes, we can deal
 373        * with it later.
 374        */
 375       int size = inst->src[i].effective_width * type_sz(src[i].type);
 376       assert(size % 32 == 0);
 377       inst->regs_written += (size + 31) / 32;
 378    }
 379
 380    return inst;
 381 }
 382
 383 exec_list
 384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
 385                                        const fs_reg &surf_index,
 386                                        const fs_reg &varying_offset,
 387                                        uint32_t const_offset)
 388 {
 389    exec_list instructions;
 390    fs_inst *inst;
 391
 392    /* We have our constant surface use a pitch of 4 bytes, so our index can
 393     * be any component of a vector, and then we load 4 contiguous
 394     * components starting from that.
 395     *
 396     * We break down the const_offset to a portion added to the variable
 397     * offset and a portion done using reg_offset, which means that if you
 398     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 399     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 400     * CSE can later notice that those loads are all the same and eliminate
 401     * the redundant ones.
 402     */
 403    fs_reg vec4_offset = vgrf(glsl_type::int_type);
 404    instructions.push_tail(ADD(vec4_offset,
 405                               varying_offset, fs_reg(const_offset & ~3)));
 406
 407    int scale = 1;
 408    if (brw->gen == 4 && dst.width == 8) {
 409       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 410        * u, v, r) as parameters, or we can just use the SIMD16 message
 411        * consisting of (header, u).  We choose the second, at the cost of a
 412        * longer return length.
 413        */
 414       scale = 2;
 415    }
 416
 417    enum opcode op;
 418    if (brw->gen >= 7)
 419       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 420    else
 421       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 422
 423    assert(dst.width % 8 == 0);
 424    int regs_written = 4 * (dst.width / 8) * scale;
 425    fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
 426                                dst.type, dst.width);
 427    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
 428    inst->regs_written = regs_written;
 429    instructions.push_tail(inst);
 430
 431    if (brw->gen < 7) {
 432       inst->base_mrf = 13;
 433       inst->header_present = true;
 434       if (brw->gen == 4)
 435          inst->mlen = 3;
 436       else
 437          inst->mlen = 1 + dispatch_width / 8;
 438    }
 439
 440    fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
 441    instructions.push_tail(MOV(dst, result));
 442
 443    return instructions;
 444 }
 445
 446 /**
 447  * A helper for MOV generation for fixing up broken hardware SEND dependency
 448  * handling.
 449  */
 450 fs_inst *
 451 fs_visitor::DEP_RESOLVE_MOV(int grf)
 452 {
 453    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 454
 455    inst->ir = NULL;
 456    inst->annotation = "send dependency resolve";
 457
 458    /* The caller always wants uncompressed to emit the minimal extra
 459     * dependencies, and to avoid having to deal with aligning its regs to 2.
 460     */
 461    inst->exec_size = 8;
 462
 463    return inst;
 464 }
 465
 466 bool
 467 fs_inst::equals(fs_inst *inst) const
 468 {
 469    return (opcode == inst->opcode &&
 470            dst.equals(inst->dst) &&
 471            src[0].equals(inst->src[0]) &&
 472            src[1].equals(inst->src[1]) &&
 473            src[2].equals(inst->src[2]) &&
 474            saturate == inst->saturate &&
 475            predicate == inst->predicate &&
 476            conditional_mod == inst->conditional_mod &&
 477            mlen == inst->mlen &&
 478            base_mrf == inst->base_mrf &&
 479            target == inst->target &&
 480            eot == inst->eot &&
 481            header_present == inst->header_present &&
 482            shadow_compare == inst->shadow_compare &&
 483            exec_size == inst->exec_size &&
 484            offset == inst->offset);
 485 }
 486
 487 bool
 488 fs_inst::overwrites_reg(const fs_reg &reg) const
 489 {
 490    return (reg.file == dst.file &&
 491            reg.reg == dst.reg &&
 492            reg.reg_offset >= dst.reg_offset  &&
 493            reg.reg_offset < dst.reg_offset + regs_written);
 494 }
 495
 496 bool
 497 fs_inst::is_send_from_grf() const
 498 {
 499    switch (opcode) {
 500    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
 501    case SHADER_OPCODE_SHADER_TIME_ADD:
 502    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
 503    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
 504    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
 505    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 506    case SHADER_OPCODE_UNTYPED_ATOMIC:
 507    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 508    case SHADER_OPCODE_URB_WRITE_SIMD8:
 509       return true;
 510    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 511       return src[1].file == GRF;
 512    case FS_OPCODE_FB_WRITE:
 513       return src[0].file == GRF;
 514    default:
 515       if (is_tex())
 516          return src[0].file == GRF;
 517
 518       return false;
 519    }
 520 }
 521
 522 bool
 523 fs_inst::can_do_source_mods(struct brw_context *brw)
 524 {
 525    if (brw->gen == 6 && is_math())
 526       return false;
 527
 528    if (is_send_from_grf())
 529       return false;
 530
 531    if (!backend_instruction::can_do_source_mods())
 532       return false;
 533
 534    return true;
 535 }
 536
 537 void
 538 fs_reg::init()
 539 {
 540    memset(this, 0, sizeof(*this));
 541    stride = 1;
 542 }
 543
 544 /** Generic unset register constructor. */
 545 fs_reg::fs_reg()
 546 {
 547    init();
 548    this->file = BAD_FILE;
 549 }
 550
 551 /** Immediate value constructor. */
 552 fs_reg::fs_reg(float f)
 553 {
 554    init();
 555    this->file = IMM;
 556    this->type = BRW_REGISTER_TYPE_F;
 557    this->fixed_hw_reg.dw1.f = f;
 558    this->width = 1;
 559 }
 560
 561 /** Immediate value constructor. */
 562 fs_reg::fs_reg(int32_t i)
 563 {
 564    init();
 565    this->file = IMM;
 566    this->type = BRW_REGISTER_TYPE_D;
 567    this->fixed_hw_reg.dw1.d = i;
 568    this->width = 1;
 569 }
 570
 571 /** Immediate value constructor. */
 572 fs_reg::fs_reg(uint32_t u)
 573 {
 574    init();
 575    this->file = IMM;
 576    this->type = BRW_REGISTER_TYPE_UD;
 577    this->fixed_hw_reg.dw1.ud = u;
 578    this->width = 1;
 579 }
 580
 581 /** Vector float immediate value constructor. */
 582 fs_reg::fs_reg(uint8_t vf[4])
 583 {
 584    init();
 585    this->file = IMM;
 586    this->type = BRW_REGISTER_TYPE_VF;
 587    memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
 588 }
 589
 590 /** Vector float immediate value constructor. */
 591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
 592 {
 593    init();
 594    this->file = IMM;
 595    this->type = BRW_REGISTER_TYPE_VF;
 596    this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
 597                                (vf1 <<  8) |
 598                                (vf2 << 16) |
 599                                (vf3 << 24);
 600 }
 601
 602 /** Fixed brw_reg. */
 603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 604 {
 605    init();
 606    this->file = HW_REG;
 607    this->fixed_hw_reg = fixed_hw_reg;
 608    this->type = fixed_hw_reg.type;
 609    this->width = 1 << fixed_hw_reg.width;
 610 }
 611
 612 bool
 613 fs_reg::equals(const fs_reg &r) const
 614 {
 615    return (file == r.file &&
 616            reg == r.reg &&
 617            reg_offset == r.reg_offset &&
 618            subreg_offset == r.subreg_offset &&
 619            type == r.type &&
 620            negate == r.negate &&
 621            abs == r.abs &&
 622            !reladdr && !r.reladdr &&
 623            memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
 624            width == r.width &&
 625            stride == r.stride);
 626 }
 627
 628 fs_reg &
 629 fs_reg::set_smear(unsigned subreg)
 630 {
 631    assert(file != HW_REG && file != IMM);
 632    subreg_offset = subreg * type_sz(type);
 633    stride = 0;
 634    return *this;
 635 }
 636
 637 bool
 638 fs_reg::is_contiguous() const
 639 {
 640    return stride == 1;
 641 }
 642
 643 int
 644 fs_visitor::type_size(const struct glsl_type *type)
 645 {
 646    unsigned int size, i;
 647
 648    switch (type->base_type) {
 649    case GLSL_TYPE_UINT:
 650    case GLSL_TYPE_INT:
 651    case GLSL_TYPE_FLOAT:
 652    case GLSL_TYPE_BOOL:
 653       return type->components();
 654    case GLSL_TYPE_ARRAY:
 655       return type_size(type->fields.array) * type->length;
 656    case GLSL_TYPE_STRUCT:
 657       size = 0;
 658       for (i = 0; i < type->length; i++) {
 659          size += type_size(type->fields.structure[i].type);
 660       }
 661       return size;
 662    case GLSL_TYPE_SAMPLER:
 663       /* Samplers take up no register space, since they're baked in at
 664        * link time.
 665        */
 666       return 0;
 667    case GLSL_TYPE_ATOMIC_UINT:
 668       return 0;
 669    case GLSL_TYPE_IMAGE:
 670    case GLSL_TYPE_VOID:
 671    case GLSL_TYPE_ERROR:
 672    case GLSL_TYPE_INTERFACE:
 673    case GLSL_TYPE_DOUBLE:
 674       unreachable("not reached");
 675    }
 676
 677    return 0;
 678 }
 679
 680 fs_reg
 681 fs_visitor::get_timestamp()
 682 {
 683    assert(brw->gen >= 7);
 684
 685    fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 686                                           BRW_ARF_TIMESTAMP,
 687                                           0),
 688                              BRW_REGISTER_TYPE_UD));
 689
 690    fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
 691
 692    fs_inst *mov = emit(MOV(dst, ts));
 693    /* We want to read the 3 fields we care about even if it's not enabled in
 694     * the dispatch.
 695     */
 696    mov->force_writemask_all = true;
 697
 698    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 699     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 700     * which is plenty of time for our purposes.  It is identical across the
 701     * EUs, but since it's tracking GPU core speed it will increment at a
 702     * varying rate as render P-states change.
 703     *
 704     * The caller could also check if render P-states have changed (or anything
 705     * else that might disrupt timing) by setting smear to 2 and checking if
 706     * that field is != 0.
 707     */
 708    dst.set_smear(0);
 709
 710    return dst;
 711 }
 712
 713 void
 714 fs_visitor::emit_shader_time_begin()
 715 {
 716    current_annotation = "shader time start";
 717    shader_start_time = get_timestamp();
 718 }
 719
 720 void
 721 fs_visitor::emit_shader_time_end()
 722 {
 723    current_annotation = "shader time end";
 724
 725    enum shader_time_shader_type type, written_type, reset_type;
 726    switch (stage) {
 727    case MESA_SHADER_VERTEX:
 728       type = ST_VS;
 729       written_type = ST_VS_WRITTEN;
 730       reset_type = ST_VS_RESET;
 731       break;
 732    case MESA_SHADER_GEOMETRY:
 733       type = ST_GS;
 734       written_type = ST_GS_WRITTEN;
 735       reset_type = ST_GS_RESET;
 736       break;
 737    case MESA_SHADER_FRAGMENT:
 738       if (dispatch_width == 8) {
 739          type = ST_FS8;
 740          written_type = ST_FS8_WRITTEN;
 741          reset_type = ST_FS8_RESET;
 742       } else {
 743          assert(dispatch_width == 16);
 744          type = ST_FS16;
 745          written_type = ST_FS16_WRITTEN;
 746          reset_type = ST_FS16_RESET;
 747       }
 748       break;
 749    default:
 750       unreachable("fs_visitor::emit_shader_time_end missing code");
 751    }
 752
 753    fs_reg shader_end_time = get_timestamp();
 754
 755    /* Check that there weren't any timestamp reset events (assuming these
 756     * were the only two timestamp reads that happened).
 757     */
 758    fs_reg reset = shader_end_time;
 759    reset.set_smear(2);
 760    fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
 761    test->conditional_mod = BRW_CONDITIONAL_Z;
 762    test->force_writemask_all = true;
 763    emit(IF(BRW_PREDICATE_NORMAL));
 764
 765    fs_reg start = shader_start_time;
 766    start.negate = true;
 767    fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
 768    diff.set_smear(0);
 769    fs_inst *add = ADD(diff, start, shader_end_time);
 770    add->force_writemask_all = true;
 771    emit(add);
 772
 773    /* If there were no instructions between the two timestamp gets, the diff
 774     * is 2 cycles.  Remove that overhead, so I can forget about that when
 775     * trying to determine the time taken for single instructions.
 776     */
 777    add = ADD(diff, diff, fs_reg(-2u));
 778    add->force_writemask_all = true;
 779    emit(add);
 780
 781    emit(SHADER_TIME_ADD(type, diff));
 782    emit(SHADER_TIME_ADD(written_type, fs_reg(1u)));
 783    emit(BRW_OPCODE_ELSE);
 784    emit(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
 785    emit(BRW_OPCODE_ENDIF);
 786 }
 787
 788 fs_inst *
 789 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
 790 {
 791    int shader_time_index =
 792       brw_get_shader_time_index(brw, shader_prog, prog, type);
 793    fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
 794
 795    fs_reg payload;
 796    if (dispatch_width == 8)
 797       payload = vgrf(glsl_type::uvec2_type);
 798    else
 799       payload = vgrf(glsl_type::uint_type);
 800
 801    return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
 802                                fs_reg(), payload, offset, value);
 803 }
 804
 805 void
 806 fs_visitor::vfail(const char *format, va_list va)
 807 {
 808    char *msg;
 809
 810    if (failed)
 811       return;
 812
 813    failed = true;
 814
 815    msg = ralloc_vasprintf(mem_ctx, format, va);
 816    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
 817
 818    this->fail_msg = msg;
 819
 820    if (debug_enabled) {
 821       fprintf(stderr, "%s",  msg);
 822    }
 823 }
 824
 825 void
 826 fs_visitor::fail(const char *format, ...)
 827 {
 828    va_list va;
 829
 830    va_start(va, format);
 831    vfail(format, va);
 832    va_end(va);
 833 }
 834
 835 /**
 836  * Mark this program as impossible to compile in SIMD16 mode.
 837  *
 838  * During the SIMD8 compile (which happens first), we can detect and flag
 839  * things that are unsupported in SIMD16 mode, so the compiler can skip
 840  * the SIMD16 compile altogether.
 841  *
 842  * During a SIMD16 compile (if one happens anyway), this just calls fail().
 843  */
 844 void
 845 fs_visitor::no16(const char *format, ...)
 846 {
 847    va_list va;
 848
 849    va_start(va, format);
 850
 851    if (dispatch_width == 16) {
 852       vfail(format, va);
 853    } else {
 854       simd16_unsupported = true;
 855
 856       if (brw->perf_debug) {
 857          if (no16_msg)
 858             ralloc_vasprintf_append(&no16_msg, format, va);
 859          else
 860             no16_msg = ralloc_vasprintf(mem_ctx, format, va);
 861       }
 862    }
 863
 864    va_end(va);
 865 }
 866
 867 fs_inst *
 868 fs_visitor::emit(enum opcode opcode)
 869 {
 870    return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
 871 }
 872
 873 fs_inst *
 874 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
 875 {
 876    return emit(new(mem_ctx) fs_inst(opcode, dst));
 877 }
 878
 879 fs_inst *
 880 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
 881 {
 882    return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
 883 }
 884
 885 fs_inst *
 886 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 887                  const fs_reg &src1)
 888 {
 889    return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
 890 }
 891
 892 fs_inst *
 893 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 894                  const fs_reg &src1, const fs_reg &src2)
 895 {
 896    return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
 897 }
 898
 899 fs_inst *
 900 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
 901                  fs_reg src[], int sources)
 902 {
 903    return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
 904 }
 905
 906 /**
 907  * Returns true if the instruction has a flag that means it won't
 908  * update an entire destination register.
 909  *
 910  * For example, dead code elimination and live variable analysis want to know
 911  * when a write to a variable screens off any preceding values that were in
 912  * it.
 913  */
 914 bool
 915 fs_inst::is_partial_write() const
 916 {
 917    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 918            (this->dst.width * type_sz(this->dst.type)) < 32 ||
 919            !this->dst.is_contiguous());
 920 }
 921
 922 int
 923 fs_inst::regs_read(int arg) const
 924 {
 925    if (is_tex() && arg == 0 && src[0].file == GRF) {
 926       return mlen;
 927    } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
 928       return mlen;
 929    } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
 930       return mlen;
 931    } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
 932       return mlen;
 933    } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
 934       return mlen;
 935    } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
 936       return mlen;
 937    }
 938
 939    switch (src[arg].file) {
 940    case BAD_FILE:
 941    case UNIFORM:
 942    case IMM:
 943       return 1;
 944    case GRF:
 945    case HW_REG:
 946       if (src[arg].stride == 0) {
 947          return 1;
 948       } else {
 949          int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
 950          return (size + 31) / 32;
 951       }
 952    case MRF:
 953       unreachable("MRF registers are not allowed as sources");
 954    default:
 955       unreachable("Invalid register file");
 956    }
 957 }
 958
 959 bool
 960 fs_inst::reads_flag() const
 961 {
 962    return predicate;
 963 }
 964
 965 bool
 966 fs_inst::writes_flag() const
 967 {
 968    return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
 969                                opcode != BRW_OPCODE_IF &&
 970                                opcode != BRW_OPCODE_WHILE)) ||
 971           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 972 }
 973
 974 /**
 975  * Returns how many MRFs an FS opcode will write over.
 976  *
 977  * Note that this is not the 0 or 1 implied writes in an actual gen
 978  * instruction -- the FS opcodes often generate MOVs in addition.
 979  */
 980 int
 981 fs_visitor::implied_mrf_writes(fs_inst *inst)
 982 {
 983    if (inst->mlen == 0)
 984       return 0;
 985
 986    if (inst->base_mrf == -1)
 987       return 0;
 988
 989    switch (inst->opcode) {
 990    case SHADER_OPCODE_RCP:
 991    case SHADER_OPCODE_RSQ:
 992    case SHADER_OPCODE_SQRT:
 993    case SHADER_OPCODE_EXP2:
 994    case SHADER_OPCODE_LOG2:
 995    case SHADER_OPCODE_SIN:
 996    case SHADER_OPCODE_COS:
 997       return 1 * dispatch_width / 8;
 998    case SHADER_OPCODE_POW:
 999    case SHADER_OPCODE_INT_QUOTIENT:
1000    case SHADER_OPCODE_INT_REMAINDER:
1001       return 2 * dispatch_width / 8;
1002    case SHADER_OPCODE_TEX:
1003    case FS_OPCODE_TXB:
1004    case SHADER_OPCODE_TXD:
1005    case SHADER_OPCODE_TXF:
1006    case SHADER_OPCODE_TXF_CMS:
1007    case SHADER_OPCODE_TXF_MCS:
1008    case SHADER_OPCODE_TG4:
1009    case SHADER_OPCODE_TG4_OFFSET:
1010    case SHADER_OPCODE_TXL:
1011    case SHADER_OPCODE_TXS:
1012    case SHADER_OPCODE_LOD:
1013       return 1;
1014    case FS_OPCODE_FB_WRITE:
1015       return 2;
1016    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1017    case SHADER_OPCODE_GEN4_SCRATCH_READ:
1018       return 1;
1019    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1020       return inst->mlen;
1021    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1022       return 2;
1023    case SHADER_OPCODE_UNTYPED_ATOMIC:
1024    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1025    case SHADER_OPCODE_URB_WRITE_SIMD8:
1026    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1027    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1028    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1029    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1030       return 0;
1031    default:
1032       unreachable("not reached");
1033    }
1034 }
1035
1036 fs_reg
1037 fs_visitor::vgrf(const glsl_type *const type)
1038 {
1039    int reg_width = dispatch_width / 8;
1040    return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1041                  brw_type_for_base_type(type), dispatch_width);
1042 }
1043
1044 fs_reg
1045 fs_visitor::vgrf(int num_components)
1046 {
1047    int reg_width = dispatch_width / 8;
1048    return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1049                  BRW_REGISTER_TYPE_F, dispatch_width);
1050 }
1051
1052 /** Fixed HW reg constructor. */
1053 fs_reg::fs_reg(enum register_file file, int reg)
1054 {
1055    init();
1056    this->file = file;
1057    this->reg = reg;
1058    this->type = BRW_REGISTER_TYPE_F;
1059
1060    switch (file) {
1061    case UNIFORM:
1062       this->width = 1;
1063       break;
1064    default:
1065       this->width = 8;
1066    }
1067 }
1068
1069 /** Fixed HW reg constructor. */
1070 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1071 {
1072    init();
1073    this->file = file;
1074    this->reg = reg;
1075    this->type = type;
1076
1077    switch (file) {
1078    case UNIFORM:
1079       this->width = 1;
1080       break;
1081    default:
1082       this->width = 8;
1083    }
1084 }
1085
1086 /** Fixed HW reg constructor. */
1087 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1088                uint8_t width)
1089 {
1090    init();
1091    this->file = file;
1092    this->reg = reg;
1093    this->type = type;
1094    this->width = width;
1095 }
1096
1097 fs_reg *
1098 fs_visitor::variable_storage(ir_variable *var)
1099 {
1100    return (fs_reg *)hash_table_find(this->variable_ht, var);
1101 }
1102
1103 void
1104 import_uniforms_callback(const void *key,
1105                          void *data,
1106                          void *closure)
1107 {
1108    struct hash_table *dst_ht = (struct hash_table *)closure;
1109    const fs_reg *reg = (const fs_reg *)data;
1110
1111    if (reg->file != UNIFORM)
1112       return;
1113
1114    hash_table_insert(dst_ht, data, key);
1115 }
1116
1117 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1118  * This brings in those uniform definitions
1119  */
1120 void
1121 fs_visitor::import_uniforms(fs_visitor *v)
1122 {
1123    hash_table_call_foreach(v->variable_ht,
1124                            import_uniforms_callback,
1125                            variable_ht);
1126    this->push_constant_loc = v->push_constant_loc;
1127    this->pull_constant_loc = v->pull_constant_loc;
1128    this->uniforms = v->uniforms;
1129    this->param_size = v->param_size;
1130 }
1131
1132 /* Our support for uniforms is piggy-backed on the struct
1133  * gl_fragment_program, because that's where the values actually
1134  * get stored, rather than in some global gl_shader_program uniform
1135  * store.
1136  */
1137 void
1138 fs_visitor::setup_uniform_values(ir_variable *ir)
1139 {
1140    int namelen = strlen(ir->name);
1141
1142    /* The data for our (non-builtin) uniforms is stored in a series of
1143     * gl_uniform_driver_storage structs for each subcomponent that
1144     * glGetUniformLocation() could name.  We know it's been set up in the same
1145     * order we'd walk the type, so walk the list of storage and find anything
1146     * with our name, or the prefix of a component that starts with our name.
1147     */
1148    unsigned params_before = uniforms;
1149    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1150       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1151
1152       if (strncmp(ir->name, storage->name, namelen) != 0 ||
1153           (storage->name[namelen] != 0 &&
1154            storage->name[namelen] != '.' &&
1155            storage->name[namelen] != '[')) {
1156          continue;
1157       }
1158
1159       unsigned slots = storage->type->component_slots();
1160       if (storage->array_elements)
1161          slots *= storage->array_elements;
1162
1163       for (unsigned i = 0; i < slots; i++) {
1164          stage_prog_data->param[uniforms++] = &storage->storage[i];
1165       }
1166    }
1167
1168    /* Make sure we actually initialized the right amount of stuff here. */
1169    assert(params_before + ir->type->component_slots() == uniforms);
1170    (void)params_before;
1171 }
1172
1173
1174 /* Our support for builtin uniforms is even scarier than non-builtin.
1175  * It sits on top of the PROG_STATE_VAR parameters that are
1176  * automatically updated from GL context state.
1177  */
1178 void
1179 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1180 {
1181    const ir_state_slot *const slots = ir->get_state_slots();
1182    assert(slots != NULL);
1183
1184    for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1185       /* This state reference has already been setup by ir_to_mesa, but we'll
1186        * get the same index back here.
1187        */
1188       int index = _mesa_add_state_reference(this->prog->Parameters,
1189                                             (gl_state_index *)slots[i].tokens);
1190
1191       /* Add each of the unique swizzles of the element as a parameter.
1192        * This'll end up matching the expected layout of the
1193        * array/matrix/structure we're trying to fill in.
1194        */
1195       int last_swiz = -1;
1196       for (unsigned int j = 0; j < 4; j++) {
1197          int swiz = GET_SWZ(slots[i].swizzle, j);
1198          if (swiz == last_swiz)
1199             break;
1200          last_swiz = swiz;
1201
1202          stage_prog_data->param[uniforms++] =
1203             &prog->Parameters->ParameterValues[index][swiz];
1204       }
1205    }
1206 }
1207
1208 fs_reg *
1209 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1210                                          bool origin_upper_left)
1211 {
1212    assert(stage == MESA_SHADER_FRAGMENT);
1213    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1214    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1215    fs_reg wpos = *reg;
1216    bool flip = !origin_upper_left ^ key->render_to_fbo;
1217
1218    /* gl_FragCoord.x */
1219    if (pixel_center_integer) {
1220       emit(MOV(wpos, this->pixel_x));
1221    } else {
1222       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1223    }
1224    wpos = offset(wpos, 1);
1225
1226    /* gl_FragCoord.y */
1227    if (!flip && pixel_center_integer) {
1228       emit(MOV(wpos, this->pixel_y));
1229    } else {
1230       fs_reg pixel_y = this->pixel_y;
1231       float offset = (pixel_center_integer ? 0.0 : 0.5);
1232
1233       if (flip) {
1234          pixel_y.negate = true;
1235          offset += key->drawable_height - 1.0;
1236       }
1237
1238       emit(ADD(wpos, pixel_y, fs_reg(offset)));
1239    }
1240    wpos = offset(wpos, 1);
1241
1242    /* gl_FragCoord.z */
1243    if (brw->gen >= 6) {
1244       emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1245    } else {
1246       emit(FS_OPCODE_LINTERP, wpos,
1247            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1248            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1249            interp_reg(VARYING_SLOT_POS, 2));
1250    }
1251    wpos = offset(wpos, 1);
1252
1253    /* gl_FragCoord.w: Already set up in emit_interpolation */
1254    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1255
1256    return reg;
1257 }
1258
1259 fs_inst *
1260 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1261                          glsl_interp_qualifier interpolation_mode,
1262                          bool is_centroid, bool is_sample)
1263 {
1264    brw_wm_barycentric_interp_mode barycoord_mode;
1265    if (brw->gen >= 6) {
1266       if (is_centroid) {
1267          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1268             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1269          else
1270             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1271       } else if (is_sample) {
1272           if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1273             barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1274          else
1275             barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1276       } else {
1277          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1278             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1279          else
1280             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1281       }
1282    } else {
1283       /* On Ironlake and below, there is only one interpolation mode.
1284        * Centroid interpolation doesn't mean anything on this hardware --
1285        * there is no multisampling.
1286        */
1287       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1288    }
1289    return emit(FS_OPCODE_LINTERP, attr,
1290                this->delta_x[barycoord_mode],
1291                this->delta_y[barycoord_mode], interp);
1292 }
1293
1294 void
1295 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1296                                        const glsl_type *type,
1297                                        glsl_interp_qualifier interpolation_mode,
1298                                        int location, bool mod_centroid,
1299                                        bool mod_sample)
1300 {
1301    attr.type = brw_type_for_base_type(type->get_scalar_type());
1302
1303    assert(stage == MESA_SHADER_FRAGMENT);
1304    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1305    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1306
1307    unsigned int array_elements;
1308
1309    if (type->is_array()) {
1310       array_elements = type->length;
1311       if (array_elements == 0) {
1312          fail("dereferenced array '%s' has length 0\n", name);
1313       }
1314       type = type->fields.array;
1315    } else {
1316       array_elements = 1;
1317    }
1318
1319    if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1320       bool is_gl_Color =
1321          location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1322       if (key->flat_shade && is_gl_Color) {
1323          interpolation_mode = INTERP_QUALIFIER_FLAT;
1324       } else {
1325          interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1326       }
1327    }
1328
1329    for (unsigned int i = 0; i < array_elements; i++) {
1330       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1331          if (prog_data->urb_setup[location] == -1) {
1332             /* If there's no incoming setup data for this slot, don't
1333              * emit interpolation for it.
1334              */
1335             attr = offset(attr, type->vector_elements);
1336             location++;
1337             continue;
1338          }
1339
1340          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1341             /* Constant interpolation (flat shading) case. The SF has
1342              * handed us defined values in only the constant offset
1343              * field of the setup reg.
1344              */
1345             for (unsigned int k = 0; k < type->vector_elements; k++) {
1346                struct brw_reg interp = interp_reg(location, k);
1347                interp = suboffset(interp, 3);
1348                interp.type = attr.type;
1349                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1350                attr = offset(attr, 1);
1351             }
1352          } else {
1353             /* Smooth/noperspective interpolation case. */
1354             for (unsigned int k = 0; k < type->vector_elements; k++) {
1355                struct brw_reg interp = interp_reg(location, k);
1356                if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1357                   /* Get the pixel/sample mask into f0 so that we know
1358                    * which pixels are lit.  Then, for each channel that is
1359                    * unlit, replace the centroid data with non-centroid
1360                    * data.
1361                    */
1362                   emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1363
1364                   fs_inst *inst;
1365                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1366                                       false, false);
1367                   inst->predicate = BRW_PREDICATE_NORMAL;
1368                   inst->predicate_inverse = true;
1369                   if (brw->has_pln)
1370                      inst->no_dd_clear = true;
1371
1372                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1373                                       mod_centroid && !key->persample_shading,
1374                                       mod_sample || key->persample_shading);
1375                   inst->predicate = BRW_PREDICATE_NORMAL;
1376                   inst->predicate_inverse = false;
1377                   if (brw->has_pln)
1378                      inst->no_dd_check = true;
1379
1380                } else {
1381                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
1382                                mod_centroid && !key->persample_shading,
1383                                mod_sample || key->persample_shading);
1384                }
1385                if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1386                   emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1387                }
1388                attr = offset(attr, 1);
1389             }
1390
1391          }
1392          location++;
1393       }
1394    }
1395 }
1396
1397 fs_reg *
1398 fs_visitor::emit_frontfacing_interpolation()
1399 {
1400    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1401
1402    if (brw->gen >= 6) {
1403       /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1404        * a boolean result from this (~0/true or 0/false).
1405        *
1406        * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1407        * this task in only one instruction:
1408        *    - a negation source modifier will flip the bit; and
1409        *    - a W -> D type conversion will sign extend the bit into the high
1410        *      word of the destination.
1411        *
1412        * An ASR 15 fills the low word of the destination.
1413        */
1414       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1415       g0.negate = true;
1416
1417       emit(ASR(*reg, g0, fs_reg(15)));
1418    } else {
1419       /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1420        * a boolean result from this (1/true or 0/false).
1421        *
1422        * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1423        * the negation source modifier to flip it. Unfortunately the SHR
1424        * instruction only operates on UD (or D with an abs source modifier)
1425        * sources without negation.
1426        *
1427        * Instead, use ASR (which will give ~0/true or 0/false).
1428        */
1429       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1430       g1_6.negate = true;
1431
1432       emit(ASR(*reg, g1_6, fs_reg(31)));
1433    }
1434
1435    return reg;
1436 }
1437
1438 void
1439 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1440 {
1441    assert(stage == MESA_SHADER_FRAGMENT);
1442    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1443    assert(dst.type == BRW_REGISTER_TYPE_F);
1444
1445    if (key->compute_pos_offset) {
1446       /* Convert int_sample_pos to floating point */
1447       emit(MOV(dst, int_sample_pos));
1448       /* Scale to the range [0, 1] */
1449       emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1450    }
1451    else {
1452       /* From ARB_sample_shading specification:
1453        * "When rendering to a non-multisample buffer, or if multisample
1454        *  rasterization is disabled, gl_SamplePosition will always be
1455        *  (0.5, 0.5).
1456        */
1457       emit(MOV(dst, fs_reg(0.5f)));
1458    }
1459 }
1460
1461 fs_reg *
1462 fs_visitor::emit_samplepos_setup()
1463 {
1464    assert(brw->gen >= 6);
1465
1466    this->current_annotation = "compute sample position";
1467    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1468    fs_reg pos = *reg;
1469    fs_reg int_sample_x = vgrf(glsl_type::int_type);
1470    fs_reg int_sample_y = vgrf(glsl_type::int_type);
1471
1472    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1473     * mode will be enabled.
1474     *
1475     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1476     * R31.1:0         Position Offset X/Y for Slot[3:0]
1477     * R31.3:2         Position Offset X/Y for Slot[7:4]
1478     * .....
1479     *
1480     * The X, Y sample positions come in as bytes in  thread payload. So, read
1481     * the positions using vstride=16, width=8, hstride=2.
1482     */
1483    struct brw_reg sample_pos_reg =
1484       stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1485                     BRW_REGISTER_TYPE_B), 16, 8, 2);
1486
1487    if (dispatch_width == 8) {
1488       emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1489    } else {
1490       emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1491       emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1492          ->force_sechalf = true;
1493    }
1494    /* Compute gl_SamplePosition.x */
1495    compute_sample_position(pos, int_sample_x);
1496    pos = offset(pos, 1);
1497    if (dispatch_width == 8) {
1498       emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1499    } else {
1500       emit(MOV(half(int_sample_y, 0),
1501                fs_reg(suboffset(sample_pos_reg, 1))));
1502       emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1503          ->force_sechalf = true;
1504    }
1505    /* Compute gl_SamplePosition.y */
1506    compute_sample_position(pos, int_sample_y);
1507    return reg;
1508 }
1509
1510 fs_reg *
1511 fs_visitor::emit_sampleid_setup()
1512 {
1513    assert(stage == MESA_SHADER_FRAGMENT);
1514    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1515    assert(brw->gen >= 6);
1516
1517    this->current_annotation = "compute sample id";
1518    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1519
1520    if (key->compute_sample_id) {
1521       fs_reg t1 = vgrf(glsl_type::int_type);
1522       fs_reg t2 = vgrf(glsl_type::int_type);
1523       t2.type = BRW_REGISTER_TYPE_UW;
1524
1525       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1526        * 8x multisampling, subspan 0 will represent sample N (where N
1527        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1528        * 7. We can find the value of N by looking at R0.0 bits 7:6
1529        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1530        * (since samples are always delivered in pairs). That is, we
1531        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1532        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1533        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1534        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1535        * populating a temporary variable with the sequence (0, 1, 2, 3),
1536        * and then reading from it using vstride=1, width=4, hstride=0.
1537        * These computations hold good for 4x multisampling as well.
1538        *
1539        * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1540        * the first four slots are sample 0 of subspan 0; the next four
1541        * are sample 1 of subspan 0; the third group is sample 0 of
1542        * subspan 1, and finally sample 1 of subspan 1.
1543        */
1544       fs_inst *inst;
1545       inst = emit(BRW_OPCODE_AND, t1,
1546                   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1547                   fs_reg(0xc0));
1548       inst->force_writemask_all = true;
1549       inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1550       inst->force_writemask_all = true;
1551       /* This works for both SIMD8 and SIMD16 */
1552       inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1553       inst->force_writemask_all = true;
1554       /* This special instruction takes care of setting vstride=1,
1555        * width=4, hstride=0 of t2 during an ADD instruction.
1556        */
1557       emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1558    } else {
1559       /* As per GL_ARB_sample_shading specification:
1560        * "When rendering to a non-multisample buffer, or if multisample
1561        *  rasterization is disabled, gl_SampleID will always be zero."
1562        */
1563       emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1564    }
1565
1566    return reg;
1567 }
1568
1569 void
1570 fs_visitor::resolve_source_modifiers(fs_reg *src)
1571 {
1572    if (!src->abs && !src->negate)
1573       return;
1574
1575    fs_reg temp = retype(vgrf(1), src->type);
1576    emit(MOV(temp, *src));
1577    *src = temp;
1578 }
1579
1580 fs_reg
1581 fs_visitor::fix_math_operand(fs_reg src)
1582 {
1583    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1584     * might be able to do better by doing execsize = 1 math and then
1585     * expanding that result out, but we would need to be careful with
1586     * masking.
1587     *
1588     * The hardware ignores source modifiers (negate and abs) on math
1589     * instructions, so we also move to a temp to set those up.
1590     */
1591    if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1592        !src.abs && !src.negate)
1593       return src;
1594
1595    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1596     * operands to math
1597     */
1598    if (brw->gen >= 7 && src.file != IMM)
1599       return src;
1600
1601    fs_reg expanded = vgrf(glsl_type::float_type);
1602    expanded.type = src.type;
1603    emit(BRW_OPCODE_MOV, expanded, src);
1604    return expanded;
1605 }
1606
1607 fs_inst *
1608 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1609 {
1610    switch (opcode) {
1611    case SHADER_OPCODE_RCP:
1612    case SHADER_OPCODE_RSQ:
1613    case SHADER_OPCODE_SQRT:
1614    case SHADER_OPCODE_EXP2:
1615    case SHADER_OPCODE_LOG2:
1616    case SHADER_OPCODE_SIN:
1617    case SHADER_OPCODE_COS:
1618       break;
1619    default:
1620       unreachable("not reached: bad math opcode");
1621    }
1622
1623    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
1624     * might be able to do better by doing execsize = 1 math and then
1625     * expanding that result out, but we would need to be careful with
1626     * masking.
1627     *
1628     * Gen 6 hardware ignores source modifiers (negate and abs) on math
1629     * instructions, so we also move to a temp to set those up.
1630     */
1631    if (brw->gen == 6 || brw->gen == 7)
1632       src = fix_math_operand(src);
1633
1634    fs_inst *inst = emit(opcode, dst, src);
1635
1636    if (brw->gen < 6) {
1637       inst->base_mrf = 2;
1638       inst->mlen = dispatch_width / 8;
1639    }
1640
1641    return inst;
1642 }
1643
1644 fs_inst *
1645 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1646 {
1647    int base_mrf = 2;
1648    fs_inst *inst;
1649
1650    if (brw->gen >= 8) {
1651       inst = emit(opcode, dst, src0, src1);
1652    } else if (brw->gen >= 6) {
1653       src0 = fix_math_operand(src0);
1654       src1 = fix_math_operand(src1);
1655
1656       inst = emit(opcode, dst, src0, src1);
1657    } else {
1658       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1659        * "Message Payload":
1660        *
1661        * "Operand0[7].  For the INT DIV functions, this operand is the
1662        *  denominator."
1663        *  ...
1664        * "Operand1[7].  For the INT DIV functions, this operand is the
1665        *  numerator."
1666        */
1667       bool is_int_div = opcode != SHADER_OPCODE_POW;
1668       fs_reg &op0 = is_int_div ? src1 : src0;
1669       fs_reg &op1 = is_int_div ? src0 : src1;
1670
1671       emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1672       inst = emit(opcode, dst, op0, reg_null_f);
1673
1674       inst->base_mrf = base_mrf;
1675       inst->mlen = 2 * dispatch_width / 8;
1676    }
1677    return inst;
1678 }
1679
1680 void
1681 fs_visitor::assign_curb_setup()
1682 {
1683    if (dispatch_width == 8) {
1684       prog_data->dispatch_grf_start_reg = payload.num_regs;
1685    } else {
1686       assert(stage == MESA_SHADER_FRAGMENT);
1687       brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1688       prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1689    }
1690
1691    prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1692
1693    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1694    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1695       for (unsigned int i = 0; i < inst->sources; i++) {
1696          if (inst->src[i].file == UNIFORM) {
1697             int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1698             int constant_nr;
1699             if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1700                constant_nr = push_constant_loc[uniform_nr];
1701             } else {
1702                /* Section 5.11 of the OpenGL 4.1 spec says:
1703                 * "Out-of-bounds reads return undefined values, which include
1704                 *  values from other variables of the active program or zero."
1705                 * Just return the first push constant.
1706                 */
1707                constant_nr = 0;
1708             }
1709
1710             struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1711                                                   constant_nr / 8,
1712                                                   constant_nr % 8);
1713
1714             inst->src[i].file = HW_REG;
1715             inst->src[i].fixed_hw_reg = byte_offset(
1716                retype(brw_reg, inst->src[i].type),
1717                inst->src[i].subreg_offset);
1718          }
1719       }
1720    }
1721 }
1722
1723 void
1724 fs_visitor::calculate_urb_setup()
1725 {
1726    assert(stage == MESA_SHADER_FRAGMENT);
1727    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1728    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1729
1730    memset(prog_data->urb_setup, -1,
1731           sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1732
1733    int urb_next = 0;
1734    /* Figure out where each of the incoming setup attributes lands. */
1735    if (brw->gen >= 6) {
1736       if (_mesa_bitcount_64(prog->InputsRead &
1737                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1738          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1739           * first 16 varying inputs, so we can put them wherever we want.
1740           * Just put them in order.
1741           *
1742           * This is useful because it means that (a) inputs not used by the
1743           * fragment shader won't take up valuable register space, and (b) we
1744           * won't have to recompile the fragment shader if it gets paired with
1745           * a different vertex (or geometry) shader.
1746           */
1747          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1748             if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1749                 BITFIELD64_BIT(i)) {
1750                prog_data->urb_setup[i] = urb_next++;
1751             }
1752          }
1753       } else {
1754          /* We have enough input varyings that the SF/SBE pipeline stage can't
1755           * arbitrarily rearrange them to suit our whim; we have to put them
1756           * in an order that matches the output of the previous pipeline stage
1757           * (geometry or vertex shader).
1758           */
1759          struct brw_vue_map prev_stage_vue_map;
1760          brw_compute_vue_map(brw, &prev_stage_vue_map,
1761                              key->input_slots_valid);
1762          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1763          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1764          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1765               slot++) {
1766             int varying = prev_stage_vue_map.slot_to_varying[slot];
1767             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1768              * unused.
1769              */
1770             if (varying != BRW_VARYING_SLOT_COUNT &&
1771                 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1772                  BITFIELD64_BIT(varying))) {
1773                prog_data->urb_setup[varying] = slot - first_slot;
1774             }
1775          }
1776          urb_next = prev_stage_vue_map.num_slots - first_slot;
1777       }
1778    } else {
1779       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1780       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1781          /* Point size is packed into the header, not as a general attribute */
1782          if (i == VARYING_SLOT_PSIZ)
1783             continue;
1784
1785          if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1786             /* The back color slot is skipped when the front color is
1787              * also written to.  In addition, some slots can be
1788              * written in the vertex shader and not read in the
1789              * fragment shader.  So the register number must always be
1790              * incremented, mapped or not.
1791              */
1792             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1793                prog_data->urb_setup[i] = urb_next;
1794             urb_next++;
1795          }
1796       }
1797
1798       /*
1799        * It's a FS only attribute, and we did interpolation for this attribute
1800        * in SF thread. So, count it here, too.
1801        *
1802        * See compile_sf_prog() for more info.
1803        */
1804       if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1805          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1806    }
1807
1808    prog_data->num_varying_inputs = urb_next;
1809 }
1810
1811 void
1812 fs_visitor::assign_urb_setup()
1813 {
1814    assert(stage == MESA_SHADER_FRAGMENT);
1815    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1816
1817    int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1818
1819    /* Offset all the urb_setup[] index by the actual position of the
1820     * setup regs, now that the location of the constants has been chosen.
1821     */
1822    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1823       if (inst->opcode == FS_OPCODE_LINTERP) {
1824          assert(inst->src[2].file == HW_REG);
1825          inst->src[2].fixed_hw_reg.nr += urb_start;
1826       }
1827
1828       if (inst->opcode == FS_OPCODE_CINTERP) {
1829          assert(inst->src[0].file == HW_REG);
1830          inst->src[0].fixed_hw_reg.nr += urb_start;
1831       }
1832    }
1833
1834    /* Each attribute is 4 setup channels, each of which is half a reg. */
1835    this->first_non_payload_grf =
1836       urb_start + prog_data->num_varying_inputs * 2;
1837 }
1838
1839 void
1840 fs_visitor::assign_vs_urb_setup()
1841 {
1842    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1843    int grf, count, slot, channel, attr;
1844
1845    assert(stage == MESA_SHADER_VERTEX);
1846    count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1847    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1848       count++;
1849
1850    /* Each attribute is 4 regs. */
1851    this->first_non_payload_grf =
1852       payload.num_regs + prog_data->curb_read_length + count * 4;
1853
1854    unsigned vue_entries =
1855       MAX2(count, vs_prog_data->base.vue_map.num_slots);
1856
1857    vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1858    vs_prog_data->base.urb_read_length = (count + 1) / 2;
1859
1860    assert(vs_prog_data->base.urb_read_length <= 15);
1861
1862    /* Rewrite all ATTR file references to the hw grf that they land in. */
1863    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1864       for (int i = 0; i < inst->sources; i++) {
1865          if (inst->src[i].file == ATTR) {
1866
1867             if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1868                slot = count - 1;
1869             } else {
1870                /* Attributes come in in a contiguous block, ordered by their
1871                 * gl_vert_attrib value.  That means we can compute the slot
1872                 * number for an attribute by masking out the enabled
1873                 * attributes before it and counting the bits.
1874                 */
1875                attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1876                slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1877                                         BITFIELD64_MASK(attr));
1878             }
1879
1880             channel = inst->src[i].reg_offset & 3;
1881
1882             grf = payload.num_regs +
1883                prog_data->curb_read_length +
1884                slot * 4 + channel;
1885
1886             inst->src[i].file = HW_REG;
1887             inst->src[i].fixed_hw_reg =
1888                retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1889          }
1890       }
1891    }
1892 }
1893
1894 /**
1895  * Split large virtual GRFs into separate components if we can.
1896  *
1897  * This is mostly duplicated with what brw_fs_vector_splitting does,
1898  * but that's really conservative because it's afraid of doing
1899  * splitting that doesn't result in real progress after the rest of
1900  * the optimization phases, which would cause infinite looping in
1901  * optimization.  We can do it once here, safely.  This also has the
1902  * opportunity to split interpolated values, or maybe even uniforms,
1903  * which we don't have at the IR level.
1904  *
1905  * We want to split, because virtual GRFs are what we register
1906  * allocate and spill (due to contiguousness requirements for some
1907  * instructions), and they're what we naturally generate in the
1908  * codegen process, but most virtual GRFs don't actually need to be
1909  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1910  * live intervals and better dead code elimination and coalescing.
1911  */
1912 void
1913 fs_visitor::split_virtual_grfs()
1914 {
1915    int num_vars = this->alloc.count;
1916
1917    /* Count the total number of registers */
1918    int reg_count = 0;
1919    int vgrf_to_reg[num_vars];
1920    for (int i = 0; i < num_vars; i++) {
1921       vgrf_to_reg[i] = reg_count;
1922       reg_count += alloc.sizes[i];
1923    }
1924
1925    /* An array of "split points".  For each register slot, this indicates
1926     * if this slot can be separated from the previous slot.  Every time an
1927     * instruction uses multiple elements of a register (as a source or
1928     * destination), we mark the used slots as inseparable.  Then we go
1929     * through and split the registers into the smallest pieces we can.
1930     */
1931    bool split_points[reg_count];
1932    memset(split_points, 0, sizeof(split_points));
1933
1934    /* Mark all used registers as fully splittable */
1935    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1936       if (inst->dst.file == GRF) {
1937          int reg = vgrf_to_reg[inst->dst.reg];
1938          for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1939             split_points[reg + j] = true;
1940       }
1941
1942       for (int i = 0; i < inst->sources; i++) {
1943          if (inst->src[i].file == GRF) {
1944             int reg = vgrf_to_reg[inst->src[i].reg];
1945             for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1946                split_points[reg + j] = true;
1947          }
1948       }
1949    }
1950
1951    if (brw->has_pln &&
1952        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1953       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1954        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1955        * Gen6, that was the only supported interpolation mode, and since Gen6,
1956        * delta_x and delta_y are in fixed hardware registers.
1957        */
1958       int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1959       split_points[vgrf_to_reg[vgrf] + 1] = false;
1960    }
1961
1962    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1963       if (inst->dst.file == GRF) {
1964          int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1965          for (int j = 1; j < inst->regs_written; j++)
1966             split_points[reg + j] = false;
1967       }
1968       for (int i = 0; i < inst->sources; i++) {
1969          if (inst->src[i].file == GRF) {
1970             int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1971             for (int j = 1; j < inst->regs_read(i); j++)
1972                split_points[reg + j] = false;
1973          }
1974       }
1975    }
1976
1977    int new_virtual_grf[reg_count];
1978    int new_reg_offset[reg_count];
1979
1980    int reg = 0;
1981    for (int i = 0; i < num_vars; i++) {
1982       /* The first one should always be 0 as a quick sanity check. */
1983       assert(split_points[reg] == false);
1984
1985       /* j = 0 case */
1986       new_reg_offset[reg] = 0;
1987       reg++;
1988       int offset = 1;
1989
1990       /* j > 0 case */
1991       for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1992          /* If this is a split point, reset the offset to 0 and allocate a
1993           * new virtual GRF for the previous offset many registers
1994           */
1995          if (split_points[reg]) {
1996             assert(offset <= MAX_VGRF_SIZE);
1997             int grf = alloc.allocate(offset);
1998             for (int k = reg - offset; k < reg; k++)
1999                new_virtual_grf[k] = grf;
2000             offset = 0;
2001          }
2002          new_reg_offset[reg] = offset;
2003          offset++;
2004          reg++;
2005       }
2006
2007       /* The last one gets the original register number */
2008       assert(offset <= MAX_VGRF_SIZE);
2009       alloc.sizes[i] = offset;
2010       for (int k = reg - offset; k < reg; k++)
2011          new_virtual_grf[k] = i;
2012    }
2013    assert(reg == reg_count);
2014
2015    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2016       if (inst->dst.file == GRF) {
2017          reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2018          inst->dst.reg = new_virtual_grf[reg];
2019          inst->dst.reg_offset = new_reg_offset[reg];
2020          assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2021       }
2022       for (int i = 0; i < inst->sources; i++) {
2023          if (inst->src[i].file == GRF) {
2024             reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2025             inst->src[i].reg = new_virtual_grf[reg];
2026             inst->src[i].reg_offset = new_reg_offset[reg];
2027             assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2028          }
2029       }
2030    }
2031    invalidate_live_intervals();
2032 }
2033
2034 /**
2035  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2036  *
2037  * During code generation, we create tons of temporary variables, many of
2038  * which get immediately killed and are never used again.  Yet, in later
2039  * optimization and analysis passes, such as compute_live_intervals, we need
2040  * to loop over all the virtual GRFs.  Compacting them can save a lot of
2041  * overhead.
2042  */
2043 bool
2044 fs_visitor::compact_virtual_grfs()
2045 {
2046    bool progress = false;
2047    int remap_table[this->alloc.count];
2048    memset(remap_table, -1, sizeof(remap_table));
2049
2050    /* Mark which virtual GRFs are used. */
2051    foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2052       if (inst->dst.file == GRF)
2053          remap_table[inst->dst.reg] = 0;
2054
2055       for (int i = 0; i < inst->sources; i++) {
2056          if (inst->src[i].file == GRF)
2057             remap_table[inst->src[i].reg] = 0;
2058       }
2059    }
2060
2061    /* Compact the GRF arrays. */
2062    int new_index = 0;
2063    for (unsigned i = 0; i < this->alloc.count; i++) {
2064       if (remap_table[i] == -1) {
2065          /* We just found an unused register.  This means that we are
2066           * actually going to compact something.
2067           */
2068          progress = true;
2069       } else {
2070          remap_table[i] = new_index;
2071          alloc.sizes[new_index] = alloc.sizes[i];
2072          invalidate_live_intervals();
2073          ++new_index;
2074       }
2075    }
2076
2077    this->alloc.count = new_index;
2078
2079    /* Patch all the instructions to use the newly renumbered registers */
2080    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2081       if (inst->dst.file == GRF)
2082          inst->dst.reg = remap_table[inst->dst.reg];
2083
2084       for (int i = 0; i < inst->sources; i++) {
2085          if (inst->src[i].file == GRF)
2086             inst->src[i].reg = remap_table[inst->src[i].reg];
2087       }
2088    }
2089
2090    /* Patch all the references to delta_x/delta_y, since they're used in
2091     * register allocation.  If they're unused, switch them to BAD_FILE so
2092     * we don't think some random VGRF is delta_x/delta_y.
2093     */
2094    for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2095       if (delta_x[i].file == GRF) {
2096          if (remap_table[delta_x[i].reg] != -1) {
2097             delta_x[i].reg = remap_table[delta_x[i].reg];
2098          } else {
2099             delta_x[i].file = BAD_FILE;
2100          }
2101       }
2102    }
2103    for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2104       if (delta_y[i].file == GRF) {
2105          if (remap_table[delta_y[i].reg] != -1) {
2106             delta_y[i].reg = remap_table[delta_y[i].reg];
2107          } else {
2108             delta_y[i].file = BAD_FILE;
2109          }
2110       }
2111    }
2112
2113    return progress;
2114 }
2115
2116 /*
2117  * Implements array access of uniforms by inserting a
2118  * PULL_CONSTANT_LOAD instruction.
2119  *
2120  * Unlike temporary GRF array access (where we don't support it due to
2121  * the difficulty of doing relative addressing on instruction
2122  * destinations), we could potentially do array access of uniforms
2123  * that were loaded in GRF space as push constants.  In real-world
2124  * usage we've seen, though, the arrays being used are always larger
2125  * than we could load as push constants, so just always move all
2126  * uniform array access out to a pull constant buffer.
2127  */
2128 void
2129 fs_visitor::move_uniform_array_access_to_pull_constants()
2130 {
2131    if (dispatch_width != 8)
2132       return;
2133
2134    pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2135    memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2136
2137    /* Walk through and find array access of uniforms.  Put a copy of that
2138     * uniform in the pull constant buffer.
2139     *
2140     * Note that we don't move constant-indexed accesses to arrays.  No
2141     * testing has been done of the performance impact of this choice.
2142     */
2143    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2144       for (int i = 0 ; i < inst->sources; i++) {
2145          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2146             continue;
2147
2148          int uniform = inst->src[i].reg;
2149
2150          /* If this array isn't already present in the pull constant buffer,
2151           * add it.
2152           */
2153          if (pull_constant_loc[uniform] == -1) {
2154             const gl_constant_value **values = &stage_prog_data->param[uniform];
2155
2156             assert(param_size[uniform]);
2157
2158             for (int j = 0; j < param_size[uniform]; j++) {
2159                pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2160
2161                stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2162                   values[j];
2163             }
2164          }
2165       }
2166    }
2167 }
2168
2169 /**
2170  * Assign UNIFORM file registers to either push constants or pull constants.
2171  *
2172  * We allow a fragment shader to have more than the specified minimum
2173  * maximum number of fragment shader uniform components (64).  If
2174  * there are too many of these, they'd fill up all of register space.
2175  * So, this will push some of them out to the pull constant buffer and
2176  * update the program to load them.
2177  */
2178 void
2179 fs_visitor::assign_constant_locations()
2180 {
2181    /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2182    if (dispatch_width != 8)
2183       return;
2184
2185    /* Find which UNIFORM registers are still in use. */
2186    bool is_live[uniforms];
2187    for (unsigned int i = 0; i < uniforms; i++) {
2188       is_live[i] = false;
2189    }
2190
2191    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2192       for (int i = 0; i < inst->sources; i++) {
2193          if (inst->src[i].file != UNIFORM)
2194             continue;
2195
2196          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2197          if (constant_nr >= 0 && constant_nr < (int) uniforms)
2198             is_live[constant_nr] = true;
2199       }
2200    }
2201
2202    /* Only allow 16 registers (128 uniform components) as push constants.
2203     *
2204     * Just demote the end of the list.  We could probably do better
2205     * here, demoting things that are rarely used in the program first.
2206     *
2207     * If changing this value, note the limitation about total_regs in
2208     * brw_curbe.c.
2209     */
2210    unsigned int max_push_components = 16 * 8;
2211    unsigned int num_push_constants = 0;
2212
2213    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2214
2215    for (unsigned int i = 0; i < uniforms; i++) {
2216       if (!is_live[i] || pull_constant_loc[i] != -1) {
2217          /* This UNIFORM register is either dead, or has already been demoted
2218           * to a pull const.  Mark it as no longer living in the param[] array.
2219           */
2220          push_constant_loc[i] = -1;
2221          continue;
2222       }
2223
2224       if (num_push_constants < max_push_components) {
2225          /* Retain as a push constant.  Record the location in the params[]
2226           * array.
2227           */
2228          push_constant_loc[i] = num_push_constants++;
2229       } else {
2230          /* Demote to a pull constant. */
2231          push_constant_loc[i] = -1;
2232
2233          int pull_index = stage_prog_data->nr_pull_params++;
2234          stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2235          pull_constant_loc[i] = pull_index;
2236       }
2237    }
2238
2239    stage_prog_data->nr_params = num_push_constants;
2240
2241    /* Up until now, the param[] array has been indexed by reg + reg_offset
2242     * of UNIFORM registers.  Condense it to only contain the uniforms we
2243     * chose to upload as push constants.
2244     */
2245    for (unsigned int i = 0; i < uniforms; i++) {
2246       int remapped = push_constant_loc[i];
2247
2248       if (remapped == -1)
2249          continue;
2250
2251       assert(remapped <= (int)i);
2252       stage_prog_data->param[remapped] = stage_prog_data->param[i];
2253    }
2254 }
2255
2256 /**
2257  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2258  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2259  */
2260 void
2261 fs_visitor::demote_pull_constants()
2262 {
2263    foreach_block_and_inst (block, fs_inst, inst, cfg) {
2264       for (int i = 0; i < inst->sources; i++) {
2265          if (inst->src[i].file != UNIFORM)
2266             continue;
2267
2268          int pull_index = pull_constant_loc[inst->src[i].reg +
2269                                             inst->src[i].reg_offset];
2270          if (pull_index == -1)
2271             continue;
2272
2273          /* Set up the annotation tracking for new generated instructions. */
2274          base_ir = inst->ir;
2275          current_annotation = inst->annotation;
2276
2277          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2278          fs_reg dst = vgrf(glsl_type::float_type);
2279
2280          /* Generate a pull load into dst. */
2281          if (inst->src[i].reladdr) {
2282             exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2283                                                         surf_index,
2284                                                         *inst->src[i].reladdr,
2285                                                         pull_index);
2286             inst->insert_before(block, &list);
2287             inst->src[i].reladdr = NULL;
2288          } else {
2289             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2290             fs_inst *pull =
2291                new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2292                                     dst, surf_index, offset);
2293             inst->insert_before(block, pull);
2294             inst->src[i].set_smear(pull_index & 3);
2295          }
2296
2297          /* Rewrite the instruction to use the temporary VGRF. */
2298          inst->src[i].file = GRF;
2299          inst->src[i].reg = dst.reg;
2300          inst->src[i].reg_offset = 0;
2301          inst->src[i].width = dispatch_width;
2302       }
2303    }
2304    invalidate_live_intervals();
2305 }
2306
2307 bool
2308 fs_visitor::opt_algebraic()
2309 {
2310    bool progress = false;
2311
2312    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2313       switch (inst->opcode) {
2314       case BRW_OPCODE_MOV:
2315          if (inst->src[0].file != IMM)
2316             break;
2317
2318          if (inst->saturate) {
2319             if (inst->dst.type != inst->src[0].type)
2320                assert(!"unimplemented: saturate mixed types");
2321
2322             if (brw_saturate_immediate(inst->dst.type,
2323                                        &inst->src[0].fixed_hw_reg)) {
2324                inst->saturate = false;
2325                progress = true;
2326             }
2327          }
2328          break;
2329
2330       case BRW_OPCODE_MUL:
2331          if (inst->src[1].file != IMM)
2332             continue;
2333
2334          /* a * 1.0 = a */
2335          if (inst->src[1].is_one()) {
2336             inst->opcode = BRW_OPCODE_MOV;
2337             inst->src[1] = reg_undef;
2338             progress = true;
2339             break;
2340          }
2341
2342          /* a * -1.0 = -a */
2343          if (inst->src[1].is_negative_one()) {
2344             inst->opcode = BRW_OPCODE_MOV;
2345             inst->src[0].negate = !inst->src[0].negate;
2346             inst->src[1] = reg_undef;
2347             progress = true;
2348             break;
2349          }
2350
2351          /* a * 0.0 = 0.0 */
2352          if (inst->src[1].is_zero()) {
2353             inst->opcode = BRW_OPCODE_MOV;
2354             inst->src[0] = inst->src[1];
2355             inst->src[1] = reg_undef;
2356             progress = true;
2357             break;
2358          }
2359
2360          if (inst->src[0].file == IMM) {
2361             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2362             inst->opcode = BRW_OPCODE_MOV;
2363             inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2364             inst->src[1] = reg_undef;
2365             progress = true;
2366             break;
2367          }
2368          break;
2369       case BRW_OPCODE_ADD:
2370          if (inst->src[1].file != IMM)
2371             continue;
2372
2373          /* a + 0.0 = a */
2374          if (inst->src[1].is_zero()) {
2375             inst->opcode = BRW_OPCODE_MOV;
2376             inst->src[1] = reg_undef;
2377             progress = true;
2378             break;
2379          }
2380
2381          if (inst->src[0].file == IMM) {
2382             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2383             inst->opcode = BRW_OPCODE_MOV;
2384             inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2385             inst->src[1] = reg_undef;
2386             progress = true;
2387             break;
2388          }
2389          break;
2390       case BRW_OPCODE_OR:
2391          if (inst->src[0].equals(inst->src[1])) {
2392             inst->opcode = BRW_OPCODE_MOV;
2393             inst->src[1] = reg_undef;
2394             progress = true;
2395             break;
2396          }
2397          break;
2398       case BRW_OPCODE_LRP:
2399          if (inst->src[1].equals(inst->src[2])) {
2400             inst->opcode = BRW_OPCODE_MOV;
2401             inst->src[0] = inst->src[1];
2402             inst->src[1] = reg_undef;
2403             inst->src[2] = reg_undef;
2404             progress = true;
2405             break;
2406          }
2407          break;
2408       case BRW_OPCODE_CMP:
2409          if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2410              inst->src[0].abs &&
2411              inst->src[0].negate &&
2412              inst->src[1].is_zero()) {
2413             inst->src[0].abs = false;
2414             inst->src[0].negate = false;
2415             inst->conditional_mod = BRW_CONDITIONAL_Z;
2416             progress = true;
2417             break;
2418          }
2419          break;
2420       case BRW_OPCODE_SEL:
2421          if (inst->src[0].equals(inst->src[1])) {
2422             inst->opcode = BRW_OPCODE_MOV;
2423             inst->src[1] = reg_undef;
2424             inst->predicate = BRW_PREDICATE_NONE;
2425             inst->predicate_inverse = false;
2426             progress = true;
2427          } else if (inst->saturate && inst->src[1].file == IMM) {
2428             switch (inst->conditional_mod) {
2429             case BRW_CONDITIONAL_LE:
2430             case BRW_CONDITIONAL_L:
2431                switch (inst->src[1].type) {
2432                case BRW_REGISTER_TYPE_F:
2433                   if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2434                      inst->opcode = BRW_OPCODE_MOV;
2435                      inst->src[1] = reg_undef;
2436                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2437                      progress = true;
2438                   }
2439                   break;
2440                default:
2441                   break;
2442                }
2443                break;
2444             case BRW_CONDITIONAL_GE:
2445             case BRW_CONDITIONAL_G:
2446                switch (inst->src[1].type) {
2447                case BRW_REGISTER_TYPE_F:
2448                   if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2449                      inst->opcode = BRW_OPCODE_MOV;
2450                      inst->src[1] = reg_undef;
2451                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2452                      progress = true;
2453                   }
2454                   break;
2455                default:
2456                   break;
2457                }
2458             default:
2459                break;
2460             }
2461          }
2462          break;
2463       case BRW_OPCODE_MAD:
2464          if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2465             inst->opcode = BRW_OPCODE_MOV;
2466             inst->src[1] = reg_undef;
2467             inst->src[2] = reg_undef;
2468             progress = true;
2469          } else if (inst->src[0].is_zero()) {
2470             inst->opcode = BRW_OPCODE_MUL;
2471             inst->src[0] = inst->src[2];
2472             inst->src[2] = reg_undef;
2473          } else if (inst->src[1].is_one()) {
2474             inst->opcode = BRW_OPCODE_ADD;
2475             inst->src[1] = inst->src[2];
2476             inst->src[2] = reg_undef;
2477             progress = true;
2478          } else if (inst->src[2].is_one()) {
2479             inst->opcode = BRW_OPCODE_ADD;
2480             inst->src[2] = reg_undef;
2481             progress = true;
2482          } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2483             inst->opcode = BRW_OPCODE_ADD;
2484             inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2485             inst->src[2] = reg_undef;
2486             progress = true;
2487          }
2488          break;
2489       case SHADER_OPCODE_RCP: {
2490          fs_inst *prev = (fs_inst *)inst->prev;
2491          if (prev->opcode == SHADER_OPCODE_SQRT) {
2492             if (inst->src[0].equals(prev->dst)) {
2493                inst->opcode = SHADER_OPCODE_RSQ;
2494                inst->src[0] = prev->src[0];
2495                progress = true;
2496             }
2497          }
2498          break;
2499       }
2500       default:
2501          break;
2502       }
2503    }
2504
2505    return progress;
2506 }
2507
2508 bool
2509 fs_visitor::opt_register_renaming()
2510 {
2511    bool progress = false;
2512    int depth = 0;
2513
2514    int remap[alloc.count];
2515    memset(remap, -1, sizeof(int) * alloc.count);
2516
2517    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2518       if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2519          depth++;
2520       } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2521                  inst->opcode == BRW_OPCODE_WHILE) {
2522          depth--;
2523       }
2524
2525       /* Rewrite instruction sources. */
2526       for (int i = 0; i < inst->sources; i++) {
2527          if (inst->src[i].file == GRF &&
2528              remap[inst->src[i].reg] != -1 &&
2529              remap[inst->src[i].reg] != inst->src[i].reg) {
2530             inst->src[i].reg = remap[inst->src[i].reg];
2531             progress = true;
2532          }
2533       }
2534
2535       const int dst = inst->dst.reg;
2536
2537       if (depth == 0 &&
2538           inst->dst.file == GRF &&
2539           alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2540           !inst->is_partial_write()) {
2541          if (remap[dst] == -1) {
2542             remap[dst] = dst;
2543          } else {
2544             remap[dst] = alloc.allocate(inst->dst.width / 8);
2545             inst->dst.reg = remap[dst];
2546             progress = true;
2547          }
2548       } else if (inst->dst.file == GRF &&
2549                  remap[dst] != -1 &&
2550                  remap[dst] != dst) {
2551          inst->dst.reg = remap[dst];
2552          progress = true;
2553       }
2554    }
2555
2556    if (progress) {
2557       invalidate_live_intervals();
2558
2559       for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2560          if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2561             delta_x[i].reg = remap[delta_x[i].reg];
2562          }
2563       }
2564       for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2565          if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2566             delta_y[i].reg = remap[delta_y[i].reg];
2567          }
2568       }
2569    }
2570
2571    return progress;
2572 }
2573
2574 /**
2575  * Remove redundant or useless discard jumps.
2576  *
2577  * For example, we can eliminate jumps in the following sequence:
2578  *
2579  * discard-jump       (redundant with the next jump)
2580  * discard-jump       (useless; jumps to the next instruction)
2581  * placeholder-halt
2582  */
2583 bool
2584 fs_visitor::opt_redundant_discard_jumps()
2585 {
2586    bool progress = false;
2587
2588    bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2589
2590    fs_inst *placeholder_halt = NULL;
2591    foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2592       if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2593          placeholder_halt = inst;
2594          break;
2595       }
2596    }
2597
2598    if (!placeholder_halt)
2599       return false;
2600
2601    /* Delete any HALTs immediately before the placeholder halt. */
2602    for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2603         !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2604         prev = (fs_inst *) placeholder_halt->prev) {
2605       prev->remove(last_bblock);
2606       progress = true;
2607    }
2608
2609    if (progress)
2610       invalidate_live_intervals();
2611
2612    return progress;
2613 }
2614
2615 bool
2616 fs_visitor::compute_to_mrf()
2617 {
2618    bool progress = false;
2619    int next_ip = 0;
2620
2621    /* No MRFs on Gen >= 7. */
2622    if (brw->gen >= 7)
2623       return false;
2624
2625    calculate_live_intervals();
2626
2627    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2628       int ip = next_ip;
2629       next_ip++;
2630
2631       if (inst->opcode != BRW_OPCODE_MOV ||
2632           inst->is_partial_write() ||
2633           inst->dst.file != MRF || inst->src[0].file != GRF ||
2634           inst->dst.type != inst->src[0].type ||
2635           inst->src[0].abs || inst->src[0].negate ||
2636           !inst->src[0].is_contiguous() ||
2637           inst->src[0].subreg_offset)
2638          continue;
2639
2640       /* Work out which hardware MRF registers are written by this
2641        * instruction.
2642        */
2643       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2644       int mrf_high;
2645       if (inst->dst.reg & BRW_MRF_COMPR4) {
2646          mrf_high = mrf_low + 4;
2647       } else if (inst->exec_size == 16) {
2648          mrf_high = mrf_low + 1;
2649       } else {
2650          mrf_high = mrf_low;
2651       }
2652
2653       /* Can't compute-to-MRF this GRF if someone else was going to
2654        * read it later.
2655        */
2656       if (this->virtual_grf_end[inst->src[0].reg] > ip)
2657          continue;
2658
2659       /* Found a move of a GRF to a MRF.  Let's see if we can go
2660        * rewrite the thing that made this GRF to write into the MRF.
2661        */
2662       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2663          if (scan_inst->dst.file == GRF &&
2664              scan_inst->dst.reg == inst->src[0].reg) {
2665             /* Found the last thing to write our reg we want to turn
2666              * into a compute-to-MRF.
2667              */
2668
2669             /* If this one instruction didn't populate all the
2670              * channels, bail.  We might be able to rewrite everything
2671              * that writes that reg, but it would require smarter
2672              * tracking to delay the rewriting until complete success.
2673              */
2674             if (scan_inst->is_partial_write())
2675                break;
2676
2677             /* Things returning more than one register would need us to
2678              * understand coalescing out more than one MOV at a time.
2679              */
2680             if (scan_inst->regs_written > scan_inst->dst.width / 8)
2681                break;
2682
2683             /* SEND instructions can't have MRF as a destination. */
2684             if (scan_inst->mlen)
2685                break;
2686
2687             if (brw->gen == 6) {
2688                /* gen6 math instructions must have the destination be
2689                 * GRF, so no compute-to-MRF for them.
2690                 */
2691                if (scan_inst->is_math()) {
2692                   break;
2693                }
2694             }
2695
2696             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2697                /* Found the creator of our MRF's source value. */
2698                scan_inst->dst.file = MRF;
2699                scan_inst->dst.reg = inst->dst.reg;
2700                scan_inst->saturate |= inst->saturate;
2701                inst->remove(block);
2702                progress = true;
2703             }
2704             break;
2705          }
2706
2707          /* We don't handle control flow here.  Most computation of
2708           * values that end up in MRFs are shortly before the MRF
2709           * write anyway.
2710           */
2711          if (block->start() == scan_inst)
2712             break;
2713
2714          /* You can't read from an MRF, so if someone else reads our
2715           * MRF's source GRF that we wanted to rewrite, that stops us.
2716           */
2717          bool interfered = false;
2718          for (int i = 0; i < scan_inst->sources; i++) {
2719             if (scan_inst->src[i].file == GRF &&
2720                 scan_inst->src[i].reg == inst->src[0].reg &&
2721                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2722                interfered = true;
2723             }
2724          }
2725          if (interfered)
2726             break;
2727
2728          if (scan_inst->dst.file == MRF) {
2729             /* If somebody else writes our MRF here, we can't
2730              * compute-to-MRF before that.
2731              */
2732             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2733             int scan_mrf_high;
2734
2735             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2736                scan_mrf_high = scan_mrf_low + 4;
2737             } else if (scan_inst->exec_size == 16) {
2738                scan_mrf_high = scan_mrf_low + 1;
2739             } else {
2740                scan_mrf_high = scan_mrf_low;
2741             }
2742
2743             if (mrf_low == scan_mrf_low ||
2744                 mrf_low == scan_mrf_high ||
2745                 mrf_high == scan_mrf_low ||
2746                 mrf_high == scan_mrf_high) {
2747                break;
2748             }
2749          }
2750
2751          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2752             /* Found a SEND instruction, which means that there are
2753              * live values in MRFs from base_mrf to base_mrf +
2754              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2755              * above it.
2756              */
2757             if (mrf_low >= scan_inst->base_mrf &&
2758                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2759                break;
2760             }
2761             if (mrf_high >= scan_inst->base_mrf &&
2762                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2763                break;
2764             }
2765          }
2766       }
2767    }
2768
2769    if (progress)
2770       invalidate_live_intervals();
2771
2772    return progress;
2773 }
2774
2775 /**
2776  * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2777  * instructions to FS_OPCODE_REP_FB_WRITE.
2778  */
2779 void
2780 fs_visitor::emit_repclear_shader()
2781 {
2782    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2783    int base_mrf = 1;
2784    int color_mrf = base_mrf + 2;
2785
2786    fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2787                            fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2788    mov->force_writemask_all = true;
2789
2790    fs_inst *write;
2791    if (key->nr_color_regions == 1) {
2792       write = emit(FS_OPCODE_REP_FB_WRITE);
2793       write->saturate = key->clamp_fragment_color;
2794       write->base_mrf = color_mrf;
2795       write->target = 0;
2796       write->header_present = false;
2797       write->mlen = 1;
2798    } else {
2799       assume(key->nr_color_regions > 0);
2800       for (int i = 0; i < key->nr_color_regions; ++i) {
2801          write = emit(FS_OPCODE_REP_FB_WRITE);
2802          write->saturate = key->clamp_fragment_color;
2803          write->base_mrf = base_mrf;
2804          write->target = i;
2805          write->header_present = true;
2806          write->mlen = 3;
2807       }
2808    }
2809    write->eot = true;
2810
2811    calculate_cfg();
2812
2813    assign_constant_locations();
2814    assign_curb_setup();
2815
2816    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2817    assert(mov->src[0].file == HW_REG);
2818    mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2819 }
2820
2821 /**
2822  * Walks through basic blocks, looking for repeated MRF writes and
2823  * removing the later ones.
2824  */
2825 bool
2826 fs_visitor::remove_duplicate_mrf_writes()
2827 {
2828    fs_inst *last_mrf_move[16];
2829    bool progress = false;
2830
2831    /* Need to update the MRF tracking for compressed instructions. */
2832    if (dispatch_width == 16)
2833       return false;
2834
2835    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2836
2837    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2838       if (inst->is_control_flow()) {
2839          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2840       }
2841
2842       if (inst->opcode == BRW_OPCODE_MOV &&
2843           inst->dst.file == MRF) {
2844          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2845          if (prev_inst && inst->equals(prev_inst)) {
2846             inst->remove(block);
2847             progress = true;
2848             continue;
2849          }
2850       }
2851
2852       /* Clear out the last-write records for MRFs that were overwritten. */
2853       if (inst->dst.file == MRF) {
2854          last_mrf_move[inst->dst.reg] = NULL;
2855       }
2856
2857       if (inst->mlen > 0 && inst->base_mrf != -1) {
2858          /* Found a SEND instruction, which will include two or fewer
2859           * implied MRF writes.  We could do better here.
2860           */
2861          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2862             last_mrf_move[inst->base_mrf + i] = NULL;
2863          }
2864       }
2865
2866       /* Clear out any MRF move records whose sources got overwritten. */
2867       if (inst->dst.file == GRF) {
2868          for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2869             if (last_mrf_move[i] &&
2870                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2871                last_mrf_move[i] = NULL;
2872             }
2873          }
2874       }
2875
2876       if (inst->opcode == BRW_OPCODE_MOV &&
2877           inst->dst.file == MRF &&
2878           inst->src[0].file == GRF &&
2879           !inst->is_partial_write()) {
2880          last_mrf_move[inst->dst.reg] = inst;
2881       }
2882    }
2883
2884    if (progress)
2885       invalidate_live_intervals();
2886
2887    return progress;
2888 }
2889
2890 static void
2891 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2892 {
2893    /* Clear the flag for registers that actually got read (as expected). */
2894    for (int i = 0; i < inst->sources; i++) {
2895       int grf;
2896       if (inst->src[i].file == GRF) {
2897          grf = inst->src[i].reg;
2898       } else if (inst->src[i].file == HW_REG &&
2899                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2900          grf = inst->src[i].fixed_hw_reg.nr;
2901       } else {
2902          continue;
2903       }
2904
2905       if (grf >= first_grf &&
2906           grf < first_grf + grf_len) {
2907          deps[grf - first_grf] = false;
2908          if (inst->exec_size == 16)
2909             deps[grf - first_grf + 1] = false;
2910       }
2911    }
2912 }
2913
2914 /**
2915  * Implements this workaround for the original 965:
2916  *
2917  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2918  *      check for post destination dependencies on this instruction, software
2919  *      must ensure that there is no destination hazard for the case of ‘write
2920  *      followed by a posted write’ shown in the following example.
2921  *
2922  *      1. mov r3 0
2923  *      2. send r3.xy <rest of send instruction>
2924  *      3. mov r2 r3
2925  *
2926  *      Due to no post-destination dependency check on the ‘send’, the above
2927  *      code sequence could have two instructions (1 and 2) in flight at the
2928  *      same time that both consider ‘r3’ as the target of their final writes.
2929  */
2930 void
2931 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2932                                                         fs_inst *inst)
2933 {
2934    int write_len = inst->regs_written;
2935    int first_write_grf = inst->dst.reg;
2936    bool needs_dep[BRW_MAX_MRF];
2937    assert(write_len < (int)sizeof(needs_dep) - 1);
2938
2939    memset(needs_dep, false, sizeof(needs_dep));
2940    memset(needs_dep, true, write_len);
2941
2942    clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2943
2944    /* Walk backwards looking for writes to registers we're writing which
2945     * aren't read since being written.  If we hit the start of the program,
2946     * we assume that there are no outstanding dependencies on entry to the
2947     * program.
2948     */
2949    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2950       /* If we hit control flow, assume that there *are* outstanding
2951        * dependencies, and force their cleanup before our instruction.
2952        */
2953       if (block->start() == scan_inst) {
2954          for (int i = 0; i < write_len; i++) {
2955             if (needs_dep[i]) {
2956                inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2957             }
2958          }
2959          return;
2960       }
2961
2962       /* We insert our reads as late as possible on the assumption that any
2963        * instruction but a MOV that might have left us an outstanding
2964        * dependency has more latency than a MOV.
2965        */
2966       if (scan_inst->dst.file == GRF) {
2967          for (int i = 0; i < scan_inst->regs_written; i++) {
2968             int reg = scan_inst->dst.reg + i;
2969
2970             if (reg >= first_write_grf &&
2971                 reg < first_write_grf + write_len &&
2972                 needs_dep[reg - first_write_grf]) {
2973                inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2974                needs_dep[reg - first_write_grf] = false;
2975                if (scan_inst->exec_size == 16)
2976                   needs_dep[reg - first_write_grf + 1] = false;
2977             }
2978          }
2979       }
2980
2981       /* Clear the flag for registers that actually got read (as expected). */
2982       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2983
2984       /* Continue the loop only if we haven't resolved all the dependencies */
2985       int i;
2986       for (i = 0; i < write_len; i++) {
2987          if (needs_dep[i])
2988             break;
2989       }
2990       if (i == write_len)
2991          return;
2992    }
2993 }
2994
2995 /**
2996  * Implements this workaround for the original 965:
2997  *
2998  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2999  *      used as a destination register until after it has been sourced by an
3000  *      instruction with a different destination register.
3001  */
3002 void
3003 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3004 {
3005    int write_len = inst->regs_written;
3006    int first_write_grf = inst->dst.reg;
3007    bool needs_dep[BRW_MAX_MRF];
3008    assert(write_len < (int)sizeof(needs_dep) - 1);
3009
3010    memset(needs_dep, false, sizeof(needs_dep));
3011    memset(needs_dep, true, write_len);
3012    /* Walk forwards looking for writes to registers we're writing which aren't
3013     * read before being written.
3014     */
3015    foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3016       /* If we hit control flow, force resolve all remaining dependencies. */
3017       if (block->end() == scan_inst) {
3018          for (int i = 0; i < write_len; i++) {
3019             if (needs_dep[i])
3020                scan_inst->insert_before(block,
3021                                         DEP_RESOLVE_MOV(first_write_grf + i));
3022          }
3023          return;
3024       }
3025
3026       /* Clear the flag for registers that actually got read (as expected). */
3027       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3028
3029       /* We insert our reads as late as possible since they're reading the
3030        * result of a SEND, which has massive latency.
3031        */
3032       if (scan_inst->dst.file == GRF &&
3033           scan_inst->dst.reg >= first_write_grf &&
3034           scan_inst->dst.reg < first_write_grf + write_len &&
3035           needs_dep[scan_inst->dst.reg - first_write_grf]) {
3036          scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3037          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3038       }
3039
3040       /* Continue the loop only if we haven't resolved all the dependencies */
3041       int i;
3042       for (i = 0; i < write_len; i++) {
3043          if (needs_dep[i])
3044             break;
3045       }
3046       if (i == write_len)
3047          return;
3048    }
3049 }
3050
3051 void
3052 fs_visitor::insert_gen4_send_dependency_workarounds()
3053 {
3054    if (brw->gen != 4 || brw->is_g4x)
3055       return;
3056
3057    bool progress = false;
3058
3059    /* Note that we're done with register allocation, so GRF fs_regs always
3060     * have a .reg_offset of 0.
3061     */
3062
3063    foreach_block_and_inst(block, fs_inst, inst, cfg) {
3064       if (inst->mlen != 0 && inst->dst.file == GRF) {
3065          insert_gen4_pre_send_dependency_workarounds(block, inst);
3066          insert_gen4_post_send_dependency_workarounds(block, inst);
3067          progress = true;
3068       }
3069    }
3070
3071    if (progress)
3072       invalidate_live_intervals();
3073 }
3074
3075 /**
3076  * Turns the generic expression-style uniform pull constant load instruction
3077  * into a hardware-specific series of instructions for loading a pull
3078  * constant.
3079  *
3080  * The expression style allows the CSE pass before this to optimize out
3081  * repeated loads from the same offset, and gives the pre-register-allocation
3082  * scheduling full flexibility, while the conversion to native instructions
3083  * allows the post-register-allocation scheduler the best information
3084  * possible.
3085  *
3086  * Note that execution masking for setting up pull constant loads is special:
3087  * the channels that need to be written are unrelated to the current execution
3088  * mask, since a later instruction will use one of the result channels as a
3089  * source operand for all 8 or 16 of its channels.
3090  */
3091 void
3092 fs_visitor::lower_uniform_pull_constant_loads()
3093 {
3094    foreach_block_and_inst (block, fs_inst, inst, cfg) {
3095       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3096          continue;
3097
3098       if (brw->gen >= 7) {
3099          /* The offset arg before was a vec4-aligned byte offset.  We need to
3100           * turn it into a dword offset.
3101           */
3102          fs_reg const_offset_reg = inst->src[1];
3103          assert(const_offset_reg.file == IMM &&
3104                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3105          const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3106          fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3107
3108          /* We have to use a message header on Skylake to get SIMD4x2 mode.
3109           * Reserve space for the register.
3110           */
3111          if (brw->gen >= 9) {
3112             payload.reg_offset++;
3113             alloc.sizes[payload.reg] = 2;
3114          }
3115
3116          /* This is actually going to be a MOV, but since only the first dword
3117           * is accessed, we have a special opcode to do just that one.  Note
3118           * that this needs to be an operation that will be considered a def
3119           * by live variable analysis, or register allocation will explode.
3120           */
3121          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3122                                                8, payload, const_offset_reg);
3123          setup->force_writemask_all = true;
3124
3125          setup->ir = inst->ir;
3126          setup->annotation = inst->annotation;
3127          inst->insert_before(block, setup);
3128
3129          /* Similarly, this will only populate the first 4 channels of the
3130           * result register (since we only use smear values from 0-3), but we
3131           * don't tell the optimizer.
3132           */
3133          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3134          inst->src[1] = payload;
3135
3136          invalidate_live_intervals();
3137       } else {
3138          /* Before register allocation, we didn't tell the scheduler about the
3139           * MRF we use.  We know it's safe to use this MRF because nothing
3140           * else does except for register spill/unspill, which generates and
3141           * uses its MRF within a single IR instruction.
3142           */
3143          inst->base_mrf = 14;
3144          inst->mlen = 1;
3145       }
3146    }
3147 }
3148
3149 bool
3150 fs_visitor::lower_load_payload()
3151 {
3152    bool progress = false;
3153
3154    int vgrf_to_reg[alloc.count];
3155    int reg_count = 0;
3156    for (unsigned i = 0; i < alloc.count; ++i) {
3157       vgrf_to_reg[i] = reg_count;
3158       reg_count += alloc.sizes[i];
3159    }
3160
3161    struct {
3162       bool written:1; /* Whether this register has ever been written */
3163       bool force_writemask_all:1;
3164       bool force_sechalf:1;
3165    } metadata[reg_count];
3166    memset(metadata, 0, sizeof(metadata));
3167
3168    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3169       if (inst->dst.file == GRF) {
3170          const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3171          bool force_sechalf = inst->force_sechalf &&
3172                               !inst->force_writemask_all;
3173          bool toggle_sechalf = inst->dst.width == 16 &&
3174                                type_sz(inst->dst.type) == 4 &&
3175                                !inst->force_writemask_all;
3176          for (int i = 0; i < inst->regs_written; ++i) {
3177             metadata[dst_reg + i].written = true;
3178             metadata[dst_reg + i].force_sechalf = force_sechalf;
3179             metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3180             force_sechalf = (toggle_sechalf != force_sechalf);
3181          }
3182       }
3183
3184       if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3185          assert(inst->dst.file == MRF || inst->dst.file == GRF);
3186          fs_reg dst = inst->dst;
3187
3188          for (int i = 0; i < inst->sources; i++) {
3189             dst.width = inst->src[i].effective_width;
3190             dst.type = inst->src[i].type;
3191
3192             if (inst->src[i].file == BAD_FILE) {
3193                /* Do nothing but otherwise increment as normal */
3194             } else if (dst.file == MRF &&
3195                        dst.width == 8 &&
3196                        brw->has_compr4 &&
3197                        i + 4 < inst->sources &&
3198                        inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3199                fs_reg compr4_dst = dst;
3200                compr4_dst.reg += BRW_MRF_COMPR4;
3201                compr4_dst.width = 16;
3202                fs_reg compr4_src = inst->src[i];
3203                compr4_src.width = 16;
3204                fs_inst *mov = MOV(compr4_dst, compr4_src);
3205                mov->force_writemask_all = true;
3206                inst->insert_before(block, mov);
3207                /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3208                inst->src[i + 4].file = BAD_FILE;
3209             } else {
3210                fs_inst *mov = MOV(dst, inst->src[i]);
3211                if (inst->src[i].file == GRF) {
3212                   int src_reg = vgrf_to_reg[inst->src[i].reg] +
3213                                 inst->src[i].reg_offset;
3214                   mov->force_sechalf = metadata[src_reg].force_sechalf;
3215                   mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3216                } else {
3217                   /* We don't have any useful metadata for immediates or
3218                    * uniforms.  Assume that any of the channels of the
3219                    * destination may be used.
3220                    */
3221                   assert(inst->src[i].file == IMM ||
3222                          inst->src[i].file == UNIFORM);
3223                   mov->force_writemask_all = true;
3224                }
3225
3226                if (dst.file == GRF) {
3227                   const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3228                   const bool force_writemask = mov->force_writemask_all;
3229                   metadata[dst_reg].force_writemask_all = force_writemask;
3230                   metadata[dst_reg].force_sechalf = mov->force_sechalf;
3231                   if (dst.width * type_sz(dst.type) > 32) {
3232                      assert(!mov->force_sechalf);
3233                      metadata[dst_reg + 1].force_writemask_all = force_writemask;
3234                      metadata[dst_reg + 1].force_sechalf = !force_writemask;
3235                   }
3236                }
3237
3238                inst->insert_before(block, mov);
3239             }
3240
3241             dst = offset(dst, 1);
3242          }
3243
3244          inst->remove(block);
3245          progress = true;
3246       }
3247    }
3248
3249    if (progress)
3250       invalidate_live_intervals();
3251
3252    return progress;
3253 }
3254
3255 void
3256 fs_visitor::dump_instructions()
3257 {
3258    dump_instructions(NULL);
3259 }
3260
3261 void
3262 fs_visitor::dump_instructions(const char *name)
3263 {
3264    FILE *file = stderr;
3265    if (name && geteuid() != 0) {
3266       file = fopen(name, "w");
3267       if (!file)
3268          file = stderr;
3269    }
3270
3271    if (cfg) {
3272       calculate_register_pressure();
3273       int ip = 0, max_pressure = 0;
3274       foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3275          max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3276          fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3277          dump_instruction(inst, file);
3278          ip++;
3279       }
3280       fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3281    } else {
3282       int ip = 0;
3283       foreach_in_list(backend_instruction, inst, &instructions) {
3284          fprintf(file, "%4d: ", ip++);
3285          dump_instruction(inst, file);
3286       }
3287    }
3288
3289    if (file != stderr) {
3290       fclose(file);
3291    }
3292 }
3293
3294 void
3295 fs_visitor::dump_instruction(backend_instruction *be_inst)
3296 {
3297    dump_instruction(be_inst, stderr);
3298 }
3299
3300 void
3301 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3302 {
3303    fs_inst *inst = (fs_inst *)be_inst;
3304
3305    if (inst->predicate) {
3306       fprintf(file, "(%cf0.%d) ",
3307              inst->predicate_inverse ? '-' : '+',
3308              inst->flag_subreg);
3309    }
3310
3311    fprintf(file, "%s", brw_instruction_name(inst->opcode));
3312    if (inst->saturate)
3313       fprintf(file, ".sat");
3314    if (inst->conditional_mod) {
3315       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3316       if (!inst->predicate &&
3317           (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3318                               inst->opcode != BRW_OPCODE_IF &&
3319                               inst->opcode != BRW_OPCODE_WHILE))) {
3320          fprintf(file, ".f0.%d", inst->flag_subreg);
3321       }
3322    }
3323    fprintf(file, "(%d) ", inst->exec_size);
3324
3325
3326    switch (inst->dst.file) {
3327    case GRF:
3328       fprintf(file, "vgrf%d", inst->dst.reg);
3329       if (inst->dst.width != dispatch_width)
3330          fprintf(file, "@%d", inst->dst.width);
3331       if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3332           inst->dst.subreg_offset)
3333          fprintf(file, "+%d.%d",
3334                  inst->dst.reg_offset, inst->dst.subreg_offset);
3335       break;
3336    case MRF:
3337       fprintf(file, "m%d", inst->dst.reg);
3338       break;
3339    case BAD_FILE:
3340       fprintf(file, "(null)");
3341       break;
3342    case UNIFORM:
3343       fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3344       break;
3345    case ATTR:
3346       fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3347       break;
3348    case HW_REG:
3349       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3350          switch (inst->dst.fixed_hw_reg.nr) {
3351          case BRW_ARF_NULL:
3352             fprintf(file, "null");
3353             break;
3354          case BRW_ARF_ADDRESS:
3355             fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3356             break;
3357          case BRW_ARF_ACCUMULATOR:
3358             fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3359             break;
3360          case BRW_ARF_FLAG:
3361             fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3362                              inst->dst.fixed_hw_reg.subnr);
3363             break;
3364          default:
3365             fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3366                                inst->dst.fixed_hw_reg.subnr);
3367             break;
3368          }
3369       } else {
3370          fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3371       }
3372       if (inst->dst.fixed_hw_reg.subnr)
3373          fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3374       break;
3375    default:
3376       fprintf(file, "???");
3377       break;
3378    }
3379    fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3380
3381    for (int i = 0; i < inst->sources; i++) {
3382       if (inst->src[i].negate)
3383          fprintf(file, "-");
3384       if (inst->src[i].abs)
3385          fprintf(file, "|");
3386       switch (inst->src[i].file) {
3387       case GRF:
3388          fprintf(file, "vgrf%d", inst->src[i].reg);
3389          if (inst->src[i].width != dispatch_width)
3390             fprintf(file, "@%d", inst->src[i].width);
3391          if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3392              inst->src[i].subreg_offset)
3393             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3394                     inst->src[i].subreg_offset);
3395          break;
3396       case MRF:
3397          fprintf(file, "***m%d***", inst->src[i].reg);
3398          break;
3399       case ATTR:
3400          fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3401          break;
3402       case UNIFORM:
3403          fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3404          if (inst->src[i].reladdr) {
3405             fprintf(file, "+reladdr");
3406          } else if (inst->src[i].subreg_offset) {
3407             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3408                     inst->src[i].subreg_offset);
3409          }
3410          break;
3411       case BAD_FILE:
3412          fprintf(file, "(null)");
3413          break;
3414       case IMM:
3415          switch (inst->src[i].type) {
3416          case BRW_REGISTER_TYPE_F:
3417             fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3418             break;
3419          case BRW_REGISTER_TYPE_W:
3420          case BRW_REGISTER_TYPE_D:
3421             fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3422             break;
3423          case BRW_REGISTER_TYPE_UW:
3424          case BRW_REGISTER_TYPE_UD:
3425             fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3426             break;
3427          case BRW_REGISTER_TYPE_VF:
3428             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3429                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
3430                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
3431                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3432                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3433             break;
3434          default:
3435             fprintf(file, "???");
3436             break;
3437          }
3438          break;
3439       case HW_REG:
3440          if (inst->src[i].fixed_hw_reg.negate)
3441             fprintf(file, "-");
3442          if (inst->src[i].fixed_hw_reg.abs)
3443             fprintf(file, "|");
3444          if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3445             switch (inst->src[i].fixed_hw_reg.nr) {
3446             case BRW_ARF_NULL:
3447                fprintf(file, "null");
3448                break;
3449             case BRW_ARF_ADDRESS:
3450                fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3451                break;
3452             case BRW_ARF_ACCUMULATOR:
3453                fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3454                break;
3455             case BRW_ARF_FLAG:
3456                fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3457                                 inst->src[i].fixed_hw_reg.subnr);
3458                break;
3459             default:
3460                fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3461                                   inst->src[i].fixed_hw_reg.subnr);
3462                break;
3463             }
3464          } else {
3465             fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3466          }
3467          if (inst->src[i].fixed_hw_reg.subnr)
3468             fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3469          if (inst->src[i].fixed_hw_reg.abs)
3470             fprintf(file, "|");
3471          break;
3472       default:
3473          fprintf(file, "???");
3474          break;
3475       }
3476       if (inst->src[i].abs)
3477          fprintf(file, "|");
3478
3479       if (inst->src[i].file != IMM) {
3480          fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3481       }
3482
3483       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3484          fprintf(file, ", ");
3485    }
3486
3487    fprintf(file, " ");
3488
3489    if (dispatch_width == 16 && inst->exec_size == 8) {
3490       if (inst->force_sechalf)
3491          fprintf(file, "2ndhalf ");
3492       else
3493          fprintf(file, "1sthalf ");
3494    }
3495
3496    fprintf(file, "\n");
3497 }
3498
3499 /**
3500  * Possibly returns an instruction that set up @param reg.
3501  *
3502  * Sometimes we want to take the result of some expression/variable
3503  * dereference tree and rewrite the instruction generating the result
3504  * of the tree.  When processing the tree, we know that the
3505  * instructions generated are all writing temporaries that are dead
3506  * outside of this tree.  So, if we have some instructions that write
3507  * a temporary, we're free to point that temp write somewhere else.
3508  *
3509  * Note that this doesn't guarantee that the instruction generated
3510  * only reg -- it might be the size=4 destination of a texture instruction.
3511  */
3512 fs_inst *
3513 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3514                                            fs_inst *end,
3515                                            const fs_reg &reg)
3516 {
3517    if (end == start ||
3518        end->is_partial_write() ||
3519        reg.reladdr ||
3520        !reg.equals(end->dst)) {
3521       return NULL;
3522    } else {
3523       return end;
3524    }
3525 }
3526
3527 void
3528 fs_visitor::setup_payload_gen6()
3529 {
3530    bool uses_depth =
3531       (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3532    unsigned barycentric_interp_modes =
3533       (stage == MESA_SHADER_FRAGMENT) ?
3534       ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3535
3536    assert(brw->gen >= 6);
3537
3538    /* R0-1: masks, pixel X/Y coordinates. */
3539    payload.num_regs = 2;
3540    /* R2: only for 32-pixel dispatch.*/
3541
3542    /* R3-26: barycentric interpolation coordinates.  These appear in the
3543     * same order that they appear in the brw_wm_barycentric_interp_mode
3544     * enum.  Each set of coordinates occupies 2 registers if dispatch width
3545     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
3546     * appear if they were enabled using the "Barycentric Interpolation
3547     * Mode" bits in WM_STATE.
3548     */
3549    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3550       if (barycentric_interp_modes & (1 << i)) {
3551          payload.barycentric_coord_reg[i] = payload.num_regs;
3552          payload.num_regs += 2;
3553          if (dispatch_width == 16) {
3554             payload.num_regs += 2;
3555          }
3556       }
3557    }
3558
3559    /* R27: interpolated depth if uses source depth */
3560    if (uses_depth) {
3561       payload.source_depth_reg = payload.num_regs;
3562       payload.num_regs++;
3563       if (dispatch_width == 16) {
3564          /* R28: interpolated depth if not SIMD8. */
3565          payload.num_regs++;
3566       }
3567    }
3568    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3569    if (uses_depth) {
3570       payload.source_w_reg = payload.num_regs;
3571       payload.num_regs++;
3572       if (dispatch_width == 16) {
3573          /* R30: interpolated W if not SIMD8. */
3574          payload.num_regs++;
3575       }
3576    }
3577
3578    if (stage == MESA_SHADER_FRAGMENT) {
3579       brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3580       brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3581       prog_data->uses_pos_offset = key->compute_pos_offset;
3582       /* R31: MSAA position offsets. */
3583       if (prog_data->uses_pos_offset) {
3584          payload.sample_pos_reg = payload.num_regs;
3585          payload.num_regs++;
3586       }
3587    }
3588
3589    /* R32: MSAA input coverage mask */
3590    if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3591       assert(brw->gen >= 7);
3592       payload.sample_mask_in_reg = payload.num_regs;
3593       payload.num_regs++;
3594       if (dispatch_width == 16) {
3595          /* R33: input coverage mask if not SIMD8. */
3596          payload.num_regs++;
3597       }
3598    }
3599
3600    /* R34-: bary for 32-pixel. */
3601    /* R58-59: interp W for 32-pixel. */
3602
3603    if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3604       source_depth_to_render_target = true;
3605    }
3606 }
3607
3608 void
3609 fs_visitor::setup_vs_payload()
3610 {
3611    /* R0: thread header, R1: urb handles */
3612    payload.num_regs = 2;
3613 }
3614
3615 void
3616 fs_visitor::assign_binding_table_offsets()
3617 {
3618    assert(stage == MESA_SHADER_FRAGMENT);
3619    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3620    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3621    uint32_t next_binding_table_offset = 0;
3622
3623    /* If there are no color regions, we still perform an FB write to a null
3624     * renderbuffer, which we place at surface index 0.
3625     */
3626    prog_data->binding_table.render_target_start = next_binding_table_offset;
3627    next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3628
3629    assign_common_binding_table_offsets(next_binding_table_offset);
3630 }
3631
3632 void
3633 fs_visitor::calculate_register_pressure()
3634 {
3635    invalidate_live_intervals();
3636    calculate_live_intervals();
3637
3638    unsigned num_instructions = 0;
3639    foreach_block(block, cfg)
3640       num_instructions += block->instructions.length();
3641
3642    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3643
3644    for (unsigned reg = 0; reg < alloc.count; reg++) {
3645       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3646          regs_live_at_ip[ip] += alloc.sizes[reg];
3647    }
3648 }
3649
3650 void
3651 fs_visitor::optimize()
3652 {
3653    const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3654
3655    split_virtual_grfs();
3656
3657    move_uniform_array_access_to_pull_constants();
3658    assign_constant_locations();
3659    demote_pull_constants();
3660
3661 #define OPT(pass, args...) ({                                           \
3662       pass_num++;                                                       \
3663       bool this_progress = pass(args);                                  \
3664                                                                         \
3665       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
3666          char filename[64];                                             \
3667          snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,              \
3668                   stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3669                                                                         \
3670          backend_visitor::dump_instructions(filename);                  \
3671       }                                                                 \
3672                                                                         \
3673       progress = progress || this_progress;                             \
3674       this_progress;                                                    \
3675    })
3676
3677    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3678       char filename[64];
3679       snprintf(filename, 64, "%s%d-%04d-00-start",
3680                stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3681
3682       backend_visitor::dump_instructions(filename);
3683    }
3684
3685    bool progress;
3686    int iteration = 0;
3687    int pass_num = 0;
3688    do {
3689       progress = false;
3690       pass_num = 0;
3691       iteration++;
3692
3693       OPT(remove_duplicate_mrf_writes);
3694
3695       OPT(opt_algebraic);
3696       OPT(opt_cse);
3697       OPT(opt_copy_propagate);
3698       OPT(opt_peephole_predicated_break);
3699       OPT(opt_cmod_propagation);
3700       OPT(dead_code_eliminate);
3701       OPT(opt_peephole_sel);
3702       OPT(dead_control_flow_eliminate, this);
3703       OPT(opt_register_renaming);
3704       OPT(opt_redundant_discard_jumps);
3705       OPT(opt_saturate_propagation);
3706       OPT(register_coalesce);
3707       OPT(compute_to_mrf);
3708
3709       OPT(compact_virtual_grfs);
3710    } while (progress);
3711
3712    pass_num = 0;
3713
3714    if (OPT(lower_load_payload)) {
3715       split_virtual_grfs();
3716       OPT(register_coalesce);
3717       OPT(compute_to_mrf);
3718       OPT(dead_code_eliminate);
3719    }
3720
3721    OPT(opt_combine_constants);
3722
3723    lower_uniform_pull_constant_loads();
3724 }
3725
3726 /**
3727  * Three source instruction must have a GRF/MRF destination register.
3728  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
3729  */
3730 void
3731 fs_visitor::fixup_3src_null_dest()
3732 {
3733    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3734       if (inst->is_3src() && inst->dst.is_null()) {
3735          inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3736                             inst->dst.type);
3737       }
3738    }
3739 }
3740
3741 void
3742 fs_visitor::allocate_registers()
3743 {
3744    bool allocated_without_spills;
3745
3746    static const enum instruction_scheduler_mode pre_modes[] = {
3747       SCHEDULE_PRE,
3748       SCHEDULE_PRE_NON_LIFO,
3749       SCHEDULE_PRE_LIFO,
3750    };
3751
3752    /* Try each scheduling heuristic to see if it can successfully register
3753     * allocate without spilling.  They should be ordered by decreasing
3754     * performance but increasing likelihood of allocating.
3755     */
3756    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3757       schedule_instructions(pre_modes[i]);
3758
3759       if (0) {
3760          assign_regs_trivial();
3761          allocated_without_spills = true;
3762       } else {
3763          allocated_without_spills = assign_regs(false);
3764       }
3765       if (allocated_without_spills)
3766          break;
3767    }
3768
3769    if (!allocated_without_spills) {
3770       const char *stage_name = stage == MESA_SHADER_VERTEX ?
3771          "Vertex" : "Fragment";
3772
3773       /* We assume that any spilling is worse than just dropping back to
3774        * SIMD8.  There's probably actually some intermediate point where
3775        * SIMD16 with a couple of spills is still better.
3776        */
3777       if (dispatch_width == 16) {
3778          fail("Failure to register allocate.  Reduce number of "
3779               "live scalar values to avoid this.");
3780       } else {
3781          perf_debug("%s shader triggered register spilling.  "
3782                     "Try reducing the number of live scalar values to "
3783                     "improve performance.\n", stage_name);
3784       }
3785
3786       /* Since we're out of heuristics, just go spill registers until we
3787        * get an allocation.
3788        */
3789       while (!assign_regs(true)) {
3790          if (failed)
3791             break;
3792       }
3793    }
3794
3795    /* This must come after all optimization and register allocation, since
3796     * it inserts dead code that happens to have side effects, and it does
3797     * so based on the actual physical registers in use.
3798     */
3799    insert_gen4_send_dependency_workarounds();
3800
3801    if (failed)
3802       return;
3803
3804    if (!allocated_without_spills)
3805       schedule_instructions(SCHEDULE_POST);
3806
3807    if (last_scratch > 0)
3808       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3809 }
3810
3811 bool
3812 fs_visitor::run_vs()
3813 {
3814    assert(stage == MESA_SHADER_VERTEX);
3815
3816    assign_common_binding_table_offsets(0);
3817    setup_vs_payload();
3818
3819    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3820       emit_shader_time_begin();
3821
3822    foreach_in_list(ir_instruction, ir, shader->base.ir) {
3823       base_ir = ir;
3824       this->result = reg_undef;
3825       ir->accept(this);
3826    }
3827    base_ir = NULL;
3828    if (failed)
3829       return false;
3830
3831    emit_urb_writes();
3832
3833    calculate_cfg();
3834
3835    optimize();
3836
3837    assign_curb_setup();
3838    assign_vs_urb_setup();
3839
3840    fixup_3src_null_dest();
3841    allocate_registers();
3842
3843    return !failed;
3844 }
3845
3846 bool
3847 fs_visitor::run_fs()
3848 {
3849    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3850    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3851
3852    assert(stage == MESA_SHADER_FRAGMENT);
3853
3854    sanity_param_count = prog->Parameters->NumParameters;
3855
3856    assign_binding_table_offsets();
3857
3858    if (brw->gen >= 6)
3859       setup_payload_gen6();
3860    else
3861       setup_payload_gen4();
3862
3863    if (0) {
3864       emit_dummy_fs();
3865    } else if (brw->use_rep_send && dispatch_width == 16) {
3866       emit_repclear_shader();
3867    } else {
3868       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3869          emit_shader_time_begin();
3870
3871       calculate_urb_setup();
3872       if (prog->InputsRead > 0) {
3873          if (brw->gen < 6)
3874             emit_interpolation_setup_gen4();
3875          else
3876             emit_interpolation_setup_gen6();
3877       }
3878
3879       /* We handle discards by keeping track of the still-live pixels in f0.1.
3880        * Initialize it with the dispatched pixels.
3881        */
3882       if (wm_prog_data->uses_kill) {
3883          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3884          discard_init->flag_subreg = 1;
3885       }
3886
3887       /* Generate FS IR for main().  (the visitor only descends into
3888        * functions called "main").
3889        */
3890       if (shader) {
3891          if (getenv("INTEL_USE_NIR") != NULL) {
3892             emit_nir_code();
3893          } else {
3894             foreach_in_list(ir_instruction, ir, shader->base.ir) {
3895                base_ir = ir;
3896                this->result = reg_undef;
3897                ir->accept(this);
3898             }
3899          }
3900       } else {
3901          emit_fragment_program_code();
3902       }
3903       base_ir = NULL;
3904       if (failed)
3905          return false;
3906
3907       emit(FS_OPCODE_PLACEHOLDER_HALT);
3908
3909       if (wm_key->alpha_test_func)
3910          emit_alpha_test();
3911
3912       emit_fb_writes();
3913
3914       calculate_cfg();
3915
3916       optimize();
3917
3918       assign_curb_setup();
3919       assign_urb_setup();
3920
3921       fixup_3src_null_dest();
3922       allocate_registers();
3923
3924       if (failed)
3925          return false;
3926    }
3927
3928    if (dispatch_width == 8)
3929       wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3930    else
3931       wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3932
3933    /* If any state parameters were appended, then ParameterValues could have
3934     * been realloced, in which case the driver uniform storage set up by
3935     * _mesa_associate_uniform_storage() would point to freed memory.  Make
3936     * sure that didn't happen.
3937     */
3938    assert(sanity_param_count == prog->Parameters->NumParameters);
3939
3940    return !failed;
3941 }
3942
3943 const unsigned *
3944 brw_wm_fs_emit(struct brw_context *brw,
3945                void *mem_ctx,
3946                const struct brw_wm_prog_key *key,
3947                struct brw_wm_prog_data *prog_data,
3948                struct gl_fragment_program *fp,
3949                struct gl_shader_program *prog,
3950                unsigned *final_assembly_size)
3951 {
3952    bool start_busy = false;
3953    double start_time = 0;
3954
3955    if (unlikely(brw->perf_debug)) {
3956       start_busy = (brw->batch.last_bo &&
3957                     drm_intel_bo_busy(brw->batch.last_bo));
3958       start_time = get_time();
3959    }
3960
3961    struct brw_shader *shader = NULL;
3962    if (prog)
3963       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3964
3965    if (unlikely(INTEL_DEBUG & DEBUG_WM))
3966       brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3967
3968    /* Now the main event: Visit the shader IR and generate our FS IR for it.
3969     */
3970    fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3971    if (!v.run_fs()) {
3972       if (prog) {
3973          prog->LinkStatus = false;
3974          ralloc_strcat(&prog->InfoLog, v.fail_msg);
3975       }
3976
3977       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3978                     v.fail_msg);
3979
3980       return NULL;
3981    }
3982
3983    cfg_t *simd16_cfg = NULL;
3984    fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3985    if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3986                                brw->use_rep_send)) {
3987       if (!v.simd16_unsupported) {
3988          /* Try a SIMD16 compile */
3989          v2.import_uniforms(&v);
3990          if (!v2.run_fs()) {
3991             perf_debug("SIMD16 shader failed to compile, falling back to "
3992                        "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3993          } else {
3994             simd16_cfg = v2.cfg;
3995          }
3996       } else {
3997          perf_debug("SIMD16 shader unsupported, falling back to "
3998                     "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3999       }
4000    }
4001
4002    cfg_t *simd8_cfg;
4003    int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4004    if (no_simd8 && simd16_cfg) {
4005       simd8_cfg = NULL;
4006       prog_data->no_8 = true;
4007    } else {
4008       simd8_cfg = v.cfg;
4009       prog_data->no_8 = false;
4010    }
4011
4012    fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4013                   &fp->Base, v.runtime_check_aads_emit, "FS");
4014
4015    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4016       char *name;
4017       if (prog)
4018          name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4019                                 prog->Label ? prog->Label : "unnamed",
4020                                 prog->Name);
4021       else
4022          name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4023
4024       g.enable_debug(name);
4025    }
4026
4027    if (simd8_cfg)
4028       g.generate_code(simd8_cfg, 8);
4029    if (simd16_cfg)
4030       prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4031
4032    if (unlikely(brw->perf_debug) && shader) {
4033       if (shader->compiled_once)
4034          brw_wm_debug_recompile(brw, prog, key);
4035       shader->compiled_once = true;
4036
4037       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4038          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4039                     (get_time() - start_time) * 1000);
4040       }
4041    }
4042
4043    return g.get_assembly(final_assembly_size);
4044 }
4045
4046 extern "C" bool
4047 brw_fs_precompile(struct gl_context *ctx,
4048                   struct gl_shader_program *shader_prog,
4049                   struct gl_program *prog)
4050 {
4051    struct brw_context *brw = brw_context(ctx);
4052    struct brw_wm_prog_key key;
4053
4054    struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4055    struct brw_fragment_program *bfp = brw_fragment_program(fp);
4056    bool program_uses_dfdy = fp->UsesDFdy;
4057
4058    memset(&key, 0, sizeof(key));
4059
4060    if (brw->gen < 6) {
4061       if (fp->UsesKill)
4062          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4063
4064       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4065          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4066
4067       /* Just assume depth testing. */
4068       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4069       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4070    }
4071
4072    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4073                                          BRW_FS_VARYING_INPUT_MASK) > 16)
4074       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4075
4076    const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4077    unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4078    for (unsigned i = 0; i < sampler_count; i++) {
4079       if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4080          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4081          key.tex.swizzles[i] =
4082             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4083       } else {
4084          /* Color sampler: assume no swizzling. */
4085          key.tex.swizzles[i] = SWIZZLE_XYZW;
4086       }
4087    }
4088
4089    if (fp->Base.InputsRead & VARYING_BIT_POS) {
4090       key.drawable_height = ctx->DrawBuffer->Height;
4091    }
4092
4093    key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4094          ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4095          BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4096
4097    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4098       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4099                           key.nr_color_regions > 1;
4100    }
4101
4102    key.program_string_id = bfp->id;
4103
4104    uint32_t old_prog_offset = brw->wm.base.prog_offset;
4105    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4106
4107    bool success = do_wm_prog(brw, shader_prog, bfp, &key);
4108
4109    brw->wm.base.prog_offset = old_prog_offset;
4110    brw->wm.prog_data = old_prog_data;
4111
4112    return success;
4113 }