src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 #include <sys/types.h>
  32
  33 #include "util/hash_table.h"
  34 #include "main/macros.h"
  35 #include "main/shaderobj.h"
  36 #include "main/fbobject.h"
  37 #include "program/prog_parameter.h"
  38 #include "program/prog_print.h"
  39 #include "util/register_allocate.h"
  40 #include "program/hash_table.h"
  41 #include "brw_context.h"
  42 #include "brw_eu.h"
  43 #include "brw_wm.h"
  44 #include "brw_fs.h"
  45 #include "brw_cfg.h"
  46 #include "brw_dead_control_flow.h"
  47 #include "main/uniforms.h"
  48 #include "brw_fs_live_variables.h"
  49 #include "glsl/glsl_types.h"
  50 #include "program/sampler.h"
  51
  52 void
  53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
  54               const fs_reg *src, unsigned sources)
  55 {
  56    memset(this, 0, sizeof(*this));
  57
  58    this->src = new fs_reg[MAX2(sources, 3)];
  59    for (unsigned i = 0; i < sources; i++)
  60       this->src[i] = src[i];
  61
  62    this->opcode = opcode;
  63    this->dst = dst;
  64    this->sources = sources;
  65    this->exec_size = exec_size;
  66
  67    assert(dst.file != IMM && dst.file != UNIFORM);
  68
  69    /* If exec_size == 0, try to guess it from the registers.  Since all
  70     * manner of things may use hardware registers, we first try to guess
  71     * based on GRF registers.  If this fails, we will go ahead and take the
  72     * width from the destination register.
  73     */
  74    if (this->exec_size == 0) {
  75       if (dst.file == GRF) {
  76          this->exec_size = dst.width;
  77       } else {
  78          for (unsigned i = 0; i < sources; ++i) {
  79             if (src[i].file != GRF && src[i].file != ATTR)
  80                continue;
  81
  82             if (this->exec_size <= 1)
  83                this->exec_size = src[i].width;
  84             assert(src[i].width == 1 || src[i].width == this->exec_size);
  85          }
  86       }
  87
  88       if (this->exec_size == 0 && dst.file != BAD_FILE)
  89          this->exec_size = dst.width;
  90    }
  91    assert(this->exec_size != 0);
  92
  93    for (unsigned i = 0; i < sources; ++i) {
  94       switch (this->src[i].file) {
  95       case BAD_FILE:
  96          this->src[i].effective_width = 8;
  97          break;
  98       case GRF:
  99       case HW_REG:
 100       case ATTR:
 101          assert(this->src[i].width > 0);
 102          if (this->src[i].width == 1) {
 103             this->src[i].effective_width = this->exec_size;
 104          } else {
 105             this->src[i].effective_width = this->src[i].width;
 106          }
 107          break;
 108       case IMM:
 109       case UNIFORM:
 110          this->src[i].effective_width = this->exec_size;
 111          break;
 112       default:
 113          unreachable("Invalid source register file");
 114       }
 115    }
 116    this->dst.effective_width = this->exec_size;
 117
 118    this->conditional_mod = BRW_CONDITIONAL_NONE;
 119
 120    /* This will be the case for almost all instructions. */
 121    switch (dst.file) {
 122    case GRF:
 123    case HW_REG:
 124    case MRF:
 125    case ATTR:
 126       this->regs_written =
 127          DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
 128       break;
 129    case BAD_FILE:
 130       this->regs_written = 0;
 131       break;
 132    case IMM:
 133    case UNIFORM:
 134       unreachable("Invalid destination register file");
 135    default:
 136       unreachable("Invalid register file");
 137    }
 138
 139    this->writes_accumulator = false;
 140 }
 141
 142 fs_inst::fs_inst()
 143 {
 144    init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
 145 }
 146
 147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
 148 {
 149    init(opcode, exec_size, reg_undef, NULL, 0);
 150 }
 151
 152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
 153 {
 154    init(opcode, 0, dst, NULL, 0);
 155 }
 156
 157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 158                  const fs_reg &src0)
 159 {
 160    const fs_reg src[1] = { src0 };
 161    init(opcode, exec_size, dst, src, 1);
 162 }
 163
 164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
 165 {
 166    const fs_reg src[1] = { src0 };
 167    init(opcode, 0, dst, src, 1);
 168 }
 169
 170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 171                  const fs_reg &src0, const fs_reg &src1)
 172 {
 173    const fs_reg src[2] = { src0, src1 };
 174    init(opcode, exec_size, dst, src, 2);
 175 }
 176
 177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 178                  const fs_reg &src1)
 179 {
 180    const fs_reg src[2] = { src0, src1 };
 181    init(opcode, 0, dst, src, 2);
 182 }
 183
 184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 185                  const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
 186 {
 187    const fs_reg src[3] = { src0, src1, src2 };
 188    init(opcode, exec_size, dst, src, 3);
 189 }
 190
 191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 192                  const fs_reg &src1, const fs_reg &src2)
 193 {
 194    const fs_reg src[3] = { src0, src1, src2 };
 195    init(opcode, 0, dst, src, 3);
 196 }
 197
 198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
 199                  const fs_reg src[], unsigned sources)
 200 {
 201    init(opcode, 0, dst, src, sources);
 202 }
 203
 204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
 205                  const fs_reg src[], unsigned sources)
 206 {
 207    init(opcode, exec_width, dst, src, sources);
 208 }
 209
 210 fs_inst::fs_inst(const fs_inst &that)
 211 {
 212    memcpy(this, &that, sizeof(that));
 213
 214    this->src = new fs_reg[MAX2(that.sources, 3)];
 215
 216    for (unsigned i = 0; i < that.sources; i++)
 217       this->src[i] = that.src[i];
 218 }
 219
 220 fs_inst::~fs_inst()
 221 {
 222    delete[] this->src;
 223 }
 224
 225 void
 226 fs_inst::resize_sources(uint8_t num_sources)
 227 {
 228    if (this->sources != num_sources) {
 229       fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
 230
 231       for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
 232          src[i] = this->src[i];
 233
 234       delete[] this->src;
 235       this->src = src;
 236       this->sources = num_sources;
 237    }
 238 }
 239
 240 #define ALU1(op)                                                        \
 241    fs_inst *                                                            \
 242    fs_visitor::op(const fs_reg &dst, const fs_reg &src0)                \
 243    {                                                                    \
 244       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 245    }
 246
 247 #define ALU2(op)                                                        \
 248    fs_inst *                                                            \
 249    fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
 250                   const fs_reg &src1)                                   \
 251    {                                                                    \
 252       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 253    }
 254
 255 #define ALU2_ACC(op)                                                    \
 256    fs_inst *                                                            \
 257    fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
 258                   const fs_reg &src1)                                   \
 259    {                                                                    \
 260       fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
 261       inst->writes_accumulator = true;                                  \
 262       return inst;                                                      \
 263    }
 264
 265 #define ALU3(op)                                                        \
 266    fs_inst *                                                            \
 267    fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
 268                   const fs_reg &src1, const fs_reg &src2)               \
 269    {                                                                    \
 270       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
 271    }
 272
/* Instantiate one fs_visitor builder method per ALU opcode.  The macro used
 * fixes the number of sources; the ALU2_ACC entries (MACH, ADDC, SUBB)
 * additionally mark the instruction as writing the accumulator.
 */
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(SEL)
ALU2(MAC)
 301
 302 /** Gen4 predicated IF. */
 303 fs_inst *
 304 fs_visitor::IF(enum brw_predicate predicate)
 305 {
 306    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
 307    inst->predicate = predicate;
 308    return inst;
 309 }
 310
 311 /** Gen6 IF with embedded comparison. */
 312 fs_inst *
 313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
 314                enum brw_conditional_mod condition)
 315 {
 316    assert(brw->gen == 6);
 317    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
 318                                         reg_null_d, src0, src1);
 319    inst->conditional_mod = condition;
 320    return inst;
 321 }
 322
 323 /**
 324  * CMP: Sets the low bit of the destination channels with the result
 325  * of the comparison, while the upper bits are undefined, and updates
 326  * the flag register with the packed 16 bits of the result.
 327  */
 328 fs_inst *
 329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
 330                 enum brw_conditional_mod condition)
 331 {
 332    fs_inst *inst;
 333
 334    /* Take the instruction:
 335     *
 336     * CMP null<d> src0<f> src1<f>
 337     *
 338     * Original gen4 does type conversion to the destination type before
 339     * comparison, producing garbage results for floating point comparisons.
 340     *
 341     * The destination type doesn't matter on newer generations, so we set the
 342     * type to match src0 so we can compact the instruction.
 343     */
 344    dst.type = src0.type;
 345    if (dst.file == HW_REG)
 346       dst.fixed_hw_reg.type = dst.type;
 347
 348    resolve_ud_negate(&src0);
 349    resolve_ud_negate(&src1);
 350
 351    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 352    inst->conditional_mod = condition;
 353
 354    return inst;
 355 }
 356
 357 fs_inst *
 358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
 359 {
 360    uint8_t exec_size = dst.width;
 361    for (int i = 0; i < sources; ++i) {
 362       assert(src[i].width % dst.width == 0);
 363       if (src[i].width > exec_size)
 364          exec_size = src[i].width;
 365    }
 366
 367    fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
 368                                         dst, src, sources);
 369    inst->regs_written = 0;
 370    for (int i = 0; i < sources; ++i) {
 371       /* The LOAD_PAYLOAD instruction only really makes sense if we are
 372        * dealing with whole registers.  If this ever changes, we can deal
 373        * with it later.
 374        */
 375       int size = inst->src[i].effective_width * type_sz(src[i].type);
 376       assert(size % 32 == 0);
 377       inst->regs_written += (size + 31) / 32;
 378    }
 379
 380    return inst;
 381 }
 382
 383 exec_list
 384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
 385                                        const fs_reg &surf_index,
 386                                        const fs_reg &varying_offset,
 387                                        uint32_t const_offset)
 388 {
 389    exec_list instructions;
 390    fs_inst *inst;
 391
 392    /* We have our constant surface use a pitch of 4 bytes, so our index can
 393     * be any component of a vector, and then we load 4 contiguous
 394     * components starting from that.
 395     *
 396     * We break down the const_offset to a portion added to the variable
 397     * offset and a portion done using reg_offset, which means that if you
 398     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 399     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 400     * CSE can later notice that those loads are all the same and eliminate
 401     * the redundant ones.
 402     */
 403    fs_reg vec4_offset = vgrf(glsl_type::int_type);
 404    instructions.push_tail(ADD(vec4_offset,
 405                               varying_offset, fs_reg(const_offset & ~3)));
 406
 407    int scale = 1;
 408    if (brw->gen == 4 && dst.width == 8) {
 409       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 410        * u, v, r) as parameters, or we can just use the SIMD16 message
 411        * consisting of (header, u).  We choose the second, at the cost of a
 412        * longer return length.
 413        */
 414       scale = 2;
 415    }
 416
 417    enum opcode op;
 418    if (brw->gen >= 7)
 419       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 420    else
 421       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 422
 423    assert(dst.width % 8 == 0);
 424    int regs_written = 4 * (dst.width / 8) * scale;
 425    fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
 426                                dst.type, dst.width);
 427    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
 428    inst->regs_written = regs_written;
 429    instructions.push_tail(inst);
 430
 431    if (brw->gen < 7) {
 432       inst->base_mrf = 13;
 433       inst->header_present = true;
 434       if (brw->gen == 4)
 435          inst->mlen = 3;
 436       else
 437          inst->mlen = 1 + dispatch_width / 8;
 438    }
 439
 440    fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
 441    instructions.push_tail(MOV(dst, result));
 442
 443    return instructions;
 444 }
 445
 446 /**
 447  * A helper for MOV generation for fixing up broken hardware SEND dependency
 448  * handling.
 449  */
 450 fs_inst *
 451 fs_visitor::DEP_RESOLVE_MOV(int grf)
 452 {
 453    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 454
 455    inst->ir = NULL;
 456    inst->annotation = "send dependency resolve";
 457
 458    /* The caller always wants uncompressed to emit the minimal extra
 459     * dependencies, and to avoid having to deal with aligning its regs to 2.
 460     */
 461    inst->exec_size = 8;
 462
 463    return inst;
 464 }
 465
 466 bool
 467 fs_inst::equals(fs_inst *inst) const
 468 {
 469    return (opcode == inst->opcode &&
 470            dst.equals(inst->dst) &&
 471            src[0].equals(inst->src[0]) &&
 472            src[1].equals(inst->src[1]) &&
 473            src[2].equals(inst->src[2]) &&
 474            saturate == inst->saturate &&
 475            predicate == inst->predicate &&
 476            conditional_mod == inst->conditional_mod &&
 477            mlen == inst->mlen &&
 478            base_mrf == inst->base_mrf &&
 479            target == inst->target &&
 480            eot == inst->eot &&
 481            header_present == inst->header_present &&
 482            shadow_compare == inst->shadow_compare &&
 483            exec_size == inst->exec_size &&
 484            offset == inst->offset);
 485 }
 486
 487 bool
 488 fs_inst::overwrites_reg(const fs_reg &reg) const
 489 {
 490    return reg.in_range(dst, regs_written);
 491 }
 492
 493 bool
 494 fs_inst::is_send_from_grf() const
 495 {
 496    switch (opcode) {
 497    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
 498    case SHADER_OPCODE_SHADER_TIME_ADD:
 499    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
 500    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
 501    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
 502    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 503    case SHADER_OPCODE_UNTYPED_ATOMIC:
 504    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 505    case SHADER_OPCODE_URB_WRITE_SIMD8:
 506       return true;
 507    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 508       return src[1].file == GRF;
 509    case FS_OPCODE_FB_WRITE:
 510       return src[0].file == GRF;
 511    default:
 512       if (is_tex())
 513          return src[0].file == GRF;
 514
 515       return false;
 516    }
 517 }
 518
 519 bool
 520 fs_inst::can_do_source_mods(struct brw_context *brw)
 521 {
 522    if (brw->gen == 6 && is_math())
 523       return false;
 524
 525    if (is_send_from_grf())
 526       return false;
 527
 528    if (!backend_instruction::can_do_source_mods())
 529       return false;
 530
 531    return true;
 532 }
 533
 534 void
 535 fs_reg::init()
 536 {
 537    memset(this, 0, sizeof(*this));
 538    stride = 1;
 539 }
 540
 541 /** Generic unset register constructor. */
 542 fs_reg::fs_reg()
 543 {
 544    init();
 545    this->file = BAD_FILE;
 546 }
 547
 548 /** Immediate value constructor. */
 549 fs_reg::fs_reg(float f)
 550 {
 551    init();
 552    this->file = IMM;
 553    this->type = BRW_REGISTER_TYPE_F;
 554    this->fixed_hw_reg.dw1.f = f;
 555    this->width = 1;
 556 }
 557
 558 /** Immediate value constructor. */
 559 fs_reg::fs_reg(int32_t i)
 560 {
 561    init();
 562    this->file = IMM;
 563    this->type = BRW_REGISTER_TYPE_D;
 564    this->fixed_hw_reg.dw1.d = i;
 565    this->width = 1;
 566 }
 567
 568 /** Immediate value constructor. */
 569 fs_reg::fs_reg(uint32_t u)
 570 {
 571    init();
 572    this->file = IMM;
 573    this->type = BRW_REGISTER_TYPE_UD;
 574    this->fixed_hw_reg.dw1.ud = u;
 575    this->width = 1;
 576 }
 577
 578 /** Vector float immediate value constructor. */
 579 fs_reg::fs_reg(uint8_t vf[4])
 580 {
 581    init();
 582    this->file = IMM;
 583    this->type = BRW_REGISTER_TYPE_VF;
 584    memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
 585 }
 586
 587 /** Vector float immediate value constructor. */
 588 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
 589 {
 590    init();
 591    this->file = IMM;
 592    this->type = BRW_REGISTER_TYPE_VF;
 593    this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
 594                                (vf1 <<  8) |
 595                                (vf2 << 16) |
 596                                (vf3 << 24);
 597 }
 598
 599 /** Fixed brw_reg. */
 600 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 601 {
 602    init();
 603    this->file = HW_REG;
 604    this->fixed_hw_reg = fixed_hw_reg;
 605    this->type = fixed_hw_reg.type;
 606    this->width = 1 << fixed_hw_reg.width;
 607 }
 608
 609 bool
 610 fs_reg::equals(const fs_reg &r) const
 611 {
 612    return (file == r.file &&
 613            reg == r.reg &&
 614            reg_offset == r.reg_offset &&
 615            subreg_offset == r.subreg_offset &&
 616            type == r.type &&
 617            negate == r.negate &&
 618            abs == r.abs &&
 619            !reladdr && !r.reladdr &&
 620            memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
 621            width == r.width &&
 622            stride == r.stride);
 623 }
 624
 625 fs_reg &
 626 fs_reg::set_smear(unsigned subreg)
 627 {
 628    assert(file != HW_REG && file != IMM);
 629    subreg_offset = subreg * type_sz(type);
 630    stride = 0;
 631    return *this;
 632 }
 633
 634 bool
 635 fs_reg::is_contiguous() const
 636 {
 637    return stride == 1;
 638 }
 639
 640 int
 641 fs_visitor::type_size(const struct glsl_type *type)
 642 {
 643    unsigned int size, i;
 644
 645    switch (type->base_type) {
 646    case GLSL_TYPE_UINT:
 647    case GLSL_TYPE_INT:
 648    case GLSL_TYPE_FLOAT:
 649    case GLSL_TYPE_BOOL:
 650       return type->components();
 651    case GLSL_TYPE_ARRAY:
 652       return type_size(type->fields.array) * type->length;
 653    case GLSL_TYPE_STRUCT:
 654       size = 0;
 655       for (i = 0; i < type->length; i++) {
 656          size += type_size(type->fields.structure[i].type);
 657       }
 658       return size;
 659    case GLSL_TYPE_SAMPLER:
 660       /* Samplers take up no register space, since they're baked in at
 661        * link time.
 662        */
 663       return 0;
 664    case GLSL_TYPE_ATOMIC_UINT:
 665       return 0;
 666    case GLSL_TYPE_IMAGE:
 667    case GLSL_TYPE_VOID:
 668    case GLSL_TYPE_ERROR:
 669    case GLSL_TYPE_INTERFACE:
 670    case GLSL_TYPE_DOUBLE:
 671       unreachable("not reached");
 672    }
 673
 674    return 0;
 675 }
 676
 677 /**
 678  * Create a MOV to read the timestamp register.
 679  *
 680  * The caller is responsible for emitting the MOV.  The return value is
 681  * the destination of the MOV, with extra parameters set.
 682  */
 683 fs_reg
 684 fs_visitor::get_timestamp(fs_inst **out_mov)
 685 {
 686    assert(brw->gen >= 7);
 687
 688    fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 689                                           BRW_ARF_TIMESTAMP,
 690                                           0),
 691                              BRW_REGISTER_TYPE_UD));
 692
 693    fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
 694
 695    fs_inst *mov = MOV(dst, ts);
 696    /* We want to read the 3 fields we care about even if it's not enabled in
 697     * the dispatch.
 698     */
 699    mov->force_writemask_all = true;
 700
 701    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 702     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 703     * which is plenty of time for our purposes.  It is identical across the
 704     * EUs, but since it's tracking GPU core speed it will increment at a
 705     * varying rate as render P-states change.
 706     *
 707     * The caller could also check if render P-states have changed (or anything
 708     * else that might disrupt timing) by setting smear to 2 and checking if
 709     * that field is != 0.
 710     */
 711    dst.set_smear(0);
 712
 713    *out_mov = mov;
 714    return dst;
 715 }
 716
 717 void
 718 fs_visitor::emit_shader_time_begin()
 719 {
 720    current_annotation = "shader time start";
 721    fs_inst *mov;
 722    shader_start_time = get_timestamp(&mov);
 723    emit(mov);
 724 }
 725
 726 void
 727 fs_visitor::emit_shader_time_end()
 728 {
 729    current_annotation = "shader time end";
 730
 731    enum shader_time_shader_type type, written_type, reset_type;
 732    switch (stage) {
 733    case MESA_SHADER_VERTEX:
 734       type = ST_VS;
 735       written_type = ST_VS_WRITTEN;
 736       reset_type = ST_VS_RESET;
 737       break;
 738    case MESA_SHADER_GEOMETRY:
 739       type = ST_GS;
 740       written_type = ST_GS_WRITTEN;
 741       reset_type = ST_GS_RESET;
 742       break;
 743    case MESA_SHADER_FRAGMENT:
 744       if (dispatch_width == 8) {
 745          type = ST_FS8;
 746          written_type = ST_FS8_WRITTEN;
 747          reset_type = ST_FS8_RESET;
 748       } else {
 749          assert(dispatch_width == 16);
 750          type = ST_FS16;
 751          written_type = ST_FS16_WRITTEN;
 752          reset_type = ST_FS16_RESET;
 753       }
 754       break;
 755    default:
 756       unreachable("fs_visitor::emit_shader_time_end missing code");
 757    }
 758
 759    /* Insert our code just before the final SEND with EOT. */
 760    exec_node *end = this->instructions.get_tail();
 761    assert(end && ((fs_inst *) end)->eot);
 762
 763    fs_inst *tm_read;
 764    fs_reg shader_end_time = get_timestamp(&tm_read);
 765    end->insert_before(tm_read);
 766
 767    /* Check that there weren't any timestamp reset events (assuming these
 768     * were the only two timestamp reads that happened).
 769     */
 770    fs_reg reset = shader_end_time;
 771    reset.set_smear(2);
 772    fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
 773    test->conditional_mod = BRW_CONDITIONAL_Z;
 774    test->force_writemask_all = true;
 775    end->insert_before(test);
 776    end->insert_before(IF(BRW_PREDICATE_NORMAL));
 777
 778    fs_reg start = shader_start_time;
 779    start.negate = true;
 780    fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
 781    diff.set_smear(0);
 782    fs_inst *add = ADD(diff, start, shader_end_time);
 783    add->force_writemask_all = true;
 784    end->insert_before(add);
 785
 786    /* If there were no instructions between the two timestamp gets, the diff
 787     * is 2 cycles.  Remove that overhead, so I can forget about that when
 788     * trying to determine the time taken for single instructions.
 789     */
 790    add = ADD(diff, diff, fs_reg(-2u));
 791    add->force_writemask_all = true;
 792    end->insert_before(add);
 793
 794    end->insert_before(SHADER_TIME_ADD(type, diff));
 795    end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
 796    end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
 797    end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
 798    end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
 799 }
 800
 801 fs_inst *
 802 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
 803 {
 804    int shader_time_index =
 805       brw_get_shader_time_index(brw, shader_prog, prog, type);
 806    fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
 807
 808    fs_reg payload;
 809    if (dispatch_width == 8)
 810       payload = vgrf(glsl_type::uvec2_type);
 811    else
 812       payload = vgrf(glsl_type::uint_type);
 813
 814    return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
 815                                fs_reg(), payload, offset, value);
 816 }
 817
 818 void
 819 fs_visitor::vfail(const char *format, va_list va)
 820 {
 821    char *msg;
 822
 823    if (failed)
 824       return;
 825
 826    failed = true;
 827
 828    msg = ralloc_vasprintf(mem_ctx, format, va);
 829    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
 830
 831    this->fail_msg = msg;
 832
 833    if (debug_enabled) {
 834       fprintf(stderr, "%s",  msg);
 835    }
 836 }
 837
 838 void
 839 fs_visitor::fail(const char *format, ...)
 840 {
 841    va_list va;
 842
 843    va_start(va, format);
 844    vfail(format, va);
 845    va_end(va);
 846 }
 847
 848 /**
 849  * Mark this program as impossible to compile in SIMD16 mode.
 850  *
 851  * During the SIMD8 compile (which happens first), we can detect and flag
 852  * things that are unsupported in SIMD16 mode, so the compiler can skip
 853  * the SIMD16 compile altogether.
 854  *
 855  * During a SIMD16 compile (if one happens anyway), this just calls fail().
 856  */
 857 void
 858 fs_visitor::no16(const char *format, ...)
 859 {
 860    va_list va;
 861
 862    va_start(va, format);
 863
 864    if (dispatch_width == 16) {
 865       vfail(format, va);
 866    } else {
 867       simd16_unsupported = true;
 868
 869       if (brw->perf_debug) {
 870          if (no16_msg)
 871             ralloc_vasprintf_append(&no16_msg, format, va);
 872          else
 873             no16_msg = ralloc_vasprintf(mem_ctx, format, va);
 874       }
 875    }
 876
 877    va_end(va);
 878 }
 879
 880 fs_inst *
 881 fs_visitor::emit(enum opcode opcode)
 882 {
 883    return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
 884 }
 885
 886 fs_inst *
 887 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
 888 {
 889    return emit(new(mem_ctx) fs_inst(opcode, dst));
 890 }
 891
 892 fs_inst *
 893 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
 894 {
 895    return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
 896 }
 897
 898 fs_inst *
 899 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 900                  const fs_reg &src1)
 901 {
 902    return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
 903 }
 904
 905 fs_inst *
 906 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 907                  const fs_reg &src1, const fs_reg &src2)
 908 {
 909    return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
 910 }
 911
 912 fs_inst *
 913 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
 914                  fs_reg src[], int sources)
 915 {
 916    return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
 917 }
 918
 919 /**
 920  * Returns true if the instruction has a flag that means it won't
 921  * update an entire destination register.
 922  *
 923  * For example, dead code elimination and live variable analysis want to know
 924  * when a write to a variable screens off any preceding values that were in
 925  * it.
 926  */
 927 bool
 928 fs_inst::is_partial_write() const
 929 {
 930    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 931            (this->dst.width * type_sz(this->dst.type)) < 32 ||
 932            !this->dst.is_contiguous());
 933 }
 934
 935 int
 936 fs_inst::regs_read(int arg) const
 937 {
 938    if (is_tex() && arg == 0 && src[0].file == GRF) {
 939       return mlen;
 940    } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
 941       return mlen;
 942    } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
 943       return mlen;
 944    } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
 945       return mlen;
 946    } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
 947       return mlen;
 948    } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
 949       return mlen;
 950    }
 951
 952    switch (src[arg].file) {
 953    case BAD_FILE:
 954    case UNIFORM:
 955    case IMM:
 956       return 1;
 957    case GRF:
 958    case HW_REG:
 959       if (src[arg].stride == 0) {
 960          return 1;
 961       } else {
 962          int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
 963          return (size + 31) / 32;
 964       }
 965    case MRF:
 966       unreachable("MRF registers are not allowed as sources");
 967    default:
 968       unreachable("Invalid register file");
 969    }
 970 }
 971
 972 bool
 973 fs_inst::reads_flag() const
 974 {
 975    return predicate;
 976 }
 977
 978 bool
 979 fs_inst::writes_flag() const
 980 {
 981    return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
 982                                opcode != BRW_OPCODE_IF &&
 983                                opcode != BRW_OPCODE_WHILE)) ||
 984           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 985 }
 986
 987 /**
 988  * Returns how many MRFs an FS opcode will write over.
 989  *
 990  * Note that this is not the 0 or 1 implied writes in an actual gen
 991  * instruction -- the FS opcodes often generate MOVs in addition.
 992  */
 993 int
 994 fs_visitor::implied_mrf_writes(fs_inst *inst)
 995 {
 996    if (inst->mlen == 0)
 997       return 0;
 998
 999    if (inst->base_mrf == -1)
1000       return 0;
1001
1002    switch (inst->opcode) {
1003    case SHADER_OPCODE_RCP:
1004    case SHADER_OPCODE_RSQ:
1005    case SHADER_OPCODE_SQRT:
1006    case SHADER_OPCODE_EXP2:
1007    case SHADER_OPCODE_LOG2:
1008    case SHADER_OPCODE_SIN:
1009    case SHADER_OPCODE_COS:
1010       return 1 * dispatch_width / 8;
1011    case SHADER_OPCODE_POW:
1012    case SHADER_OPCODE_INT_QUOTIENT:
1013    case SHADER_OPCODE_INT_REMAINDER:
1014       return 2 * dispatch_width / 8;
1015    case SHADER_OPCODE_TEX:
1016    case FS_OPCODE_TXB:
1017    case SHADER_OPCODE_TXD:
1018    case SHADER_OPCODE_TXF:
1019    case SHADER_OPCODE_TXF_CMS:
1020    case SHADER_OPCODE_TXF_MCS:
1021    case SHADER_OPCODE_TG4:
1022    case SHADER_OPCODE_TG4_OFFSET:
1023    case SHADER_OPCODE_TXL:
1024    case SHADER_OPCODE_TXS:
1025    case SHADER_OPCODE_LOD:
1026       return 1;
1027    case FS_OPCODE_FB_WRITE:
1028       return 2;
1029    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1030    case SHADER_OPCODE_GEN4_SCRATCH_READ:
1031       return 1;
1032    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1033       return inst->mlen;
1034    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1035       return 2;
1036    case SHADER_OPCODE_UNTYPED_ATOMIC:
1037    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1038    case SHADER_OPCODE_URB_WRITE_SIMD8:
1039    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1040    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1041    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1042    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1043       return 0;
1044    default:
1045       unreachable("not reached");
1046    }
1047 }
1048
1049 fs_reg
1050 fs_visitor::vgrf(const glsl_type *const type)
1051 {
1052    int reg_width = dispatch_width / 8;
1053    return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1054                  brw_type_for_base_type(type), dispatch_width);
1055 }
1056
1057 fs_reg
1058 fs_visitor::vgrf(int num_components)
1059 {
1060    int reg_width = dispatch_width / 8;
1061    return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1062                  BRW_REGISTER_TYPE_F, dispatch_width);
1063 }
1064
1065 /** Fixed HW reg constructor. */
1066 fs_reg::fs_reg(enum register_file file, int reg)
1067 {
1068    init();
1069    this->file = file;
1070    this->reg = reg;
1071    this->type = BRW_REGISTER_TYPE_F;
1072
1073    switch (file) {
1074    case UNIFORM:
1075       this->width = 1;
1076       break;
1077    default:
1078       this->width = 8;
1079    }
1080 }
1081
1082 /** Fixed HW reg constructor. */
1083 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1084 {
1085    init();
1086    this->file = file;
1087    this->reg = reg;
1088    this->type = type;
1089
1090    switch (file) {
1091    case UNIFORM:
1092       this->width = 1;
1093       break;
1094    default:
1095       this->width = 8;
1096    }
1097 }
1098
1099 /** Fixed HW reg constructor. */
1100 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1101                uint8_t width)
1102 {
1103    init();
1104    this->file = file;
1105    this->reg = reg;
1106    this->type = type;
1107    this->width = width;
1108 }
1109
1110 fs_reg *
1111 fs_visitor::variable_storage(ir_variable *var)
1112 {
1113    return (fs_reg *)hash_table_find(this->variable_ht, var);
1114 }
1115
1116 void
1117 import_uniforms_callback(const void *key,
1118                          void *data,
1119                          void *closure)
1120 {
1121    struct hash_table *dst_ht = (struct hash_table *)closure;
1122    const fs_reg *reg = (const fs_reg *)data;
1123
1124    if (reg->file != UNIFORM)
1125       return;
1126
1127    hash_table_insert(dst_ht, data, key);
1128 }
1129
1130 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1131  * This brings in those uniform definitions
1132  */
1133 void
1134 fs_visitor::import_uniforms(fs_visitor *v)
1135 {
1136    hash_table_call_foreach(v->variable_ht,
1137                            import_uniforms_callback,
1138                            variable_ht);
1139    this->push_constant_loc = v->push_constant_loc;
1140    this->pull_constant_loc = v->pull_constant_loc;
1141    this->uniforms = v->uniforms;
1142    this->param_size = v->param_size;
1143 }
1144
1145 /* Our support for uniforms is piggy-backed on the struct
1146  * gl_fragment_program, because that's where the values actually
1147  * get stored, rather than in some global gl_shader_program uniform
1148  * store.
1149  */
1150 void
1151 fs_visitor::setup_uniform_values(ir_variable *ir)
1152 {
1153    int namelen = strlen(ir->name);
1154
1155    /* The data for our (non-builtin) uniforms is stored in a series of
1156     * gl_uniform_driver_storage structs for each subcomponent that
1157     * glGetUniformLocation() could name.  We know it's been set up in the same
1158     * order we'd walk the type, so walk the list of storage and find anything
1159     * with our name, or the prefix of a component that starts with our name.
1160     */
1161    unsigned params_before = uniforms;
1162    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1163       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1164
1165       if (strncmp(ir->name, storage->name, namelen) != 0 ||
1166           (storage->name[namelen] != 0 &&
1167            storage->name[namelen] != '.' &&
1168            storage->name[namelen] != '[')) {
1169          continue;
1170       }
1171
1172       unsigned slots = storage->type->component_slots();
1173       if (storage->array_elements)
1174          slots *= storage->array_elements;
1175
1176       for (unsigned i = 0; i < slots; i++) {
1177          stage_prog_data->param[uniforms++] = &storage->storage[i];
1178       }
1179    }
1180
1181    /* Make sure we actually initialized the right amount of stuff here. */
1182    assert(params_before + ir->type->component_slots() == uniforms);
1183    (void)params_before;
1184 }
1185
1186
1187 /* Our support for builtin uniforms is even scarier than non-builtin.
1188  * It sits on top of the PROG_STATE_VAR parameters that are
1189  * automatically updated from GL context state.
1190  */
1191 void
1192 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1193 {
1194    const ir_state_slot *const slots = ir->get_state_slots();
1195    assert(slots != NULL);
1196
1197    for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1198       /* This state reference has already been setup by ir_to_mesa, but we'll
1199        * get the same index back here.
1200        */
1201       int index = _mesa_add_state_reference(this->prog->Parameters,
1202                                             (gl_state_index *)slots[i].tokens);
1203
1204       /* Add each of the unique swizzles of the element as a parameter.
1205        * This'll end up matching the expected layout of the
1206        * array/matrix/structure we're trying to fill in.
1207        */
1208       int last_swiz = -1;
1209       for (unsigned int j = 0; j < 4; j++) {
1210          int swiz = GET_SWZ(slots[i].swizzle, j);
1211          if (swiz == last_swiz)
1212             break;
1213          last_swiz = swiz;
1214
1215          stage_prog_data->param[uniforms++] =
1216             &prog->Parameters->ParameterValues[index][swiz];
1217       }
1218    }
1219 }
1220
1221 fs_reg *
1222 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1223                                          bool origin_upper_left)
1224 {
1225    assert(stage == MESA_SHADER_FRAGMENT);
1226    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1227    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1228    fs_reg wpos = *reg;
1229    bool flip = !origin_upper_left ^ key->render_to_fbo;
1230
1231    /* gl_FragCoord.x */
1232    if (pixel_center_integer) {
1233       emit(MOV(wpos, this->pixel_x));
1234    } else {
1235       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1236    }
1237    wpos = offset(wpos, 1);
1238
1239    /* gl_FragCoord.y */
1240    if (!flip && pixel_center_integer) {
1241       emit(MOV(wpos, this->pixel_y));
1242    } else {
1243       fs_reg pixel_y = this->pixel_y;
1244       float offset = (pixel_center_integer ? 0.0 : 0.5);
1245
1246       if (flip) {
1247          pixel_y.negate = true;
1248          offset += key->drawable_height - 1.0;
1249       }
1250
1251       emit(ADD(wpos, pixel_y, fs_reg(offset)));
1252    }
1253    wpos = offset(wpos, 1);
1254
1255    /* gl_FragCoord.z */
1256    if (brw->gen >= 6) {
1257       emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1258    } else {
1259       emit(FS_OPCODE_LINTERP, wpos,
1260            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1261            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1262            interp_reg(VARYING_SLOT_POS, 2));
1263    }
1264    wpos = offset(wpos, 1);
1265
1266    /* gl_FragCoord.w: Already set up in emit_interpolation */
1267    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1268
1269    return reg;
1270 }
1271
1272 fs_inst *
1273 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1274                          glsl_interp_qualifier interpolation_mode,
1275                          bool is_centroid, bool is_sample)
1276 {
1277    brw_wm_barycentric_interp_mode barycoord_mode;
1278    if (brw->gen >= 6) {
1279       if (is_centroid) {
1280          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1281             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1282          else
1283             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1284       } else if (is_sample) {
1285           if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1286             barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1287          else
1288             barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1289       } else {
1290          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1291             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1292          else
1293             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1294       }
1295    } else {
1296       /* On Ironlake and below, there is only one interpolation mode.
1297        * Centroid interpolation doesn't mean anything on this hardware --
1298        * there is no multisampling.
1299        */
1300       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1301    }
1302    return emit(FS_OPCODE_LINTERP, attr,
1303                this->delta_x[barycoord_mode],
1304                this->delta_y[barycoord_mode], interp);
1305 }
1306
1307 void
1308 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1309                                        const glsl_type *type,
1310                                        glsl_interp_qualifier interpolation_mode,
1311                                        int location, bool mod_centroid,
1312                                        bool mod_sample)
1313 {
1314    attr.type = brw_type_for_base_type(type->get_scalar_type());
1315
1316    assert(stage == MESA_SHADER_FRAGMENT);
1317    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1318    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1319
1320    unsigned int array_elements;
1321
1322    if (type->is_array()) {
1323       array_elements = type->length;
1324       if (array_elements == 0) {
1325          fail("dereferenced array '%s' has length 0\n", name);
1326       }
1327       type = type->fields.array;
1328    } else {
1329       array_elements = 1;
1330    }
1331
1332    if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1333       bool is_gl_Color =
1334          location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1335       if (key->flat_shade && is_gl_Color) {
1336          interpolation_mode = INTERP_QUALIFIER_FLAT;
1337       } else {
1338          interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1339       }
1340    }
1341
1342    for (unsigned int i = 0; i < array_elements; i++) {
1343       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1344          if (prog_data->urb_setup[location] == -1) {
1345             /* If there's no incoming setup data for this slot, don't
1346              * emit interpolation for it.
1347              */
1348             attr = offset(attr, type->vector_elements);
1349             location++;
1350             continue;
1351          }
1352
1353          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1354             /* Constant interpolation (flat shading) case. The SF has
1355              * handed us defined values in only the constant offset
1356              * field of the setup reg.
1357              */
1358             for (unsigned int k = 0; k < type->vector_elements; k++) {
1359                struct brw_reg interp = interp_reg(location, k);
1360                interp = suboffset(interp, 3);
1361                interp.type = attr.type;
1362                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1363                attr = offset(attr, 1);
1364             }
1365          } else {
1366             /* Smooth/noperspective interpolation case. */
1367             for (unsigned int k = 0; k < type->vector_elements; k++) {
1368                struct brw_reg interp = interp_reg(location, k);
1369                if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1370                   /* Get the pixel/sample mask into f0 so that we know
1371                    * which pixels are lit.  Then, for each channel that is
1372                    * unlit, replace the centroid data with non-centroid
1373                    * data.
1374                    */
1375                   emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1376
1377                   fs_inst *inst;
1378                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1379                                       false, false);
1380                   inst->predicate = BRW_PREDICATE_NORMAL;
1381                   inst->predicate_inverse = true;
1382                   if (brw->has_pln)
1383                      inst->no_dd_clear = true;
1384
1385                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1386                                       mod_centroid && !key->persample_shading,
1387                                       mod_sample || key->persample_shading);
1388                   inst->predicate = BRW_PREDICATE_NORMAL;
1389                   inst->predicate_inverse = false;
1390                   if (brw->has_pln)
1391                      inst->no_dd_check = true;
1392
1393                } else {
1394                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
1395                                mod_centroid && !key->persample_shading,
1396                                mod_sample || key->persample_shading);
1397                }
1398                if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1399                   emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1400                }
1401                attr = offset(attr, 1);
1402             }
1403
1404          }
1405          location++;
1406       }
1407    }
1408 }
1409
1410 fs_reg *
1411 fs_visitor::emit_frontfacing_interpolation()
1412 {
1413    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1414
1415    if (brw->gen >= 6) {
1416       /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1417        * a boolean result from this (~0/true or 0/false).
1418        *
1419        * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1420        * this task in only one instruction:
1421        *    - a negation source modifier will flip the bit; and
1422        *    - a W -> D type conversion will sign extend the bit into the high
1423        *      word of the destination.
1424        *
1425        * An ASR 15 fills the low word of the destination.
1426        */
1427       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1428       g0.negate = true;
1429
1430       emit(ASR(*reg, g0, fs_reg(15)));
1431    } else {
1432       /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1433        * a boolean result from this (1/true or 0/false).
1434        *
1435        * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1436        * the negation source modifier to flip it. Unfortunately the SHR
1437        * instruction only operates on UD (or D with an abs source modifier)
1438        * sources without negation.
1439        *
1440        * Instead, use ASR (which will give ~0/true or 0/false).
1441        */
1442       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1443       g1_6.negate = true;
1444
1445       emit(ASR(*reg, g1_6, fs_reg(31)));
1446    }
1447
1448    return reg;
1449 }
1450
1451 void
1452 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1453 {
1454    assert(stage == MESA_SHADER_FRAGMENT);
1455    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1456    assert(dst.type == BRW_REGISTER_TYPE_F);
1457
1458    if (key->compute_pos_offset) {
1459       /* Convert int_sample_pos to floating point */
1460       emit(MOV(dst, int_sample_pos));
1461       /* Scale to the range [0, 1] */
1462       emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1463    }
1464    else {
1465       /* From ARB_sample_shading specification:
1466        * "When rendering to a non-multisample buffer, or if multisample
1467        *  rasterization is disabled, gl_SamplePosition will always be
1468        *  (0.5, 0.5).
1469        */
1470       emit(MOV(dst, fs_reg(0.5f)));
1471    }
1472 }
1473
1474 fs_reg *
1475 fs_visitor::emit_samplepos_setup()
1476 {
1477    assert(brw->gen >= 6);
1478
1479    this->current_annotation = "compute sample position";
1480    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1481    fs_reg pos = *reg;
1482    fs_reg int_sample_x = vgrf(glsl_type::int_type);
1483    fs_reg int_sample_y = vgrf(glsl_type::int_type);
1484
1485    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1486     * mode will be enabled.
1487     *
1488     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1489     * R31.1:0         Position Offset X/Y for Slot[3:0]
1490     * R31.3:2         Position Offset X/Y for Slot[7:4]
1491     * .....
1492     *
1493     * The X, Y sample positions come in as bytes in  thread payload. So, read
1494     * the positions using vstride=16, width=8, hstride=2.
1495     */
1496    struct brw_reg sample_pos_reg =
1497       stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1498                     BRW_REGISTER_TYPE_B), 16, 8, 2);
1499
1500    if (dispatch_width == 8) {
1501       emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1502    } else {
1503       emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1504       emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1505          ->force_sechalf = true;
1506    }
1507    /* Compute gl_SamplePosition.x */
1508    compute_sample_position(pos, int_sample_x);
1509    pos = offset(pos, 1);
1510    if (dispatch_width == 8) {
1511       emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1512    } else {
1513       emit(MOV(half(int_sample_y, 0),
1514                fs_reg(suboffset(sample_pos_reg, 1))));
1515       emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1516          ->force_sechalf = true;
1517    }
1518    /* Compute gl_SamplePosition.y */
1519    compute_sample_position(pos, int_sample_y);
1520    return reg;
1521 }
1522
1523 fs_reg *
1524 fs_visitor::emit_sampleid_setup()
1525 {
1526    assert(stage == MESA_SHADER_FRAGMENT);
1527    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1528    assert(brw->gen >= 6);
1529
1530    this->current_annotation = "compute sample id";
1531    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1532
1533    if (key->compute_sample_id) {
1534       fs_reg t1 = vgrf(glsl_type::int_type);
1535       fs_reg t2 = vgrf(glsl_type::int_type);
1536       t2.type = BRW_REGISTER_TYPE_UW;
1537
1538       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1539        * 8x multisampling, subspan 0 will represent sample N (where N
1540        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1541        * 7. We can find the value of N by looking at R0.0 bits 7:6
1542        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1543        * (since samples are always delivered in pairs). That is, we
1544        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1545        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1546        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1547        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1548        * populating a temporary variable with the sequence (0, 1, 2, 3),
1549        * and then reading from it using vstride=1, width=4, hstride=0.
1550        * These computations hold good for 4x multisampling as well.
1551        *
1552        * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1553        * the first four slots are sample 0 of subspan 0; the next four
1554        * are sample 1 of subspan 0; the third group is sample 0 of
1555        * subspan 1, and finally sample 1 of subspan 1.
1556        */
1557       fs_inst *inst;
1558       inst = emit(BRW_OPCODE_AND, t1,
1559                   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1560                   fs_reg(0xc0));
1561       inst->force_writemask_all = true;
1562       inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1563       inst->force_writemask_all = true;
1564       /* This works for both SIMD8 and SIMD16 */
1565       inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1566       inst->force_writemask_all = true;
1567       /* This special instruction takes care of setting vstride=1,
1568        * width=4, hstride=0 of t2 during an ADD instruction.
1569        */
1570       emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1571    } else {
1572       /* As per GL_ARB_sample_shading specification:
1573        * "When rendering to a non-multisample buffer, or if multisample
1574        *  rasterization is disabled, gl_SampleID will always be zero."
1575        */
1576       emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1577    }
1578
1579    return reg;
1580 }
1581
1582 void
1583 fs_visitor::resolve_source_modifiers(fs_reg *src)
1584 {
1585    if (!src->abs && !src->negate)
1586       return;
1587
1588    fs_reg temp = retype(vgrf(1), src->type);
1589    emit(MOV(temp, *src));
1590    *src = temp;
1591 }
1592
1593 fs_reg
1594 fs_visitor::fix_math_operand(fs_reg src)
1595 {
1596    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1597     * might be able to do better by doing execsize = 1 math and then
1598     * expanding that result out, but we would need to be careful with
1599     * masking.
1600     *
1601     * The hardware ignores source modifiers (negate and abs) on math
1602     * instructions, so we also move to a temp to set those up.
1603     */
1604    if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1605        !src.abs && !src.negate)
1606       return src;
1607
1608    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1609     * operands to math
1610     */
1611    if (brw->gen >= 7 && src.file != IMM)
1612       return src;
1613
1614    fs_reg expanded = vgrf(glsl_type::float_type);
1615    expanded.type = src.type;
1616    emit(BRW_OPCODE_MOV, expanded, src);
1617    return expanded;
1618 }
1619
1620 fs_inst *
1621 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1622 {
1623    switch (opcode) {
1624    case SHADER_OPCODE_RCP:
1625    case SHADER_OPCODE_RSQ:
1626    case SHADER_OPCODE_SQRT:
1627    case SHADER_OPCODE_EXP2:
1628    case SHADER_OPCODE_LOG2:
1629    case SHADER_OPCODE_SIN:
1630    case SHADER_OPCODE_COS:
1631       break;
1632    default:
1633       unreachable("not reached: bad math opcode");
1634    }
1635
1636    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
1637     * might be able to do better by doing execsize = 1 math and then
1638     * expanding that result out, but we would need to be careful with
1639     * masking.
1640     *
1641     * Gen 6 hardware ignores source modifiers (negate and abs) on math
1642     * instructions, so we also move to a temp to set those up.
1643     */
1644    if (brw->gen == 6 || brw->gen == 7)
1645       src = fix_math_operand(src);
1646
1647    fs_inst *inst = emit(opcode, dst, src);
1648
1649    if (brw->gen < 6) {
1650       inst->base_mrf = 2;
1651       inst->mlen = dispatch_width / 8;
1652    }
1653
1654    return inst;
1655 }
1656
1657 fs_inst *
1658 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1659 {
1660    int base_mrf = 2;
1661    fs_inst *inst;
1662
1663    if (brw->gen >= 8) {
1664       inst = emit(opcode, dst, src0, src1);
1665    } else if (brw->gen >= 6) {
1666       src0 = fix_math_operand(src0);
1667       src1 = fix_math_operand(src1);
1668
1669       inst = emit(opcode, dst, src0, src1);
1670    } else {
1671       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1672        * "Message Payload":
1673        *
1674        * "Operand0[7].  For the INT DIV functions, this operand is the
1675        *  denominator."
1676        *  ...
1677        * "Operand1[7].  For the INT DIV functions, this operand is the
1678        *  numerator."
1679        */
1680       bool is_int_div = opcode != SHADER_OPCODE_POW;
1681       fs_reg &op0 = is_int_div ? src1 : src0;
1682       fs_reg &op1 = is_int_div ? src0 : src1;
1683
1684       emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1685       inst = emit(opcode, dst, op0, reg_null_f);
1686
1687       inst->base_mrf = base_mrf;
1688       inst->mlen = 2 * dispatch_width / 8;
1689    }
1690    return inst;
1691 }
1692
1693 void
1694 fs_visitor::emit_discard_jump()
1695 {
1696    /* For performance, after a discard, jump to the end of the
1697     * shader if all relevant channels have been discarded.
1698     */
1699    fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1700    discard_jump->flag_subreg = 1;
1701
1702    discard_jump->predicate = (dispatch_width == 8)
1703                              ? BRW_PREDICATE_ALIGN1_ANY8H
1704                              : BRW_PREDICATE_ALIGN1_ANY16H;
1705    discard_jump->predicate_inverse = true;
1706 }
1707
1708 void
1709 fs_visitor::assign_curb_setup()
1710 {
1711    if (dispatch_width == 8) {
1712       prog_data->dispatch_grf_start_reg = payload.num_regs;
1713    } else {
1714       assert(stage == MESA_SHADER_FRAGMENT);
1715       brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1716       prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1717    }
1718
1719    prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1720
1721    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1722    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1723       for (unsigned int i = 0; i < inst->sources; i++) {
1724          if (inst->src[i].file == UNIFORM) {
1725             int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1726             int constant_nr;
1727             if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1728                constant_nr = push_constant_loc[uniform_nr];
1729             } else {
1730                /* Section 5.11 of the OpenGL 4.1 spec says:
1731                 * "Out-of-bounds reads return undefined values, which include
1732                 *  values from other variables of the active program or zero."
1733                 * Just return the first push constant.
1734                 */
1735                constant_nr = 0;
1736             }
1737
1738             struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1739                                                   constant_nr / 8,
1740                                                   constant_nr % 8);
1741
1742             inst->src[i].file = HW_REG;
1743             inst->src[i].fixed_hw_reg = byte_offset(
1744                retype(brw_reg, inst->src[i].type),
1745                inst->src[i].subreg_offset);
1746          }
1747       }
1748    }
1749 }
1750
1751 void
1752 fs_visitor::calculate_urb_setup()
1753 {
1754    assert(stage == MESA_SHADER_FRAGMENT);
1755    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1756    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1757
1758    memset(prog_data->urb_setup, -1,
1759           sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1760
1761    int urb_next = 0;
1762    /* Figure out where each of the incoming setup attributes lands. */
1763    if (brw->gen >= 6) {
1764       if (_mesa_bitcount_64(prog->InputsRead &
1765                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1766          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1767           * first 16 varying inputs, so we can put them wherever we want.
1768           * Just put them in order.
1769           *
1770           * This is useful because it means that (a) inputs not used by the
1771           * fragment shader won't take up valuable register space, and (b) we
1772           * won't have to recompile the fragment shader if it gets paired with
1773           * a different vertex (or geometry) shader.
1774           */
1775          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1776             if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1777                 BITFIELD64_BIT(i)) {
1778                prog_data->urb_setup[i] = urb_next++;
1779             }
1780          }
1781       } else {
1782          /* We have enough input varyings that the SF/SBE pipeline stage can't
1783           * arbitrarily rearrange them to suit our whim; we have to put them
1784           * in an order that matches the output of the previous pipeline stage
1785           * (geometry or vertex shader).
1786           */
1787          struct brw_vue_map prev_stage_vue_map;
1788          brw_compute_vue_map(brw, &prev_stage_vue_map,
1789                              key->input_slots_valid);
1790          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1791          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1792          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1793               slot++) {
1794             int varying = prev_stage_vue_map.slot_to_varying[slot];
1795             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1796              * unused.
1797              */
1798             if (varying != BRW_VARYING_SLOT_COUNT &&
1799                 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1800                  BITFIELD64_BIT(varying))) {
1801                prog_data->urb_setup[varying] = slot - first_slot;
1802             }
1803          }
1804          urb_next = prev_stage_vue_map.num_slots - first_slot;
1805       }
1806    } else {
1807       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1808       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1809          /* Point size is packed into the header, not as a general attribute */
1810          if (i == VARYING_SLOT_PSIZ)
1811             continue;
1812
1813          if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1814             /* The back color slot is skipped when the front color is
1815              * also written to.  In addition, some slots can be
1816              * written in the vertex shader and not read in the
1817              * fragment shader.  So the register number must always be
1818              * incremented, mapped or not.
1819              */
1820             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1821                prog_data->urb_setup[i] = urb_next;
1822             urb_next++;
1823          }
1824       }
1825
1826       /*
1827        * It's a FS only attribute, and we did interpolation for this attribute
1828        * in SF thread. So, count it here, too.
1829        *
1830        * See compile_sf_prog() for more info.
1831        */
1832       if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1833          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1834    }
1835
1836    prog_data->num_varying_inputs = urb_next;
1837 }
1838
1839 void
1840 fs_visitor::assign_urb_setup()
1841 {
1842    assert(stage == MESA_SHADER_FRAGMENT);
1843    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1844
1845    int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1846
1847    /* Offset all the urb_setup[] index by the actual position of the
1848     * setup regs, now that the location of the constants has been chosen.
1849     */
1850    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1851       if (inst->opcode == FS_OPCODE_LINTERP) {
1852          assert(inst->src[2].file == HW_REG);
1853          inst->src[2].fixed_hw_reg.nr += urb_start;
1854       }
1855
1856       if (inst->opcode == FS_OPCODE_CINTERP) {
1857          assert(inst->src[0].file == HW_REG);
1858          inst->src[0].fixed_hw_reg.nr += urb_start;
1859       }
1860    }
1861
1862    /* Each attribute is 4 setup channels, each of which is half a reg. */
1863    this->first_non_payload_grf =
1864       urb_start + prog_data->num_varying_inputs * 2;
1865 }
1866
1867 void
1868 fs_visitor::assign_vs_urb_setup()
1869 {
1870    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1871    int grf, count, slot, channel, attr;
1872
1873    assert(stage == MESA_SHADER_VERTEX);
1874    count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1875    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1876       count++;
1877
1878    /* Each attribute is 4 regs. */
1879    this->first_non_payload_grf =
1880       payload.num_regs + prog_data->curb_read_length + count * 4;
1881
1882    unsigned vue_entries =
1883       MAX2(count, vs_prog_data->base.vue_map.num_slots);
1884
1885    vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1886    vs_prog_data->base.urb_read_length = (count + 1) / 2;
1887
1888    assert(vs_prog_data->base.urb_read_length <= 15);
1889
1890    /* Rewrite all ATTR file references to the hw grf that they land in. */
1891    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1892       for (int i = 0; i < inst->sources; i++) {
1893          if (inst->src[i].file == ATTR) {
1894
1895             if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1896                slot = count - 1;
1897             } else {
1898                /* Attributes come in in a contiguous block, ordered by their
1899                 * gl_vert_attrib value.  That means we can compute the slot
1900                 * number for an attribute by masking out the enabled
1901                 * attributes before it and counting the bits.
1902                 */
1903                attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1904                slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1905                                         BITFIELD64_MASK(attr));
1906             }
1907
1908             channel = inst->src[i].reg_offset & 3;
1909
1910             grf = payload.num_regs +
1911                prog_data->curb_read_length +
1912                slot * 4 + channel;
1913
1914             inst->src[i].file = HW_REG;
1915             inst->src[i].fixed_hw_reg =
1916                retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1917          }
1918       }
1919    }
1920 }
1921
1922 /**
1923  * Split large virtual GRFs into separate components if we can.
1924  *
1925  * This is mostly duplicated with what brw_fs_vector_splitting does,
1926  * but that's really conservative because it's afraid of doing
1927  * splitting that doesn't result in real progress after the rest of
1928  * the optimization phases, which would cause infinite looping in
1929  * optimization.  We can do it once here, safely.  This also has the
1930  * opportunity to split interpolated values, or maybe even uniforms,
1931  * which we don't have at the IR level.
1932  *
1933  * We want to split, because virtual GRFs are what we register
1934  * allocate and spill (due to contiguousness requirements for some
1935  * instructions), and they're what we naturally generate in the
1936  * codegen process, but most virtual GRFs don't actually need to be
1937  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1938  * live intervals and better dead code elimination and coalescing.
1939  */
1940 void
1941 fs_visitor::split_virtual_grfs()
1942 {
1943    int num_vars = this->alloc.count;
1944
1945    /* Count the total number of registers */
1946    int reg_count = 0;
1947    int vgrf_to_reg[num_vars];
1948    for (int i = 0; i < num_vars; i++) {
1949       vgrf_to_reg[i] = reg_count;
1950       reg_count += alloc.sizes[i];
1951    }
1952
1953    /* An array of "split points".  For each register slot, this indicates
1954     * if this slot can be separated from the previous slot.  Every time an
1955     * instruction uses multiple elements of a register (as a source or
1956     * destination), we mark the used slots as inseparable.  Then we go
1957     * through and split the registers into the smallest pieces we can.
1958     */
1959    bool split_points[reg_count];
1960    memset(split_points, 0, sizeof(split_points));
1961
1962    /* Mark all used registers as fully splittable */
1963    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1964       if (inst->dst.file == GRF) {
1965          int reg = vgrf_to_reg[inst->dst.reg];
1966          for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1967             split_points[reg + j] = true;
1968       }
1969
1970       for (int i = 0; i < inst->sources; i++) {
1971          if (inst->src[i].file == GRF) {
1972             int reg = vgrf_to_reg[inst->src[i].reg];
1973             for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1974                split_points[reg + j] = true;
1975          }
1976       }
1977    }
1978
1979    if (brw->has_pln &&
1980        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1981       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1982        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1983        * Gen6, that was the only supported interpolation mode, and since Gen6,
1984        * delta_x and delta_y are in fixed hardware registers.
1985        */
1986       int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1987       split_points[vgrf_to_reg[vgrf] + 1] = false;
1988    }
1989
1990    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1991       if (inst->dst.file == GRF) {
1992          int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1993          for (int j = 1; j < inst->regs_written; j++)
1994             split_points[reg + j] = false;
1995       }
1996       for (int i = 0; i < inst->sources; i++) {
1997          if (inst->src[i].file == GRF) {
1998             int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1999             for (int j = 1; j < inst->regs_read(i); j++)
2000                split_points[reg + j] = false;
2001          }
2002       }
2003    }
2004
2005    int new_virtual_grf[reg_count];
2006    int new_reg_offset[reg_count];
2007
2008    int reg = 0;
2009    for (int i = 0; i < num_vars; i++) {
2010       /* The first one should always be 0 as a quick sanity check. */
2011       assert(split_points[reg] == false);
2012
2013       /* j = 0 case */
2014       new_reg_offset[reg] = 0;
2015       reg++;
2016       int offset = 1;
2017
2018       /* j > 0 case */
2019       for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2020          /* If this is a split point, reset the offset to 0 and allocate a
2021           * new virtual GRF for the previous offset many registers
2022           */
2023          if (split_points[reg]) {
2024             assert(offset <= MAX_VGRF_SIZE);
2025             int grf = alloc.allocate(offset);
2026             for (int k = reg - offset; k < reg; k++)
2027                new_virtual_grf[k] = grf;
2028             offset = 0;
2029          }
2030          new_reg_offset[reg] = offset;
2031          offset++;
2032          reg++;
2033       }
2034
2035       /* The last one gets the original register number */
2036       assert(offset <= MAX_VGRF_SIZE);
2037       alloc.sizes[i] = offset;
2038       for (int k = reg - offset; k < reg; k++)
2039          new_virtual_grf[k] = i;
2040    }
2041    assert(reg == reg_count);
2042
2043    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2044       if (inst->dst.file == GRF) {
2045          reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2046          inst->dst.reg = new_virtual_grf[reg];
2047          inst->dst.reg_offset = new_reg_offset[reg];
2048          assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2049       }
2050       for (int i = 0; i < inst->sources; i++) {
2051          if (inst->src[i].file == GRF) {
2052             reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2053             inst->src[i].reg = new_virtual_grf[reg];
2054             inst->src[i].reg_offset = new_reg_offset[reg];
2055             assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2056          }
2057       }
2058    }
2059    invalidate_live_intervals();
2060 }
2061
2062 /**
2063  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2064  *
2065  * During code generation, we create tons of temporary variables, many of
2066  * which get immediately killed and are never used again.  Yet, in later
2067  * optimization and analysis passes, such as compute_live_intervals, we need
2068  * to loop over all the virtual GRFs.  Compacting them can save a lot of
2069  * overhead.
2070  */
2071 bool
2072 fs_visitor::compact_virtual_grfs()
2073 {
2074    bool progress = false;
2075    int remap_table[this->alloc.count];
2076    memset(remap_table, -1, sizeof(remap_table));
2077
2078    /* Mark which virtual GRFs are used. */
2079    foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2080       if (inst->dst.file == GRF)
2081          remap_table[inst->dst.reg] = 0;
2082
2083       for (int i = 0; i < inst->sources; i++) {
2084          if (inst->src[i].file == GRF)
2085             remap_table[inst->src[i].reg] = 0;
2086       }
2087    }
2088
2089    /* Compact the GRF arrays. */
2090    int new_index = 0;
2091    for (unsigned i = 0; i < this->alloc.count; i++) {
2092       if (remap_table[i] == -1) {
2093          /* We just found an unused register.  This means that we are
2094           * actually going to compact something.
2095           */
2096          progress = true;
2097       } else {
2098          remap_table[i] = new_index;
2099          alloc.sizes[new_index] = alloc.sizes[i];
2100          invalidate_live_intervals();
2101          ++new_index;
2102       }
2103    }
2104
2105    this->alloc.count = new_index;
2106
2107    /* Patch all the instructions to use the newly renumbered registers */
2108    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2109       if (inst->dst.file == GRF)
2110          inst->dst.reg = remap_table[inst->dst.reg];
2111
2112       for (int i = 0; i < inst->sources; i++) {
2113          if (inst->src[i].file == GRF)
2114             inst->src[i].reg = remap_table[inst->src[i].reg];
2115       }
2116    }
2117
2118    /* Patch all the references to delta_x/delta_y, since they're used in
2119     * register allocation.  If they're unused, switch them to BAD_FILE so
2120     * we don't think some random VGRF is delta_x/delta_y.
2121     */
2122    for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2123       if (delta_x[i].file == GRF) {
2124          if (remap_table[delta_x[i].reg] != -1) {
2125             delta_x[i].reg = remap_table[delta_x[i].reg];
2126          } else {
2127             delta_x[i].file = BAD_FILE;
2128          }
2129       }
2130    }
2131    for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2132       if (delta_y[i].file == GRF) {
2133          if (remap_table[delta_y[i].reg] != -1) {
2134             delta_y[i].reg = remap_table[delta_y[i].reg];
2135          } else {
2136             delta_y[i].file = BAD_FILE;
2137          }
2138       }
2139    }
2140
2141    return progress;
2142 }
2143
2144 /*
2145  * Implements array access of uniforms by inserting a
2146  * PULL_CONSTANT_LOAD instruction.
2147  *
2148  * Unlike temporary GRF array access (where we don't support it due to
2149  * the difficulty of doing relative addressing on instruction
2150  * destinations), we could potentially do array access of uniforms
2151  * that were loaded in GRF space as push constants.  In real-world
2152  * usage we've seen, though, the arrays being used are always larger
2153  * than we could load as push constants, so just always move all
2154  * uniform array access out to a pull constant buffer.
2155  */
2156 void
2157 fs_visitor::move_uniform_array_access_to_pull_constants()
2158 {
2159    if (dispatch_width != 8)
2160       return;
2161
2162    pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2163    memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2164
2165    /* Walk through and find array access of uniforms.  Put a copy of that
2166     * uniform in the pull constant buffer.
2167     *
2168     * Note that we don't move constant-indexed accesses to arrays.  No
2169     * testing has been done of the performance impact of this choice.
2170     */
2171    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2172       for (int i = 0 ; i < inst->sources; i++) {
2173          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2174             continue;
2175
2176          int uniform = inst->src[i].reg;
2177
2178          /* If this array isn't already present in the pull constant buffer,
2179           * add it.
2180           */
2181          if (pull_constant_loc[uniform] == -1) {
2182             const gl_constant_value **values = &stage_prog_data->param[uniform];
2183
2184             assert(param_size[uniform]);
2185
2186             for (int j = 0; j < param_size[uniform]; j++) {
2187                pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2188
2189                stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2190                   values[j];
2191             }
2192          }
2193       }
2194    }
2195 }
2196
2197 /**
2198  * Assign UNIFORM file registers to either push constants or pull constants.
2199  *
2200  * We allow a fragment shader to have more than the specified minimum
2201  * maximum number of fragment shader uniform components (64).  If
2202  * there are too many of these, they'd fill up all of register space.
2203  * So, this will push some of them out to the pull constant buffer and
2204  * update the program to load them.
2205  */
2206 void
2207 fs_visitor::assign_constant_locations()
2208 {
2209    /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2210    if (dispatch_width != 8)
2211       return;
2212
2213    /* Find which UNIFORM registers are still in use. */
2214    bool is_live[uniforms];
2215    for (unsigned int i = 0; i < uniforms; i++) {
2216       is_live[i] = false;
2217    }
2218
2219    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2220       for (int i = 0; i < inst->sources; i++) {
2221          if (inst->src[i].file != UNIFORM)
2222             continue;
2223
2224          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2225          if (constant_nr >= 0 && constant_nr < (int) uniforms)
2226             is_live[constant_nr] = true;
2227       }
2228    }
2229
2230    /* Only allow 16 registers (128 uniform components) as push constants.
2231     *
2232     * Just demote the end of the list.  We could probably do better
2233     * here, demoting things that are rarely used in the program first.
2234     *
2235     * If changing this value, note the limitation about total_regs in
2236     * brw_curbe.c.
2237     */
2238    unsigned int max_push_components = 16 * 8;
2239    unsigned int num_push_constants = 0;
2240
2241    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2242
2243    for (unsigned int i = 0; i < uniforms; i++) {
2244       if (!is_live[i] || pull_constant_loc[i] != -1) {
2245          /* This UNIFORM register is either dead, or has already been demoted
2246           * to a pull const.  Mark it as no longer living in the param[] array.
2247           */
2248          push_constant_loc[i] = -1;
2249          continue;
2250       }
2251
2252       if (num_push_constants < max_push_components) {
2253          /* Retain as a push constant.  Record the location in the params[]
2254           * array.
2255           */
2256          push_constant_loc[i] = num_push_constants++;
2257       } else {
2258          /* Demote to a pull constant. */
2259          push_constant_loc[i] = -1;
2260
2261          int pull_index = stage_prog_data->nr_pull_params++;
2262          stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2263          pull_constant_loc[i] = pull_index;
2264       }
2265    }
2266
2267    stage_prog_data->nr_params = num_push_constants;
2268
2269    /* Up until now, the param[] array has been indexed by reg + reg_offset
2270     * of UNIFORM registers.  Condense it to only contain the uniforms we
2271     * chose to upload as push constants.
2272     */
2273    for (unsigned int i = 0; i < uniforms; i++) {
2274       int remapped = push_constant_loc[i];
2275
2276       if (remapped == -1)
2277          continue;
2278
2279       assert(remapped <= (int)i);
2280       stage_prog_data->param[remapped] = stage_prog_data->param[i];
2281    }
2282 }
2283
2284 /**
2285  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2286  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2287  */
2288 void
2289 fs_visitor::demote_pull_constants()
2290 {
2291    foreach_block_and_inst (block, fs_inst, inst, cfg) {
2292       for (int i = 0; i < inst->sources; i++) {
2293          if (inst->src[i].file != UNIFORM)
2294             continue;
2295
2296          int pull_index;
2297          unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2298          if (location >= uniforms) /* Out of bounds access */
2299             pull_index = -1;
2300          else
2301             pull_index = pull_constant_loc[location];
2302
2303          if (pull_index == -1)
2304             continue;
2305
2306          /* Set up the annotation tracking for new generated instructions. */
2307          base_ir = inst->ir;
2308          current_annotation = inst->annotation;
2309
2310          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2311          fs_reg dst = vgrf(glsl_type::float_type);
2312
2313          /* Generate a pull load into dst. */
2314          if (inst->src[i].reladdr) {
2315             exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2316                                                         surf_index,
2317                                                         *inst->src[i].reladdr,
2318                                                         pull_index);
2319             inst->insert_before(block, &list);
2320             inst->src[i].reladdr = NULL;
2321          } else {
2322             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2323             fs_inst *pull =
2324                new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2325                                     dst, surf_index, offset);
2326             inst->insert_before(block, pull);
2327             inst->src[i].set_smear(pull_index & 3);
2328          }
2329
2330          /* Rewrite the instruction to use the temporary VGRF. */
2331          inst->src[i].file = GRF;
2332          inst->src[i].reg = dst.reg;
2333          inst->src[i].reg_offset = 0;
2334          inst->src[i].width = dispatch_width;
2335       }
2336    }
2337    invalidate_live_intervals();
2338 }
2339
2340 bool
2341 fs_visitor::opt_algebraic()
2342 {
2343    bool progress = false;
2344
2345    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2346       switch (inst->opcode) {
2347       case BRW_OPCODE_MOV:
2348          if (inst->src[0].file != IMM)
2349             break;
2350
2351          if (inst->saturate) {
2352             if (inst->dst.type != inst->src[0].type)
2353                assert(!"unimplemented: saturate mixed types");
2354
2355             if (brw_saturate_immediate(inst->dst.type,
2356                                        &inst->src[0].fixed_hw_reg)) {
2357                inst->saturate = false;
2358                progress = true;
2359             }
2360          }
2361          break;
2362
2363       case BRW_OPCODE_MUL:
2364          if (inst->src[1].file != IMM)
2365             continue;
2366
2367          /* a * 1.0 = a */
2368          if (inst->src[1].is_one()) {
2369             inst->opcode = BRW_OPCODE_MOV;
2370             inst->src[1] = reg_undef;
2371             progress = true;
2372             break;
2373          }
2374
2375          /* a * -1.0 = -a */
2376          if (inst->src[1].is_negative_one()) {
2377             inst->opcode = BRW_OPCODE_MOV;
2378             inst->src[0].negate = !inst->src[0].negate;
2379             inst->src[1] = reg_undef;
2380             progress = true;
2381             break;
2382          }
2383
2384          /* a * 0.0 = 0.0 */
2385          if (inst->src[1].is_zero()) {
2386             inst->opcode = BRW_OPCODE_MOV;
2387             inst->src[0] = inst->src[1];
2388             inst->src[1] = reg_undef;
2389             progress = true;
2390             break;
2391          }
2392
2393          if (inst->src[0].file == IMM) {
2394             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2395             inst->opcode = BRW_OPCODE_MOV;
2396             inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2397             inst->src[1] = reg_undef;
2398             progress = true;
2399             break;
2400          }
2401          break;
2402       case BRW_OPCODE_ADD:
2403          if (inst->src[1].file != IMM)
2404             continue;
2405
2406          /* a + 0.0 = a */
2407          if (inst->src[1].is_zero()) {
2408             inst->opcode = BRW_OPCODE_MOV;
2409             inst->src[1] = reg_undef;
2410             progress = true;
2411             break;
2412          }
2413
2414          if (inst->src[0].file == IMM) {
2415             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2416             inst->opcode = BRW_OPCODE_MOV;
2417             inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2418             inst->src[1] = reg_undef;
2419             progress = true;
2420             break;
2421          }
2422          break;
2423       case BRW_OPCODE_OR:
2424          if (inst->src[0].equals(inst->src[1])) {
2425             inst->opcode = BRW_OPCODE_MOV;
2426             inst->src[1] = reg_undef;
2427             progress = true;
2428             break;
2429          }
2430          break;
2431       case BRW_OPCODE_LRP:
2432          if (inst->src[1].equals(inst->src[2])) {
2433             inst->opcode = BRW_OPCODE_MOV;
2434             inst->src[0] = inst->src[1];
2435             inst->src[1] = reg_undef;
2436             inst->src[2] = reg_undef;
2437             progress = true;
2438             break;
2439          }
2440          break;
2441       case BRW_OPCODE_CMP:
2442          if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2443              inst->src[0].abs &&
2444              inst->src[0].negate &&
2445              inst->src[1].is_zero()) {
2446             inst->src[0].abs = false;
2447             inst->src[0].negate = false;
2448             inst->conditional_mod = BRW_CONDITIONAL_Z;
2449             progress = true;
2450             break;
2451          }
2452          break;
2453       case BRW_OPCODE_SEL:
2454          if (inst->src[0].equals(inst->src[1])) {
2455             inst->opcode = BRW_OPCODE_MOV;
2456             inst->src[1] = reg_undef;
2457             inst->predicate = BRW_PREDICATE_NONE;
2458             inst->predicate_inverse = false;
2459             progress = true;
2460          } else if (inst->saturate && inst->src[1].file == IMM) {
2461             switch (inst->conditional_mod) {
2462             case BRW_CONDITIONAL_LE:
2463             case BRW_CONDITIONAL_L:
2464                switch (inst->src[1].type) {
2465                case BRW_REGISTER_TYPE_F:
2466                   if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2467                      inst->opcode = BRW_OPCODE_MOV;
2468                      inst->src[1] = reg_undef;
2469                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2470                      progress = true;
2471                   }
2472                   break;
2473                default:
2474                   break;
2475                }
2476                break;
2477             case BRW_CONDITIONAL_GE:
2478             case BRW_CONDITIONAL_G:
2479                switch (inst->src[1].type) {
2480                case BRW_REGISTER_TYPE_F:
2481                   if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2482                      inst->opcode = BRW_OPCODE_MOV;
2483                      inst->src[1] = reg_undef;
2484                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2485                      progress = true;
2486                   }
2487                   break;
2488                default:
2489                   break;
2490                }
2491             default:
2492                break;
2493             }
2494          }
2495          break;
2496       case BRW_OPCODE_MAD:
2497          if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2498             inst->opcode = BRW_OPCODE_MOV;
2499             inst->src[1] = reg_undef;
2500             inst->src[2] = reg_undef;
2501             progress = true;
2502          } else if (inst->src[0].is_zero()) {
2503             inst->opcode = BRW_OPCODE_MUL;
2504             inst->src[0] = inst->src[2];
2505             inst->src[2] = reg_undef;
2506             progress = true;
2507          } else if (inst->src[1].is_one()) {
2508             inst->opcode = BRW_OPCODE_ADD;
2509             inst->src[1] = inst->src[2];
2510             inst->src[2] = reg_undef;
2511             progress = true;
2512          } else if (inst->src[2].is_one()) {
2513             inst->opcode = BRW_OPCODE_ADD;
2514             inst->src[2] = reg_undef;
2515             progress = true;
2516          } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2517             inst->opcode = BRW_OPCODE_ADD;
2518             inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2519             inst->src[2] = reg_undef;
2520             progress = true;
2521          }
2522          break;
2523       case SHADER_OPCODE_RCP: {
2524          fs_inst *prev = (fs_inst *)inst->prev;
2525          if (prev->opcode == SHADER_OPCODE_SQRT) {
2526             if (inst->src[0].equals(prev->dst)) {
2527                inst->opcode = SHADER_OPCODE_RSQ;
2528                inst->src[0] = prev->src[0];
2529                progress = true;
2530             }
2531          }
2532          break;
2533       }
2534       default:
2535          break;
2536       }
2537
2538       /* Swap if src[0] is immediate. */
2539       if (progress && inst->is_commutative()) {
2540          if (inst->src[0].file == IMM) {
2541             fs_reg tmp = inst->src[1];
2542             inst->src[1] = inst->src[0];
2543             inst->src[0] = tmp;
2544          }
2545       }
2546    }
2547    return progress;
2548 }
2549
2550 bool
2551 fs_visitor::opt_register_renaming()
2552 {
2553    bool progress = false;
2554    int depth = 0;
2555
2556    int remap[alloc.count];
2557    memset(remap, -1, sizeof(int) * alloc.count);
2558
2559    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2560       if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2561          depth++;
2562       } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2563                  inst->opcode == BRW_OPCODE_WHILE) {
2564          depth--;
2565       }
2566
2567       /* Rewrite instruction sources. */
2568       for (int i = 0; i < inst->sources; i++) {
2569          if (inst->src[i].file == GRF &&
2570              remap[inst->src[i].reg] != -1 &&
2571              remap[inst->src[i].reg] != inst->src[i].reg) {
2572             inst->src[i].reg = remap[inst->src[i].reg];
2573             progress = true;
2574          }
2575       }
2576
2577       const int dst = inst->dst.reg;
2578
2579       if (depth == 0 &&
2580           inst->dst.file == GRF &&
2581           alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2582           !inst->is_partial_write()) {
2583          if (remap[dst] == -1) {
2584             remap[dst] = dst;
2585          } else {
2586             remap[dst] = alloc.allocate(inst->dst.width / 8);
2587             inst->dst.reg = remap[dst];
2588             progress = true;
2589          }
2590       } else if (inst->dst.file == GRF &&
2591                  remap[dst] != -1 &&
2592                  remap[dst] != dst) {
2593          inst->dst.reg = remap[dst];
2594          progress = true;
2595       }
2596    }
2597
2598    if (progress) {
2599       invalidate_live_intervals();
2600
2601       for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2602          if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2603             delta_x[i].reg = remap[delta_x[i].reg];
2604          }
2605       }
2606       for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2607          if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2608             delta_y[i].reg = remap[delta_y[i].reg];
2609          }
2610       }
2611    }
2612
2613    return progress;
2614 }
2615
2616 /**
2617  * Remove redundant or useless discard jumps.
2618  *
2619  * For example, we can eliminate jumps in the following sequence:
2620  *
2621  * discard-jump       (redundant with the next jump)
2622  * discard-jump       (useless; jumps to the next instruction)
2623  * placeholder-halt
2624  */
2625 bool
2626 fs_visitor::opt_redundant_discard_jumps()
2627 {
2628    bool progress = false;
2629
2630    bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2631
2632    fs_inst *placeholder_halt = NULL;
2633    foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2634       if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2635          placeholder_halt = inst;
2636          break;
2637       }
2638    }
2639
2640    if (!placeholder_halt)
2641       return false;
2642
2643    /* Delete any HALTs immediately before the placeholder halt. */
2644    for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2645         !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2646         prev = (fs_inst *) placeholder_halt->prev) {
2647       prev->remove(last_bblock);
2648       progress = true;
2649    }
2650
2651    if (progress)
2652       invalidate_live_intervals();
2653
2654    return progress;
2655 }
2656
/**
 * Tries to get rid of GRF-to-MRF MOVs by making the instruction that
 * produced the GRF value write to the MRF directly.
 *
 * For every qualifying MOV, the enclosing block is scanned backwards for the
 * instruction that last wrote the MOV's source GRF.  If that instruction can
 * legally target an MRF and nothing in between gets in the way, its
 * destination is redirected to the MRF and the MOV is removed.
 *
 * Does nothing on Gen >= 7.
 *
 * Returns true if at least one MOV was removed.
 */
bool
fs_visitor::compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   /* No MRFs on Gen >= 7. */
   if (brw->gen >= 7)
      return false;

   calculate_live_intervals();

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      /* ip counts every instruction visited so far, including MOVs this pass
       * later removes.  It is compared against virtual_grf_end[] below.
       */
      int ip = next_ip;
      next_ip++;

      /* Only complete MOVs from a GRF to an MRF of the same type qualify,
       * and only when the source has no abs/negate modifier, is contiguous
       * and has no sub-register offset.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->is_partial_write() ||
          inst->dst.file != MRF || inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate ||
          !inst->src[0].is_contiguous() ||
          inst->src[0].subreg_offset)
         continue;

      /* Work out which hardware MRF registers are written by this
       * instruction.
       */
      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
      int mrf_high;
      if (inst->dst.reg & BRW_MRF_COMPR4) {
         /* With the COMPR4 bit set the second register is 4 above the first. */
         mrf_high = mrf_low + 4;
      } else if (inst->exec_size == 16) {
         mrf_high = mrf_low + 1;
      } else {
         mrf_high = mrf_low;
      }

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_end[inst->src[0].reg] > ip)
         continue;

      /* Found a move of a GRF to a MRF.  Let's see if we can go
       * rewrite the thing that made this GRF to write into the MRF.
       */
      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg) {
            /* Found the last thing to write our reg we want to turn
             * into a compute-to-MRF.
             */

            /* If this one instruction didn't populate all the
             * channels, bail.  We might be able to rewrite everything
             * that writes that reg, but it would require smarter
             * tracking to delay the rewriting until complete success.
             */
            if (scan_inst->is_partial_write())
               break;

            /* Things returning more than one register would need us to
             * understand coalescing out more than one MOV at a time.
             */
            if (scan_inst->regs_written > scan_inst->dst.width / 8)
               break;

            /* SEND instructions can't have MRF as a destination. */
            if (scan_inst->mlen)
               break;

            if (brw->gen == 6) {
               /* gen6 math instructions must have the destination be
                * GRF, so no compute-to-MRF for them.
                */
               if (scan_inst->is_math()) {
                  break;
               }
            }

            if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               /* Found the creator of our MRF's source value. */
               scan_inst->dst.file = MRF;
               scan_inst->dst.reg = inst->dst.reg;
               /* The MOV's saturate is carried over to the producer. */
               scan_inst->saturate |= inst->saturate;
               inst->remove(block);
               progress = true;
            }
            /* Whether or not the offsets matched, the scan stops at the
             * first writer of the source GRF.
             */
            break;
         }

         /* We don't handle control flow here.  Most computation of
          * values that end up in MRFs are shortly before the MRF
          * write anyway.
          */
         if (block->start() == scan_inst)
            break;

         /* You can't read from an MRF, so if someone else reads our
          * MRF's source GRF that we wanted to rewrite, that stops us.
          */
         bool interfered = false;
         for (int i = 0; i < scan_inst->sources; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         if (scan_inst->dst.file == MRF) {
            /* If somebody else writes our MRF here, we can't
             * compute-to-MRF before that.
             */
            int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
            int scan_mrf_high;

            /* Same low/high computation as for the MOV above. */
            if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
               scan_mrf_high = scan_mrf_low + 4;
            } else if (scan_inst->exec_size == 16) {
               scan_mrf_high = scan_mrf_low + 1;
            } else {
               scan_mrf_high = scan_mrf_low;
            }

            /* Any shared register between the two low/high pairs is a
             * conflict.
             */
            if (mrf_low == scan_mrf_low ||
                mrf_low == scan_mrf_high ||
                mrf_high == scan_mrf_low ||
                mrf_high == scan_mrf_high) {
               break;
            }
         }

         if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1.  Don't go pushing our MRF write up
             * above it.
             */
            if (mrf_low >= scan_inst->base_mrf &&
                mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
            if (mrf_high >= scan_inst->base_mrf &&
                mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         }
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
2816
2817 /**
2818  * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2819  * instructions to FS_OPCODE_REP_FB_WRITE.
2820  */
2821 void
2822 fs_visitor::emit_repclear_shader()
2823 {
2824    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2825    int base_mrf = 1;
2826    int color_mrf = base_mrf + 2;
2827
2828    fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2829                            fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2830    mov->force_writemask_all = true;
2831
2832    fs_inst *write;
2833    if (key->nr_color_regions == 1) {
2834       write = emit(FS_OPCODE_REP_FB_WRITE);
2835       write->saturate = key->clamp_fragment_color;
2836       write->base_mrf = color_mrf;
2837       write->target = 0;
2838       write->header_present = false;
2839       write->mlen = 1;
2840    } else {
2841       assume(key->nr_color_regions > 0);
2842       for (int i = 0; i < key->nr_color_regions; ++i) {
2843          write = emit(FS_OPCODE_REP_FB_WRITE);
2844          write->saturate = key->clamp_fragment_color;
2845          write->base_mrf = base_mrf;
2846          write->target = i;
2847          write->header_present = true;
2848          write->mlen = 3;
2849       }
2850    }
2851    write->eot = true;
2852
2853    calculate_cfg();
2854
2855    assign_constant_locations();
2856    assign_curb_setup();
2857
2858    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2859    assert(mov->src[0].file == HW_REG);
2860    mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2861 }
2862
2863 /**
2864  * Walks through basic blocks, looking for repeated MRF writes and
2865  * removing the later ones.
2866  */
2867 bool
2868 fs_visitor::remove_duplicate_mrf_writes()
2869 {
2870    fs_inst *last_mrf_move[16];
2871    bool progress = false;
2872
2873    /* Need to update the MRF tracking for compressed instructions. */
2874    if (dispatch_width == 16)
2875       return false;
2876
2877    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2878
2879    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2880       if (inst->is_control_flow()) {
2881          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2882       }
2883
2884       if (inst->opcode == BRW_OPCODE_MOV &&
2885           inst->dst.file == MRF) {
2886          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2887          if (prev_inst && inst->equals(prev_inst)) {
2888             inst->remove(block);
2889             progress = true;
2890             continue;
2891          }
2892       }
2893
2894       /* Clear out the last-write records for MRFs that were overwritten. */
2895       if (inst->dst.file == MRF) {
2896          last_mrf_move[inst->dst.reg] = NULL;
2897       }
2898
2899       if (inst->mlen > 0 && inst->base_mrf != -1) {
2900          /* Found a SEND instruction, which will include two or fewer
2901           * implied MRF writes.  We could do better here.
2902           */
2903          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2904             last_mrf_move[inst->base_mrf + i] = NULL;
2905          }
2906       }
2907
2908       /* Clear out any MRF move records whose sources got overwritten. */
2909       if (inst->dst.file == GRF) {
2910          for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2911             if (last_mrf_move[i] &&
2912                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2913                last_mrf_move[i] = NULL;
2914             }
2915          }
2916       }
2917
2918       if (inst->opcode == BRW_OPCODE_MOV &&
2919           inst->dst.file == MRF &&
2920           inst->src[0].file == GRF &&
2921           !inst->is_partial_write()) {
2922          last_mrf_move[inst->dst.reg] = inst;
2923       }
2924    }
2925
2926    if (progress)
2927       invalidate_live_intervals();
2928
2929    return progress;
2930 }
2931
2932 static void
2933 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2934 {
2935    /* Clear the flag for registers that actually got read (as expected). */
2936    for (int i = 0; i < inst->sources; i++) {
2937       int grf;
2938       if (inst->src[i].file == GRF) {
2939          grf = inst->src[i].reg;
2940       } else if (inst->src[i].file == HW_REG &&
2941                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2942          grf = inst->src[i].fixed_hw_reg.nr;
2943       } else {
2944          continue;
2945       }
2946
2947       if (grf >= first_grf &&
2948           grf < first_grf + grf_len) {
2949          deps[grf - first_grf] = false;
2950          if (inst->exec_size == 16)
2951             deps[grf - first_grf + 1] = false;
2952       }
2953    }
2954 }
2955
/**
 * Implements this workaround for the original 965:
 *
 *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
 *      check for post destination dependencies on this instruction, software
 *      must ensure that there is no destination hazard for the case of ‘write
 *      followed by a posted write’ shown in the following example.
 *
 *      1. mov r3 0
 *      2. send r3.xy <rest of send instruction>
 *      3. mov r2 r3
 *
 *      Due to no post-destination dependency check on the ‘send’, the above
 *      code sequence could have two instructions (1 and 2) in flight at the
 *      same time that both consider ‘r3’ as the target of their final writes."
 *
 * block is the basic block containing inst, and inst is the instruction whose
 * destination registers (inst->dst.reg .. + inst->regs_written - 1) are
 * checked.  For each of those registers that an earlier instruction in the
 * block wrote without it being read since, a DEP_RESOLVE_MOV of that register
 * is inserted immediately before inst.
 */
void
fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
                                                        fs_inst *inst)
{
   int write_len = inst->regs_written;
   int first_write_grf = inst->dst.reg;
   /* needs_dep[k] is true while register first_write_grf + k may still have
    * an unresolved earlier write.
    */
   bool needs_dep[BRW_MAX_MRF];
   /* The margin presumably leaves room for the "+ 1" entries that are
    * cleared for exec_size 16 instructions below.
    */
   assert(write_len < (int)sizeof(needs_dep) - 1);

   /* Start with every register written by inst flagged, the rest clear. */
   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);

   /* Registers that inst itself reads need no extra resolve. */
   clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);

   /* Walk backwards looking for writes to registers we're writing which
    * aren't read since being written.  If we hit the start of the program,
    * we assume that there are no outstanding dependencies on entry to the
    * program.
    *
    * NOTE(review): the block->start() test below does not distinguish the
    * first block of the program from any other block; confirm this matches
    * the intent stated above.
    */
   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
      /* If we hit control flow, assume that there *are* outstanding
       * dependencies, and force their cleanup before our instruction.
       *
       * "Control flow" here means reaching the first instruction of the
       * block; that instruction itself is not examined.
       */
      if (block->start() == scan_inst) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i]) {
               inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
            }
         }
         return;
      }

      /* We insert our reads as late as possible on the assumption that any
       * instruction but a MOV that might have left us an outstanding
       * dependency has more latency than a MOV.
       */
      if (scan_inst->dst.file == GRF) {
         /* Check each register scan_inst writes against the flagged range. */
         for (int i = 0; i < scan_inst->regs_written; i++) {
            int reg = scan_inst->dst.reg + i;

            if (reg >= first_write_grf &&
                reg < first_write_grf + write_len &&
                needs_dep[reg - first_write_grf]) {
               inst->insert_before(block, DEP_RESOLVE_MOV(reg));
               needs_dep[reg - first_write_grf] = false;
               /* For a 16-wide writer the following register's flag is
                * cleared as well, without a MOV of its own.
                */
               if (scan_inst->exec_size == 16)
                  needs_dep[reg - first_write_grf + 1] = false;
            }
         }
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}
3036
3037 /**
3038  * Implements this workaround for the original 965:
3039  *
3040  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
3041  *      used as a destination register until after it has been sourced by an
3042  *      instruction with a different destination register.
3043  */
3044 void
3045 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3046 {
3047    int write_len = inst->regs_written;
3048    int first_write_grf = inst->dst.reg;
3049    bool needs_dep[BRW_MAX_MRF];
3050    assert(write_len < (int)sizeof(needs_dep) - 1);
3051
3052    memset(needs_dep, false, sizeof(needs_dep));
3053    memset(needs_dep, true, write_len);
3054    /* Walk forwards looking for writes to registers we're writing which aren't
3055     * read before being written.
3056     */
3057    foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3058       /* If we hit control flow, force resolve all remaining dependencies. */
3059       if (block->end() == scan_inst) {
3060          for (int i = 0; i < write_len; i++) {
3061             if (needs_dep[i])
3062                scan_inst->insert_before(block,
3063                                         DEP_RESOLVE_MOV(first_write_grf + i));
3064          }
3065          return;
3066       }
3067
3068       /* Clear the flag for registers that actually got read (as expected). */
3069       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3070
3071       /* We insert our reads as late as possible since they're reading the
3072        * result of a SEND, which has massive latency.
3073        */
3074       if (scan_inst->dst.file == GRF &&
3075           scan_inst->dst.reg >= first_write_grf &&
3076           scan_inst->dst.reg < first_write_grf + write_len &&
3077           needs_dep[scan_inst->dst.reg - first_write_grf]) {
3078          scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3079          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3080       }
3081
3082       /* Continue the loop only if we haven't resolved all the dependencies */
3083       int i;
3084       for (i = 0; i < write_len; i++) {
3085          if (needs_dep[i])
3086             break;
3087       }
3088       if (i == write_len)
3089          return;
3090    }
3091 }
3092
3093 void
3094 fs_visitor::insert_gen4_send_dependency_workarounds()
3095 {
3096    if (brw->gen != 4 || brw->is_g4x)
3097       return;
3098
3099    bool progress = false;
3100
3101    /* Note that we're done with register allocation, so GRF fs_regs always
3102     * have a .reg_offset of 0.
3103     */
3104
3105    foreach_block_and_inst(block, fs_inst, inst, cfg) {
3106       if (inst->mlen != 0 && inst->dst.file == GRF) {
3107          insert_gen4_pre_send_dependency_workarounds(block, inst);
3108          insert_gen4_post_send_dependency_workarounds(block, inst);
3109          progress = true;
3110       }
3111    }
3112
3113    if (progress)
3114       invalidate_live_intervals();
3115 }
3116
3117 /**
3118  * Turns the generic expression-style uniform pull constant load instruction
3119  * into a hardware-specific series of instructions for loading a pull
3120  * constant.
3121  *
3122  * The expression style allows the CSE pass before this to optimize out
3123  * repeated loads from the same offset, and gives the pre-register-allocation
3124  * scheduling full flexibility, while the conversion to native instructions
3125  * allows the post-register-allocation scheduler the best information
3126  * possible.
3127  *
3128  * Note that execution masking for setting up pull constant loads is special:
3129  * the channels that need to be written are unrelated to the current execution
3130  * mask, since a later instruction will use one of the result channels as a
3131  * source operand for all 8 or 16 of its channels.
3132  */
3133 void
3134 fs_visitor::lower_uniform_pull_constant_loads()
3135 {
3136    foreach_block_and_inst (block, fs_inst, inst, cfg) {
3137       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3138          continue;
3139
3140       if (brw->gen >= 7) {
3141          /* The offset arg before was a vec4-aligned byte offset.  We need to
3142           * turn it into a dword offset.
3143           */
3144          fs_reg const_offset_reg = inst->src[1];
3145          assert(const_offset_reg.file == IMM &&
3146                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3147          const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3148          fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3149
3150          /* We have to use a message header on Skylake to get SIMD4x2 mode.
3151           * Reserve space for the register.
3152           */
3153          if (brw->gen >= 9) {
3154             payload.reg_offset++;
3155             alloc.sizes[payload.reg] = 2;
3156          }
3157
3158          /* This is actually going to be a MOV, but since only the first dword
3159           * is accessed, we have a special opcode to do just that one.  Note
3160           * that this needs to be an operation that will be considered a def
3161           * by live variable analysis, or register allocation will explode.
3162           */
3163          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3164                                                8, payload, const_offset_reg);
3165          setup->force_writemask_all = true;
3166
3167          setup->ir = inst->ir;
3168          setup->annotation = inst->annotation;
3169          inst->insert_before(block, setup);
3170
3171          /* Similarly, this will only populate the first 4 channels of the
3172           * result register (since we only use smear values from 0-3), but we
3173           * don't tell the optimizer.
3174           */
3175          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3176          inst->src[1] = payload;
3177
3178          invalidate_live_intervals();
3179       } else {
3180          /* Before register allocation, we didn't tell the scheduler about the
3181           * MRF we use.  We know it's safe to use this MRF because nothing
3182           * else does except for register spill/unspill, which generates and
3183           * uses its MRF within a single IR instruction.
3184           */
3185          inst->base_mrf = 14;
3186          inst->mlen = 1;
3187       }
3188    }
3189 }
3190
/**
 * Replaces every SHADER_OPCODE_LOAD_PAYLOAD with a sequence of MOVs, one per
 * source (or one per pair of sources when a COMPR4 MRF write can be used).
 *
 * While walking the program, per-register metadata records the
 * force_writemask_all / force_sechalf state of the last write to each GRF
 * register, so that the generated MOVs can copy that state from the
 * register they read.
 *
 * Returns true if at least one LOAD_PAYLOAD was lowered.
 */
bool
fs_visitor::lower_load_payload()
{
   bool progress = false;

   /* vgrf_to_reg[v] is the index of the first register of virtual GRF v in
    * a flat numbering of all allocated registers (running sum of
    * alloc.sizes); reg_count ends up as the total number of registers.
    */
   int vgrf_to_reg[alloc.count];
   int reg_count = 0;
   for (unsigned i = 0; i < alloc.count; ++i) {
      vgrf_to_reg[i] = reg_count;
      reg_count += alloc.sizes[i];
   }

   /* One entry per flat register index.  Note that 'written' is set below
    * but not read anywhere in this function.
    */
   struct {
      bool written:1; /* Whether this register has ever been written */
      bool force_writemask_all:1;
      bool force_sechalf:1;
   } metadata[reg_count];
   memset(metadata, 0, sizeof(metadata));

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      /* Record how each register written by this instruction was written.
       * This also runs for LOAD_PAYLOAD instructions with a GRF destination,
       * before they are lowered below.
       */
      if (inst->dst.file == GRF) {
         const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
         bool force_sechalf = inst->force_sechalf &&
                              !inst->force_writemask_all;
         /* For a 16-wide destination with a 4-byte type and no
          * force_writemask_all, consecutive registers alternate between
          * first and second half.
          */
         bool toggle_sechalf = inst->dst.width == 16 &&
                               type_sz(inst->dst.type) == 4 &&
                               !inst->force_writemask_all;
         for (int i = 0; i < inst->regs_written; ++i) {
            metadata[dst_reg + i].written = true;
            metadata[dst_reg + i].force_sechalf = force_sechalf;
            metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
            /* Flips force_sechalf when toggle_sechalf is set, otherwise
             * leaves it unchanged.
             */
            force_sechalf = (toggle_sechalf != force_sechalf);
         }
      }

      if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
         assert(inst->dst.file == MRF || inst->dst.file == GRF);
         /* dst is advanced by one position after every source. */
         fs_reg dst = inst->dst;

         for (int i = 0; i < inst->sources; i++) {
            /* The destination takes its width and type from the source. */
            dst.width = inst->src[i].effective_width;
            dst.type = inst->src[i].type;

            if (inst->src[i].file == BAD_FILE) {
               /* Do nothing but otherwise increment as normal */
            } else if (dst.file == MRF &&
                       dst.width == 8 &&
                       brw->has_compr4 &&
                       i + 4 < inst->sources &&
                       inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
               /* Source i + 4 is the 8-channel continuation of source i, so
                * both can be written with a single 16-wide COMPR4 MOV.
                */
               fs_reg compr4_dst = dst;
               compr4_dst.reg += BRW_MRF_COMPR4;
               compr4_dst.width = 16;
               fs_reg compr4_src = inst->src[i];
               compr4_src.width = 16;
               fs_inst *mov = MOV(compr4_dst, compr4_src);
               mov->force_writemask_all = true;
               inst->insert_before(block, mov);
               /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
               inst->src[i + 4].file = BAD_FILE;
            } else {
               fs_inst *mov = MOV(dst, inst->src[i]);
               if (inst->src[i].file == GRF) {
                  /* Copy the write state recorded for the source register. */
                  int src_reg = vgrf_to_reg[inst->src[i].reg] +
                                inst->src[i].reg_offset;
                  mov->force_sechalf = metadata[src_reg].force_sechalf;
                  mov->force_writemask_all = metadata[src_reg].force_writemask_all;
               } else {
                  /* We don't have any useful metadata for immediates or
                   * uniforms.  Assume that any of the channels of the
                   * destination may be used.
                   */
                  assert(inst->src[i].file == IMM ||
                         inst->src[i].file == UNIFORM);
                  mov->force_writemask_all = true;
               }

               if (dst.file == GRF) {
                  /* Record the state of the MOV just built for its
                   * destination register(s), so later payload loads that
                   * read them see it.
                   */
                  const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
                  const bool force_writemask = mov->force_writemask_all;
                  metadata[dst_reg].force_writemask_all = force_writemask;
                  metadata[dst_reg].force_sechalf = mov->force_sechalf;
                  /* More than 32 bytes of data also covers the following
                   * register.
                   */
                  if (dst.width * type_sz(dst.type) > 32) {
                     assert(!mov->force_sechalf);
                     metadata[dst_reg + 1].force_writemask_all = force_writemask;
                     metadata[dst_reg + 1].force_sechalf = !force_writemask;
                  }
               }

               inst->insert_before(block, mov);
            }

            dst = offset(dst, 1);
         }

         /* The MOVs replace the LOAD_PAYLOAD entirely. */
         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
3296
3297 void
3298 fs_visitor::dump_instructions()
3299 {
3300    dump_instructions(NULL);
3301 }
3302
3303 void
3304 fs_visitor::dump_instructions(const char *name)
3305 {
3306    FILE *file = stderr;
3307    if (name && geteuid() != 0) {
3308       file = fopen(name, "w");
3309       if (!file)
3310          file = stderr;
3311    }
3312
3313    if (cfg) {
3314       calculate_register_pressure();
3315       int ip = 0, max_pressure = 0;
3316       foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3317          max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3318          fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3319          dump_instruction(inst, file);
3320          ip++;
3321       }
3322       fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3323    } else {
3324       int ip = 0;
3325       foreach_in_list(backend_instruction, inst, &instructions) {
3326          fprintf(file, "%4d: ", ip++);
3327          dump_instruction(inst, file);
3328       }
3329    }
3330
3331    if (file != stderr) {
3332       fclose(file);
3333    }
3334 }
3335
3336 void
3337 fs_visitor::dump_instruction(backend_instruction *be_inst)
3338 {
3339    dump_instruction(be_inst, stderr);
3340 }
3341
3342 void
3343 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3344 {
3345    fs_inst *inst = (fs_inst *)be_inst;
3346
3347    if (inst->predicate) {
3348       fprintf(file, "(%cf0.%d) ",
3349              inst->predicate_inverse ? '-' : '+',
3350              inst->flag_subreg);
3351    }
3352
3353    fprintf(file, "%s", brw_instruction_name(inst->opcode));
3354    if (inst->saturate)
3355       fprintf(file, ".sat");
3356    if (inst->conditional_mod) {
3357       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3358       if (!inst->predicate &&
3359           (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3360                               inst->opcode != BRW_OPCODE_IF &&
3361                               inst->opcode != BRW_OPCODE_WHILE))) {
3362          fprintf(file, ".f0.%d", inst->flag_subreg);
3363       }
3364    }
3365    fprintf(file, "(%d) ", inst->exec_size);
3366
3367
3368    switch (inst->dst.file) {
3369    case GRF:
3370       fprintf(file, "vgrf%d", inst->dst.reg);
3371       if (inst->dst.width != dispatch_width)
3372          fprintf(file, "@%d", inst->dst.width);
3373       if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3374           inst->dst.subreg_offset)
3375          fprintf(file, "+%d.%d",
3376                  inst->dst.reg_offset, inst->dst.subreg_offset);
3377       break;
3378    case MRF:
3379       fprintf(file, "m%d", inst->dst.reg);
3380       break;
3381    case BAD_FILE:
3382       fprintf(file, "(null)");
3383       break;
3384    case UNIFORM:
3385       fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3386       break;
3387    case ATTR:
3388       fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3389       break;
3390    case HW_REG:
3391       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3392          switch (inst->dst.fixed_hw_reg.nr) {
3393          case BRW_ARF_NULL:
3394             fprintf(file, "null");
3395             break;
3396          case BRW_ARF_ADDRESS:
3397             fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3398             break;
3399          case BRW_ARF_ACCUMULATOR:
3400             fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3401             break;
3402          case BRW_ARF_FLAG:
3403             fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3404                              inst->dst.fixed_hw_reg.subnr);
3405             break;
3406          default:
3407             fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3408                                inst->dst.fixed_hw_reg.subnr);
3409             break;
3410          }
3411       } else {
3412          fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3413       }
3414       if (inst->dst.fixed_hw_reg.subnr)
3415          fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3416       break;
3417    default:
3418       fprintf(file, "???");
3419       break;
3420    }
3421    fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3422
3423    for (int i = 0; i < inst->sources; i++) {
3424       if (inst->src[i].negate)
3425          fprintf(file, "-");
3426       if (inst->src[i].abs)
3427          fprintf(file, "|");
3428       switch (inst->src[i].file) {
3429       case GRF:
3430          fprintf(file, "vgrf%d", inst->src[i].reg);
3431          if (inst->src[i].width != dispatch_width)
3432             fprintf(file, "@%d", inst->src[i].width);
3433          if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3434              inst->src[i].subreg_offset)
3435             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3436                     inst->src[i].subreg_offset);
3437          break;
3438       case MRF:
3439          fprintf(file, "***m%d***", inst->src[i].reg);
3440          break;
3441       case ATTR:
3442          fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3443          break;
3444       case UNIFORM:
3445          fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3446          if (inst->src[i].reladdr) {
3447             fprintf(file, "+reladdr");
3448          } else if (inst->src[i].subreg_offset) {
3449             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3450                     inst->src[i].subreg_offset);
3451          }
3452          break;
3453       case BAD_FILE:
3454          fprintf(file, "(null)");
3455          break;
3456       case IMM:
3457          switch (inst->src[i].type) {
3458          case BRW_REGISTER_TYPE_F:
3459             fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3460             break;
3461          case BRW_REGISTER_TYPE_W:
3462          case BRW_REGISTER_TYPE_D:
3463             fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3464             break;
3465          case BRW_REGISTER_TYPE_UW:
3466          case BRW_REGISTER_TYPE_UD:
3467             fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3468             break;
3469          case BRW_REGISTER_TYPE_VF:
3470             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3471                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
3472                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
3473                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3474                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3475             break;
3476          default:
3477             fprintf(file, "???");
3478             break;
3479          }
3480          break;
3481       case HW_REG:
3482          if (inst->src[i].fixed_hw_reg.negate)
3483             fprintf(file, "-");
3484          if (inst->src[i].fixed_hw_reg.abs)
3485             fprintf(file, "|");
3486          if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3487             switch (inst->src[i].fixed_hw_reg.nr) {
3488             case BRW_ARF_NULL:
3489                fprintf(file, "null");
3490                break;
3491             case BRW_ARF_ADDRESS:
3492                fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3493                break;
3494             case BRW_ARF_ACCUMULATOR:
3495                fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3496                break;
3497             case BRW_ARF_FLAG:
3498                fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3499                                 inst->src[i].fixed_hw_reg.subnr);
3500                break;
3501             default:
3502                fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3503                                   inst->src[i].fixed_hw_reg.subnr);
3504                break;
3505             }
3506          } else {
3507             fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3508          }
3509          if (inst->src[i].fixed_hw_reg.subnr)
3510             fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3511          if (inst->src[i].fixed_hw_reg.abs)
3512             fprintf(file, "|");
3513          break;
3514       default:
3515          fprintf(file, "???");
3516          break;
3517       }
3518       if (inst->src[i].abs)
3519          fprintf(file, "|");
3520
3521       if (inst->src[i].file != IMM) {
3522          fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3523       }
3524
3525       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3526          fprintf(file, ", ");
3527    }
3528
3529    fprintf(file, " ");
3530
3531    if (dispatch_width == 16 && inst->exec_size == 8) {
3532       if (inst->force_sechalf)
3533          fprintf(file, "2ndhalf ");
3534       else
3535          fprintf(file, "1sthalf ");
3536    }
3537
3538    fprintf(file, "\n");
3539 }
3540
3541 /**
3542  * Possibly returns an instruction that set up @param reg.
3543  *
3544  * Sometimes we want to take the result of some expression/variable
3545  * dereference tree and rewrite the instruction generating the result
3546  * of the tree.  When processing the tree, we know that the
3547  * instructions generated are all writing temporaries that are dead
3548  * outside of this tree.  So, if we have some instructions that write
3549  * a temporary, we're free to point that temp write somewhere else.
3550  *
3551  * Note that this doesn't guarantee that the instruction generated
3552  * only reg -- it might be the size=4 destination of a texture instruction.
3553  */
3554 fs_inst *
3555 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3556                                            fs_inst *end,
3557                                            const fs_reg &reg)
3558 {
3559    if (end == start ||
3560        end->is_partial_write() ||
3561        reg.reladdr ||
3562        !reg.equals(end->dst)) {
3563       return NULL;
3564    } else {
3565       return end;
3566    }
3567 }
3568
3569 void
3570 fs_visitor::setup_payload_gen6()
3571 {
3572    bool uses_depth =
3573       (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3574    unsigned barycentric_interp_modes =
3575       (stage == MESA_SHADER_FRAGMENT) ?
3576       ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3577
3578    assert(brw->gen >= 6);
3579
3580    /* R0-1: masks, pixel X/Y coordinates. */
3581    payload.num_regs = 2;
3582    /* R2: only for 32-pixel dispatch.*/
3583
3584    /* R3-26: barycentric interpolation coordinates.  These appear in the
3585     * same order that they appear in the brw_wm_barycentric_interp_mode
3586     * enum.  Each set of coordinates occupies 2 registers if dispatch width
3587     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
3588     * appear if they were enabled using the "Barycentric Interpolation
3589     * Mode" bits in WM_STATE.
3590     */
3591    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3592       if (barycentric_interp_modes & (1 << i)) {
3593          payload.barycentric_coord_reg[i] = payload.num_regs;
3594          payload.num_regs += 2;
3595          if (dispatch_width == 16) {
3596             payload.num_regs += 2;
3597          }
3598       }
3599    }
3600
3601    /* R27: interpolated depth if uses source depth */
3602    if (uses_depth) {
3603       payload.source_depth_reg = payload.num_regs;
3604       payload.num_regs++;
3605       if (dispatch_width == 16) {
3606          /* R28: interpolated depth if not SIMD8. */
3607          payload.num_regs++;
3608       }
3609    }
3610    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3611    if (uses_depth) {
3612       payload.source_w_reg = payload.num_regs;
3613       payload.num_regs++;
3614       if (dispatch_width == 16) {
3615          /* R30: interpolated W if not SIMD8. */
3616          payload.num_regs++;
3617       }
3618    }
3619
3620    if (stage == MESA_SHADER_FRAGMENT) {
3621       brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3622       brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3623       prog_data->uses_pos_offset = key->compute_pos_offset;
3624       /* R31: MSAA position offsets. */
3625       if (prog_data->uses_pos_offset) {
3626          payload.sample_pos_reg = payload.num_regs;
3627          payload.num_regs++;
3628       }
3629    }
3630
3631    /* R32: MSAA input coverage mask */
3632    if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3633       assert(brw->gen >= 7);
3634       payload.sample_mask_in_reg = payload.num_regs;
3635       payload.num_regs++;
3636       if (dispatch_width == 16) {
3637          /* R33: input coverage mask if not SIMD8. */
3638          payload.num_regs++;
3639       }
3640    }
3641
3642    /* R34-: bary for 32-pixel. */
3643    /* R58-59: interp W for 32-pixel. */
3644
3645    if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3646       source_depth_to_render_target = true;
3647    }
3648 }
3649
3650 void
3651 fs_visitor::setup_vs_payload()
3652 {
3653    /* R0: thread header, R1: urb handles */
3654    payload.num_regs = 2;
3655 }
3656
3657 void
3658 fs_visitor::assign_binding_table_offsets()
3659 {
3660    assert(stage == MESA_SHADER_FRAGMENT);
3661    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3662    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3663    uint32_t next_binding_table_offset = 0;
3664
3665    /* If there are no color regions, we still perform an FB write to a null
3666     * renderbuffer, which we place at surface index 0.
3667     */
3668    prog_data->binding_table.render_target_start = next_binding_table_offset;
3669    next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3670
3671    assign_common_binding_table_offsets(next_binding_table_offset);
3672 }
3673
3674 void
3675 fs_visitor::calculate_register_pressure()
3676 {
3677    invalidate_live_intervals();
3678    calculate_live_intervals();
3679
3680    unsigned num_instructions = 0;
3681    foreach_block(block, cfg)
3682       num_instructions += block->instructions.length();
3683
3684    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3685
3686    for (unsigned reg = 0; reg < alloc.count; reg++) {
3687       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3688          regs_live_at_ip[ip] += alloc.sizes[reg];
3689    }
3690 }
3691
3692 void
3693 fs_visitor::optimize()
3694 {
3695    const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3696
3697    split_virtual_grfs();
3698
3699    move_uniform_array_access_to_pull_constants();
3700    assign_constant_locations();
3701    demote_pull_constants();
3702
3703 #define OPT(pass, args...) ({                                           \
3704       pass_num++;                                                       \
3705       bool this_progress = pass(args);                                  \
3706                                                                         \
3707       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
3708          char filename[64];                                             \
3709          snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,              \
3710                   stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3711                                                                         \
3712          backend_visitor::dump_instructions(filename);                  \
3713       }                                                                 \
3714                                                                         \
3715       progress = progress || this_progress;                             \
3716       this_progress;                                                    \
3717    })
3718
3719    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3720       char filename[64];
3721       snprintf(filename, 64, "%s%d-%04d-00-start",
3722                stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3723
3724       backend_visitor::dump_instructions(filename);
3725    }
3726
3727    bool progress;
3728    int iteration = 0;
3729    int pass_num = 0;
3730    do {
3731       progress = false;
3732       pass_num = 0;
3733       iteration++;
3734
3735       OPT(remove_duplicate_mrf_writes);
3736
3737       OPT(opt_algebraic);
3738       OPT(opt_cse);
3739       OPT(opt_copy_propagate);
3740       OPT(opt_peephole_predicated_break);
3741       OPT(opt_cmod_propagation);
3742       OPT(dead_code_eliminate);
3743       OPT(opt_peephole_sel);
3744       OPT(dead_control_flow_eliminate, this);
3745       OPT(opt_register_renaming);
3746       OPT(opt_redundant_discard_jumps);
3747       OPT(opt_saturate_propagation);
3748       OPT(register_coalesce);
3749       OPT(compute_to_mrf);
3750
3751       OPT(compact_virtual_grfs);
3752    } while (progress);
3753
3754    pass_num = 0;
3755
3756    if (OPT(lower_load_payload)) {
3757       split_virtual_grfs();
3758       OPT(register_coalesce);
3759       OPT(compute_to_mrf);
3760       OPT(dead_code_eliminate);
3761    }
3762
3763    OPT(opt_combine_constants);
3764
3765    lower_uniform_pull_constant_loads();
3766 }
3767
3768 /**
3769  * Three source instruction must have a GRF/MRF destination register.
3770  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
3771  */
3772 void
3773 fs_visitor::fixup_3src_null_dest()
3774 {
3775    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3776       if (inst->is_3src() && inst->dst.is_null()) {
3777          inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3778                             inst->dst.type);
3779       }
3780    }
3781 }
3782
3783 void
3784 fs_visitor::allocate_registers()
3785 {
3786    bool allocated_without_spills;
3787
3788    static const enum instruction_scheduler_mode pre_modes[] = {
3789       SCHEDULE_PRE,
3790       SCHEDULE_PRE_NON_LIFO,
3791       SCHEDULE_PRE_LIFO,
3792    };
3793
3794    /* Try each scheduling heuristic to see if it can successfully register
3795     * allocate without spilling.  They should be ordered by decreasing
3796     * performance but increasing likelihood of allocating.
3797     */
3798    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3799       schedule_instructions(pre_modes[i]);
3800
3801       if (0) {
3802          assign_regs_trivial();
3803          allocated_without_spills = true;
3804       } else {
3805          allocated_without_spills = assign_regs(false);
3806       }
3807       if (allocated_without_spills)
3808          break;
3809    }
3810
3811    if (!allocated_without_spills) {
3812       const char *stage_name = stage == MESA_SHADER_VERTEX ?
3813          "Vertex" : "Fragment";
3814
3815       /* We assume that any spilling is worse than just dropping back to
3816        * SIMD8.  There's probably actually some intermediate point where
3817        * SIMD16 with a couple of spills is still better.
3818        */
3819       if (dispatch_width == 16) {
3820          fail("Failure to register allocate.  Reduce number of "
3821               "live scalar values to avoid this.");
3822       } else {
3823          perf_debug("%s shader triggered register spilling.  "
3824                     "Try reducing the number of live scalar values to "
3825                     "improve performance.\n", stage_name);
3826       }
3827
3828       /* Since we're out of heuristics, just go spill registers until we
3829        * get an allocation.
3830        */
3831       while (!assign_regs(true)) {
3832          if (failed)
3833             break;
3834       }
3835    }
3836
3837    /* This must come after all optimization and register allocation, since
3838     * it inserts dead code that happens to have side effects, and it does
3839     * so based on the actual physical registers in use.
3840     */
3841    insert_gen4_send_dependency_workarounds();
3842
3843    if (failed)
3844       return;
3845
3846    if (!allocated_without_spills)
3847       schedule_instructions(SCHEDULE_POST);
3848
3849    if (last_scratch > 0)
3850       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3851 }
3852
/**
 * Read the environment variable \p var_name as a boolean.
 *
 * "1", "true" and "yes" yield true; "0", "false" and "no" yield false.  The
 * words are matched without regard to case.  An unset variable or any other
 * value yields \p default_value.
 */
static bool
env_var_as_boolean(const char *var_name, bool default_value)
{
   const char *value = getenv(var_name);
   if (!value)
      return default_value;

   const bool is_true = strcmp(value, "1") == 0 ||
                        strcasecmp(value, "true") == 0 ||
                        strcasecmp(value, "yes") == 0;
   if (is_true)
      return true;

   const bool is_false = strcmp(value, "0") == 0 ||
                         strcasecmp(value, "false") == 0 ||
                         strcasecmp(value, "no") == 0;

   /* Anything unrecognized falls back to the caller's default. */
   return is_false ? false : default_value;
}
3872
3873 bool
3874 fs_visitor::run_vs()
3875 {
3876    assert(stage == MESA_SHADER_VERTEX);
3877
3878    assign_common_binding_table_offsets(0);
3879    setup_vs_payload();
3880
3881    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3882       emit_shader_time_begin();
3883
3884    if (env_var_as_boolean("INTEL_USE_NIR", false)) {
3885       emit_nir_code();
3886    } else {
3887       foreach_in_list(ir_instruction, ir, shader->base.ir) {
3888          base_ir = ir;
3889          this->result = reg_undef;
3890          ir->accept(this);
3891       }
3892       base_ir = NULL;
3893    }
3894
3895    if (failed)
3896       return false;
3897
3898    emit_urb_writes();
3899
3900    calculate_cfg();
3901
3902    optimize();
3903
3904    assign_curb_setup();
3905    assign_vs_urb_setup();
3906
3907    fixup_3src_null_dest();
3908    allocate_registers();
3909
3910    return !failed;
3911 }
3912
3913 bool
3914 fs_visitor::run_fs()
3915 {
3916    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3917    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3918
3919    assert(stage == MESA_SHADER_FRAGMENT);
3920
3921    sanity_param_count = prog->Parameters->NumParameters;
3922
3923    assign_binding_table_offsets();
3924
3925    if (brw->gen >= 6)
3926       setup_payload_gen6();
3927    else
3928       setup_payload_gen4();
3929
3930    if (0) {
3931       emit_dummy_fs();
3932    } else if (brw->use_rep_send && dispatch_width == 16) {
3933       emit_repclear_shader();
3934    } else {
3935       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3936          emit_shader_time_begin();
3937
3938       calculate_urb_setup();
3939       if (prog->InputsRead > 0) {
3940          if (brw->gen < 6)
3941             emit_interpolation_setup_gen4();
3942          else
3943             emit_interpolation_setup_gen6();
3944       }
3945
3946       /* We handle discards by keeping track of the still-live pixels in f0.1.
3947        * Initialize it with the dispatched pixels.
3948        */
3949       if (wm_prog_data->uses_kill) {
3950          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3951          discard_init->flag_subreg = 1;
3952       }
3953
3954       /* Generate FS IR for main().  (the visitor only descends into
3955        * functions called "main").
3956        */
3957       if (shader) {
3958          if (env_var_as_boolean("INTEL_USE_NIR", false)) {
3959             emit_nir_code();
3960          } else {
3961             foreach_in_list(ir_instruction, ir, shader->base.ir) {
3962                base_ir = ir;
3963                this->result = reg_undef;
3964                ir->accept(this);
3965             }
3966          }
3967       } else {
3968          emit_fragment_program_code();
3969       }
3970       base_ir = NULL;
3971       if (failed)
3972          return false;
3973
3974       emit(FS_OPCODE_PLACEHOLDER_HALT);
3975
3976       if (wm_key->alpha_test_func)
3977          emit_alpha_test();
3978
3979       emit_fb_writes();
3980
3981       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3982          emit_shader_time_end();
3983
3984       calculate_cfg();
3985
3986       optimize();
3987
3988       assign_curb_setup();
3989       assign_urb_setup();
3990
3991       fixup_3src_null_dest();
3992       allocate_registers();
3993
3994       if (failed)
3995          return false;
3996    }
3997
3998    if (dispatch_width == 8)
3999       wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4000    else
4001       wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4002
4003    /* If any state parameters were appended, then ParameterValues could have
4004     * been realloced, in which case the driver uniform storage set up by
4005     * _mesa_associate_uniform_storage() would point to freed memory.  Make
4006     * sure that didn't happen.
4007     */
4008    assert(sanity_param_count == prog->Parameters->NumParameters);
4009
4010    return !failed;
4011 }
4012
4013 const unsigned *
4014 brw_wm_fs_emit(struct brw_context *brw,
4015                void *mem_ctx,
4016                const struct brw_wm_prog_key *key,
4017                struct brw_wm_prog_data *prog_data,
4018                struct gl_fragment_program *fp,
4019                struct gl_shader_program *prog,
4020                unsigned *final_assembly_size)
4021 {
4022    bool start_busy = false;
4023    double start_time = 0;
4024
4025    if (unlikely(brw->perf_debug)) {
4026       start_busy = (brw->batch.last_bo &&
4027                     drm_intel_bo_busy(brw->batch.last_bo));
4028       start_time = get_time();
4029    }
4030
4031    struct brw_shader *shader = NULL;
4032    if (prog)
4033       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4034
4035    if (unlikely(INTEL_DEBUG & DEBUG_WM))
4036       brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4037
4038    /* Now the main event: Visit the shader IR and generate our FS IR for it.
4039     */
4040    fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4041    if (!v.run_fs()) {
4042       if (prog) {
4043          prog->LinkStatus = false;
4044          ralloc_strcat(&prog->InfoLog, v.fail_msg);
4045       }
4046
4047       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4048                     v.fail_msg);
4049
4050       return NULL;
4051    }
4052
4053    cfg_t *simd16_cfg = NULL;
4054    fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4055    if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
4056                                brw->use_rep_send)) {
4057       if (!v.simd16_unsupported) {
4058          /* Try a SIMD16 compile */
4059          v2.import_uniforms(&v);
4060          if (!v2.run_fs()) {
4061             perf_debug("SIMD16 shader failed to compile, falling back to "
4062                        "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4063          } else {
4064             simd16_cfg = v2.cfg;
4065          }
4066       } else {
4067          perf_debug("SIMD16 shader unsupported, falling back to "
4068                     "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4069       }
4070    }
4071
4072    cfg_t *simd8_cfg;
4073    int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4074    if (no_simd8 && simd16_cfg) {
4075       simd8_cfg = NULL;
4076       prog_data->no_8 = true;
4077    } else {
4078       simd8_cfg = v.cfg;
4079       prog_data->no_8 = false;
4080    }
4081
4082    fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4083                   &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4084
4085    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4086       char *name;
4087       if (prog)
4088          name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4089                                 prog->Label ? prog->Label : "unnamed",
4090                                 prog->Name);
4091       else
4092          name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4093
4094       g.enable_debug(name);
4095    }
4096
4097    if (simd8_cfg)
4098       g.generate_code(simd8_cfg, 8);
4099    if (simd16_cfg)
4100       prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4101
4102    if (unlikely(brw->perf_debug) && shader) {
4103       if (shader->compiled_once)
4104          brw_wm_debug_recompile(brw, prog, key);
4105       shader->compiled_once = true;
4106
4107       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4108          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4109                     (get_time() - start_time) * 1000);
4110       }
4111    }
4112
4113    return g.get_assembly(final_assembly_size);
4114 }
4115
4116 extern "C" bool
4117 brw_fs_precompile(struct gl_context *ctx,
4118                   struct gl_shader_program *shader_prog,
4119                   struct gl_program *prog)
4120 {
4121    struct brw_context *brw = brw_context(ctx);
4122    struct brw_wm_prog_key key;
4123
4124    struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4125    struct brw_fragment_program *bfp = brw_fragment_program(fp);
4126    bool program_uses_dfdy = fp->UsesDFdy;
4127
4128    memset(&key, 0, sizeof(key));
4129
4130    if (brw->gen < 6) {
4131       if (fp->UsesKill)
4132          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4133
4134       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4135          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4136
4137       /* Just assume depth testing. */
4138       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4139       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4140    }
4141
4142    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4143                                          BRW_FS_VARYING_INPUT_MASK) > 16)
4144       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4145
4146    const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4147    unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4148    for (unsigned i = 0; i < sampler_count; i++) {
4149       if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4150          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4151          key.tex.swizzles[i] =
4152             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4153       } else {
4154          /* Color sampler: assume no swizzling. */
4155          key.tex.swizzles[i] = SWIZZLE_XYZW;
4156       }
4157    }
4158
4159    if (fp->Base.InputsRead & VARYING_BIT_POS) {
4160       key.drawable_height = ctx->DrawBuffer->Height;
4161    }
4162
4163    key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4164          ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4165          BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4166
4167    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4168       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4169                           key.nr_color_regions > 1;
4170    }
4171
4172    key.program_string_id = bfp->id;
4173
4174    uint32_t old_prog_offset = brw->wm.base.prog_offset;
4175    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4176
4177    bool success = do_wm_prog(brw, shader_prog, bfp, &key);
4178
4179    brw->wm.base.prog_offset = old_prog_offset;
4180    brw->wm.prog_data = old_prog_data;
4181
4182    return success;
4183 }