[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp (blob 9fca9914f2c7956a853b845ab500b63a2d781156)
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 this->conditional_mod = BRW_CONDITIONAL_NONE;
94
95 /* This will be the case for almost all instructions. */
96 switch (dst.file) {
97 case GRF:
98 case HW_REG:
99 case MRF:
100 case ATTR:
101 this->regs_written =
102 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
103 break;
104 case BAD_FILE:
105 this->regs_written = 0;
106 break;
107 case IMM:
108 case UNIFORM:
109 unreachable("Invalid destination register file");
110 default:
111 unreachable("Invalid register file");
112 }
113
114 this->writes_accumulator = false;
115 }
116
117 fs_inst::fs_inst()
118 {
119 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
120 }
121
122 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
123 {
124 init(opcode, exec_size, reg_undef, NULL, 0);
125 }
126
127 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
128 {
129 init(opcode, 0, dst, NULL, 0);
130 }
131
132 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
133 const fs_reg &src0)
134 {
135 const fs_reg src[1] = { src0 };
136 init(opcode, exec_size, dst, src, 1);
137 }
138
139 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
140 {
141 const fs_reg src[1] = { src0 };
142 init(opcode, 0, dst, src, 1);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
146 const fs_reg &src0, const fs_reg &src1)
147 {
148 const fs_reg src[2] = { src0, src1 };
149 init(opcode, exec_size, dst, src, 2);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
153 const fs_reg &src1)
154 {
155 const fs_reg src[2] = { src0, src1 };
156 init(opcode, 0, dst, src, 2);
157 }
158
159 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
160 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
161 {
162 const fs_reg src[3] = { src0, src1, src2 };
163 init(opcode, exec_size, dst, src, 3);
164 }
165
166 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
167 const fs_reg &src1, const fs_reg &src2)
168 {
169 const fs_reg src[3] = { src0, src1, src2 };
170 init(opcode, 0, dst, src, 3);
171 }
172
173 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
174 const fs_reg src[], unsigned sources)
175 {
176 init(opcode, 0, dst, src, sources);
177 }
178
179 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
180 const fs_reg src[], unsigned sources)
181 {
182 init(opcode, exec_width, dst, src, sources);
183 }
184
185 fs_inst::fs_inst(const fs_inst &that)
186 {
187 memcpy(this, &that, sizeof(that));
188
189 this->src = new fs_reg[MAX2(that.sources, 3)];
190
191 for (unsigned i = 0; i < that.sources; i++)
192 this->src[i] = that.src[i];
193 }
194
195 fs_inst::~fs_inst()
196 {
197 delete[] this->src;
198 }
199
200 void
201 fs_inst::resize_sources(uint8_t num_sources)
202 {
203 if (this->sources != num_sources) {
204 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
205
206 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
207 src[i] = this->src[i];
208
209 delete[] this->src;
210 this->src = src;
211 this->sources = num_sources;
212 }
213 }
214
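/* The ALU* macros below stamp out the fs_visitor helpers (MOV, ADD, MAD and
 * friends): each one simply allocates a new fs_inst for the matching BRW
 * opcode out of mem_ctx, and ALU2_ACC additionally marks the instruction as
 * writing the accumulator.
 */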
215 #define ALU1(op) \
216 fs_inst * \
217 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
218 { \
219 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
220 }
221
222 #define ALU2(op) \
223 fs_inst * \
224 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
225 const fs_reg &src1) \
226 { \
227 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
228 }
229
230 #define ALU2_ACC(op) \
231 fs_inst * \
232 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
233 const fs_reg &src1) \
234 { \
235 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
236 inst->writes_accumulator = true; \
237 return inst; \
238 }
239
240 #define ALU3(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
243 const fs_reg &src1, const fs_reg &src2) \
244 { \
245 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
246 }
247
248 ALU1(NOT)
249 ALU1(MOV)
250 ALU1(FRC)
251 ALU1(RNDD)
252 ALU1(RNDE)
253 ALU1(RNDZ)
254 ALU2(ADD)
255 ALU2(MUL)
256 ALU2_ACC(MACH)
257 ALU2(AND)
258 ALU2(OR)
259 ALU2(XOR)
260 ALU2(SHL)
261 ALU2(SHR)
262 ALU2(ASR)
263 ALU3(LRP)
264 ALU1(BFREV)
265 ALU3(BFE)
266 ALU2(BFI1)
267 ALU3(BFI2)
268 ALU1(FBH)
269 ALU1(FBL)
270 ALU1(CBIT)
271 ALU3(MAD)
272 ALU2_ACC(ADDC)
273 ALU2_ACC(SUBB)
274 ALU2(SEL)
275 ALU2(MAC)
276
277 /** Gen4 predicated IF. */
278 fs_inst *
279 fs_visitor::IF(enum brw_predicate predicate)
280 {
281 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
282 inst->predicate = predicate;
283 return inst;
284 }
285
286 /** Gen6 IF with embedded comparison. */
287 fs_inst *
288 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
289 enum brw_conditional_mod condition)
290 {
291 assert(devinfo->gen == 6);
292 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
293 reg_null_d, src0, src1);
294 inst->conditional_mod = condition;
295 return inst;
296 }
297
298 /**
299 * CMP: Sets the low bit of the destination channels with the result
300 * of the comparison, while the upper bits are undefined, and updates
301 * the flag register with the packed 16 bits of the result.
302 */
303 fs_inst *
304 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
305 enum brw_conditional_mod condition)
306 {
307 fs_inst *inst;
308
309 /* Take the instruction:
310 *
311 * CMP null<d> src0<f> src1<f>
312 *
313 * Original gen4 does type conversion to the destination type before
314 * comparison, producing garbage results for floating point comparisons.
315 *
316 * The destination type doesn't matter on newer generations, so we set the
317 * type to match src0 so we can compact the instruction.
318 */
319 dst.type = src0.type;
320 if (dst.file == HW_REG)
321 dst.fixed_hw_reg.type = dst.type;
322
323 resolve_ud_negate(&src0);
324 resolve_ud_negate(&src1);
325
326 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
327 inst->conditional_mod = condition;
328
329 return inst;
330 }
331
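/* Gathers 'sources' registers into one contiguous payload at dst. The first
 * header_size sources each occupy a single full register; every remaining
 * source is dst.width channels wide, which is what the regs_written
 * accounting below reflects.
 */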
332 fs_inst *
333 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
334 int header_size)
335 {
336 assert(dst.width % 8 == 0);
337 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
338 dst, src, sources);
339 inst->header_size = header_size;
340
341 for (int i = 0; i < header_size; i++)
342 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
343 inst->regs_written = header_size;
344
345 for (int i = header_size; i < sources; ++i)
346 assert(src[i].file != GRF || src[i].width == dst.width);
347 inst->regs_written += (sources - header_size) * (dst.width / 8);
348
349 return inst;
350 }
351
352 exec_list
353 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
354 const fs_reg &surf_index,
355 const fs_reg &varying_offset,
356 uint32_t const_offset)
357 {
358 exec_list instructions;
359 fs_inst *inst;
360
361 /* We have our constant surface use a pitch of 4 bytes, so our index can
362 * be any component of a vector, and then we load 4 contiguous
363 * components starting from that.
364 *
365 * We break down the const_offset to a portion added to the variable
366 * offset and a portion done using reg_offset, which means that if you
367 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
368 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
369 * CSE can later notice that those loads are all the same and eliminate
370 * the redundant ones.
371 */
372 fs_reg vec4_offset = vgrf(glsl_type::int_type);
373 instructions.push_tail(ADD(vec4_offset,
374 varying_offset, fs_reg(const_offset & ~3)));
375
376 int scale = 1;
377 if (devinfo->gen == 4 && dst.width == 8) {
378 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
379 * u, v, r) as parameters, or we can just use the SIMD16 message
380 * consisting of (header, u). We choose the second, at the cost of a
381 * longer return length.
382 */
383 scale = 2;
384 }
385
386 enum opcode op;
387 if (devinfo->gen >= 7)
388 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
389 else
390 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
391
392 assert(dst.width % 8 == 0);
393 int regs_written = 4 * (dst.width / 8) * scale;
394 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
395 dst.type, dst.width);
396 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
397 inst->regs_written = regs_written;
398 instructions.push_tail(inst);
399
400 if (devinfo->gen < 7) {
401 inst->base_mrf = 13;
402 inst->header_size = 1;
403 if (devinfo->gen == 4)
404 inst->mlen = 3;
405 else
406 inst->mlen = 1 + dispatch_width / 8;
407 }
408
409 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
410 instructions.push_tail(MOV(dst, result));
411
412 return instructions;
413 }
414
415 /**
416 * A helper for MOV generation for fixing up broken hardware SEND dependency
417 * handling.
418 */
419 fs_inst *
420 fs_visitor::DEP_RESOLVE_MOV(int grf)
421 {
422 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
423
424 inst->ir = NULL;
425 inst->annotation = "send dependency resolve";
426
427 /* The caller always wants this MOV uncompressed, both to emit the minimal
428 * extra dependencies and to avoid having to deal with aligning its regs to 2.
429 */
430 inst->exec_size = 8;
431
432 return inst;
433 }
434
435 bool
436 fs_inst::equals(fs_inst *inst) const
437 {
438 return (opcode == inst->opcode &&
439 dst.equals(inst->dst) &&
440 src[0].equals(inst->src[0]) &&
441 src[1].equals(inst->src[1]) &&
442 src[2].equals(inst->src[2]) &&
443 saturate == inst->saturate &&
444 predicate == inst->predicate &&
445 conditional_mod == inst->conditional_mod &&
446 mlen == inst->mlen &&
447 base_mrf == inst->base_mrf &&
448 target == inst->target &&
449 eot == inst->eot &&
450 header_size == inst->header_size &&
451 shadow_compare == inst->shadow_compare &&
452 exec_size == inst->exec_size &&
453 offset == inst->offset);
454 }
455
456 bool
457 fs_inst::overwrites_reg(const fs_reg &reg) const
458 {
459 return reg.in_range(dst, regs_written);
460 }
461
462 bool
463 fs_inst::is_send_from_grf() const
464 {
465 switch (opcode) {
466 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
467 case SHADER_OPCODE_SHADER_TIME_ADD:
468 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
469 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
470 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
471 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
472 case SHADER_OPCODE_UNTYPED_ATOMIC:
473 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
474 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
475 case SHADER_OPCODE_TYPED_ATOMIC:
476 case SHADER_OPCODE_TYPED_SURFACE_READ:
477 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
478 case SHADER_OPCODE_URB_WRITE_SIMD8:
479 return true;
480 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
481 return src[1].file == GRF;
482 case FS_OPCODE_FB_WRITE:
483 return src[0].file == GRF;
484 default:
485 if (is_tex())
486 return src[0].file == GRF;
487
488 return false;
489 }
490 }
491
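/* Returns true if this LOAD_PAYLOAD is nothing more than a whole-register
 * copy of a single virtual GRF: the sources must line up register by
 * register starting at src[0], and the copied range must cover the entire
 * allocation of that GRF.
 */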
492 bool
493 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
494 {
495 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
496 return false;
497
498 fs_reg reg = this->src[0];
499 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
500 return false;
501
502 if (grf_alloc.sizes[reg.reg] != this->regs_written)
503 return false;
504
505 for (int i = 0; i < this->sources; i++) {
506 reg.type = this->src[i].type;
507 reg.width = this->src[i].width;
508 if (!this->src[i].equals(reg))
509 return false;
510 reg = ::offset(reg, 1);
511 }
512
513 return true;
514 }
515
516 bool
517 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
518 {
519 if (devinfo->gen == 6 && is_math())
520 return false;
521
522 if (is_send_from_grf())
523 return false;
524
525 if (!backend_instruction::can_do_source_mods())
526 return false;
527
528 return true;
529 }
530
531 bool
532 fs_inst::has_side_effects() const
533 {
534 return this->eot || backend_instruction::has_side_effects();
535 }
536
537 void
538 fs_reg::init()
539 {
540 memset(this, 0, sizeof(*this));
541 stride = 1;
542 }
543
544 /** Generic unset register constructor. */
545 fs_reg::fs_reg()
546 {
547 init();
548 this->file = BAD_FILE;
549 }
550
551 /** Immediate value constructor. */
552 fs_reg::fs_reg(float f)
553 {
554 init();
555 this->file = IMM;
556 this->type = BRW_REGISTER_TYPE_F;
557 this->fixed_hw_reg.dw1.f = f;
558 this->width = 1;
559 }
560
561 /** Immediate value constructor. */
562 fs_reg::fs_reg(int32_t i)
563 {
564 init();
565 this->file = IMM;
566 this->type = BRW_REGISTER_TYPE_D;
567 this->fixed_hw_reg.dw1.d = i;
568 this->width = 1;
569 }
570
571 /** Immediate value constructor. */
572 fs_reg::fs_reg(uint32_t u)
573 {
574 init();
575 this->file = IMM;
576 this->type = BRW_REGISTER_TYPE_UD;
577 this->fixed_hw_reg.dw1.ud = u;
578 this->width = 1;
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf[4])
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
588 }
589
590 /** Vector float immediate value constructor. */
591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
592 {
593 init();
594 this->file = IMM;
595 this->type = BRW_REGISTER_TYPE_VF;
596 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
597 (vf1 << 8) |
598 (vf2 << 16) |
599 (vf3 << 24);
600 }
601
602 /** Fixed brw_reg. */
603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
604 {
605 init();
606 this->file = HW_REG;
607 this->fixed_hw_reg = fixed_hw_reg;
608 this->type = fixed_hw_reg.type;
609 this->width = 1 << fixed_hw_reg.width;
610 }
611
612 bool
613 fs_reg::equals(const fs_reg &r) const
614 {
615 return (file == r.file &&
616 reg == r.reg &&
617 reg_offset == r.reg_offset &&
618 subreg_offset == r.subreg_offset &&
619 type == r.type &&
620 negate == r.negate &&
621 abs == r.abs &&
622 !reladdr && !r.reladdr &&
623 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
624 width == r.width &&
625 stride == r.stride);
626 }
627
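/* Restricts the register to a single component: points subreg_offset at the
 * requested element and sets stride to 0 so that every channel reads the
 * same value.
 */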
628 fs_reg &
629 fs_reg::set_smear(unsigned subreg)
630 {
631 assert(file != HW_REG && file != IMM);
632 subreg_offset = subreg * type_sz(type);
633 stride = 0;
634 return *this;
635 }
636
637 bool
638 fs_reg::is_contiguous() const
639 {
640 return stride == 1;
641 }
642
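/* Returns the size of a GLSL type measured in scalar slots, the unit the
 * visitor uses when allocating virtual GRFs and uniform storage.
 */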
643 int
644 fs_visitor::type_size(const struct glsl_type *type)
645 {
646 unsigned int size, i;
647
648 switch (type->base_type) {
649 case GLSL_TYPE_UINT:
650 case GLSL_TYPE_INT:
651 case GLSL_TYPE_FLOAT:
652 case GLSL_TYPE_BOOL:
653 return type->components();
654 case GLSL_TYPE_ARRAY:
655 return type_size(type->fields.array) * type->length;
656 case GLSL_TYPE_STRUCT:
657 size = 0;
658 for (i = 0; i < type->length; i++) {
659 size += type_size(type->fields.structure[i].type);
660 }
661 return size;
662 case GLSL_TYPE_SAMPLER:
663 /* Samplers take up no register space, since they're baked in at
664 * link time.
665 */
666 return 0;
667 case GLSL_TYPE_ATOMIC_UINT:
668 return 0;
669 case GLSL_TYPE_IMAGE:
670 case GLSL_TYPE_VOID:
671 case GLSL_TYPE_ERROR:
672 case GLSL_TYPE_INTERFACE:
673 case GLSL_TYPE_DOUBLE:
674 unreachable("not reached");
675 }
676
677 return 0;
678 }
679
680 /**
681 * Create a MOV to read the timestamp register.
682 *
683 * The caller is responsible for emitting the MOV. The return value is
684 * the destination of the MOV, with extra parameters set.
685 */
686 fs_reg
687 fs_visitor::get_timestamp(fs_inst **out_mov)
688 {
689 assert(devinfo->gen >= 7);
690
691 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
692 BRW_ARF_TIMESTAMP,
693 0),
694 BRW_REGISTER_TYPE_UD));
695
696 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
697
698 fs_inst *mov = MOV(dst, ts);
699 /* We want to read the 3 fields we care about even if it's not enabled in
700 * the dispatch.
701 */
702 mov->force_writemask_all = true;
703
704 /* The caller wants the low 32 bits of the timestamp. Since it's running
705 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
706 * which is plenty of time for our purposes. It is identical across the
707 * EUs, but since it's tracking GPU core speed it will increment at a
708 * varying rate as render P-states change.
709 *
710 * The caller could also check if render P-states have changed (or anything
711 * else that might disrupt timing) by setting smear to 2 and checking if
712 * that field is != 0.
713 */
714 dst.set_smear(0);
715
716 *out_mov = mov;
717 return dst;
718 }
719
720 void
721 fs_visitor::emit_shader_time_begin()
722 {
723 current_annotation = "shader time start";
724 fs_inst *mov;
725 shader_start_time = get_timestamp(&mov);
726 emit(mov);
727 }
728
729 void
730 fs_visitor::emit_shader_time_end()
731 {
732 current_annotation = "shader time end";
733
734 enum shader_time_shader_type type, written_type, reset_type;
735 switch (stage) {
736 case MESA_SHADER_VERTEX:
737 type = ST_VS;
738 written_type = ST_VS_WRITTEN;
739 reset_type = ST_VS_RESET;
740 break;
741 case MESA_SHADER_GEOMETRY:
742 type = ST_GS;
743 written_type = ST_GS_WRITTEN;
744 reset_type = ST_GS_RESET;
745 break;
746 case MESA_SHADER_FRAGMENT:
747 if (dispatch_width == 8) {
748 type = ST_FS8;
749 written_type = ST_FS8_WRITTEN;
750 reset_type = ST_FS8_RESET;
751 } else {
752 assert(dispatch_width == 16);
753 type = ST_FS16;
754 written_type = ST_FS16_WRITTEN;
755 reset_type = ST_FS16_RESET;
756 }
757 break;
758 case MESA_SHADER_COMPUTE:
759 type = ST_CS;
760 written_type = ST_CS_WRITTEN;
761 reset_type = ST_CS_RESET;
762 break;
763 default:
764 unreachable("fs_visitor::emit_shader_time_end missing code");
765 }
766
767 /* Insert our code just before the final SEND with EOT. */
768 exec_node *end = this->instructions.get_tail();
769 assert(end && ((fs_inst *) end)->eot);
770
771 fs_inst *tm_read;
772 fs_reg shader_end_time = get_timestamp(&tm_read);
773 end->insert_before(tm_read);
774
775 /* Check that there weren't any timestamp reset events (assuming these
776 * were the only two timestamp reads that happened).
777 */
778 fs_reg reset = shader_end_time;
779 reset.set_smear(2);
780 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
781 test->conditional_mod = BRW_CONDITIONAL_Z;
782 test->force_writemask_all = true;
783 end->insert_before(test);
784 end->insert_before(IF(BRW_PREDICATE_NORMAL));
785
786 fs_reg start = shader_start_time;
787 start.negate = true;
788 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
789 diff.set_smear(0);
790 fs_inst *add = ADD(diff, start, shader_end_time);
791 add->force_writemask_all = true;
792 end->insert_before(add);
793
794 /* If there were no instructions between the two timestamp gets, the diff
795 * is 2 cycles. Remove that overhead, so I can forget about that when
796 * trying to determine the time taken for single instructions.
797 */
798 add = ADD(diff, diff, fs_reg(-2u));
799 add->force_writemask_all = true;
800 end->insert_before(add);
801
802 end->insert_before(SHADER_TIME_ADD(type, diff));
803 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
804 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
805 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
806 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
807 }
808
809 fs_inst *
810 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
811 {
812 int shader_time_index =
813 brw_get_shader_time_index(brw, shader_prog, prog, type);
814 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
815
816 fs_reg payload;
817 if (dispatch_width == 8)
818 payload = vgrf(glsl_type::uvec2_type);
819 else
820 payload = vgrf(glsl_type::uint_type);
821
822 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
823 fs_reg(), payload, offset, value);
824 }
825
826 void
827 fs_visitor::vfail(const char *format, va_list va)
828 {
829 char *msg;
830
831 if (failed)
832 return;
833
834 failed = true;
835
836 msg = ralloc_vasprintf(mem_ctx, format, va);
837 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
838
839 this->fail_msg = msg;
840
841 if (debug_enabled) {
842 fprintf(stderr, "%s", msg);
843 }
844 }
845
846 void
847 fs_visitor::fail(const char *format, ...)
848 {
849 va_list va;
850
851 va_start(va, format);
852 vfail(format, va);
853 va_end(va);
854 }
855
856 /**
857 * Mark this program as impossible to compile in SIMD16 mode.
858 *
859 * During the SIMD8 compile (which happens first), we can detect and flag
860 * things that are unsupported in SIMD16 mode, so the compiler can skip
861 * the SIMD16 compile altogether.
862 *
863 * During a SIMD16 compile (if one happens anyway), this just calls fail().
864 */
865 void
866 fs_visitor::no16(const char *format, ...)
867 {
868 va_list va;
869
870 va_start(va, format);
871
872 if (dispatch_width == 16) {
873 vfail(format, va);
874 } else {
875 simd16_unsupported = true;
876
877 if (brw->perf_debug) {
878 if (no16_msg)
879 ralloc_vasprintf_append(&no16_msg, format, va);
880 else
881 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
882 }
883 }
884
885 va_end(va);
886 }
887
888 fs_inst *
889 fs_visitor::emit(enum opcode opcode)
890 {
891 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
892 }
893
894 fs_inst *
895 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
896 {
897 return emit(new(mem_ctx) fs_inst(opcode, dst));
898 }
899
900 fs_inst *
901 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
902 {
903 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
904 }
905
906 fs_inst *
907 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
908 const fs_reg &src1)
909 {
910 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
911 }
912
913 fs_inst *
914 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
915 const fs_reg &src1, const fs_reg &src2)
916 {
917 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
918 }
919
920 fs_inst *
921 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
922 fs_reg src[], int sources)
923 {
924 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
925 }
926
927 /**
928 * Returns true if the instruction has a flag that means it won't
929 * update an entire destination register.
930 *
931 * For example, dead code elimination and live variable analysis want to know
932 * when a write to a variable screens off any preceding values that were in
933 * it.
934 */
935 bool
936 fs_inst::is_partial_write() const
937 {
938 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
939 (this->dst.width * type_sz(this->dst.type)) < 32 ||
940 !this->dst.is_contiguous());
941 }
942
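/* Returns how many hardware registers source 'arg' of this instruction
 * reads. For the send-like opcodes handled below the payload source spans
 * 'mlen' registers; otherwise the count is derived from the source's width,
 * stride and type size.
 */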
943 int
944 fs_inst::regs_read(int arg) const
945 {
946 if (is_tex() && arg == 0 && src[0].file == GRF) {
947 return mlen;
948 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
949 return mlen;
950 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
951 return mlen;
952 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
953 return mlen;
954 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
955 return mlen;
956 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
957 return mlen;
958 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
959 return mlen;
960 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
961 return mlen;
962 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
963 return mlen;
964 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
965 return mlen;
966 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
967 return exec_size / 4;
968 }
969
970 switch (src[arg].file) {
971 case BAD_FILE:
972 case UNIFORM:
973 case IMM:
974 return 1;
975 case GRF:
976 case HW_REG:
977 if (src[arg].stride == 0) {
978 return 1;
979 } else {
980 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
981 return (size + 31) / 32;
982 }
983 case MRF:
984 unreachable("MRF registers are not allowed as sources");
985 default:
986 unreachable("Invalid register file");
987 }
988 }
989
990 bool
991 fs_inst::reads_flag() const
992 {
993 return predicate;
994 }
995
996 bool
997 fs_inst::writes_flag() const
998 {
999 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1000 opcode != BRW_OPCODE_IF &&
1001 opcode != BRW_OPCODE_WHILE)) ||
1002 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
1003 }
1004
1005 /**
1006 * Returns how many MRFs an FS opcode will write over.
1007 *
1008 * Note that this is not the 0 or 1 implied writes in an actual gen
1009 * instruction -- the FS opcodes often generate MOVs in addition.
1010 */
1011 int
1012 fs_visitor::implied_mrf_writes(fs_inst *inst)
1013 {
1014 if (inst->mlen == 0)
1015 return 0;
1016
1017 if (inst->base_mrf == -1)
1018 return 0;
1019
1020 switch (inst->opcode) {
1021 case SHADER_OPCODE_RCP:
1022 case SHADER_OPCODE_RSQ:
1023 case SHADER_OPCODE_SQRT:
1024 case SHADER_OPCODE_EXP2:
1025 case SHADER_OPCODE_LOG2:
1026 case SHADER_OPCODE_SIN:
1027 case SHADER_OPCODE_COS:
1028 return 1 * dispatch_width / 8;
1029 case SHADER_OPCODE_POW:
1030 case SHADER_OPCODE_INT_QUOTIENT:
1031 case SHADER_OPCODE_INT_REMAINDER:
1032 return 2 * dispatch_width / 8;
1033 case SHADER_OPCODE_TEX:
1034 case FS_OPCODE_TXB:
1035 case SHADER_OPCODE_TXD:
1036 case SHADER_OPCODE_TXF:
1037 case SHADER_OPCODE_TXF_CMS:
1038 case SHADER_OPCODE_TXF_MCS:
1039 case SHADER_OPCODE_TG4:
1040 case SHADER_OPCODE_TG4_OFFSET:
1041 case SHADER_OPCODE_TXL:
1042 case SHADER_OPCODE_TXS:
1043 case SHADER_OPCODE_LOD:
1044 return 1;
1045 case FS_OPCODE_FB_WRITE:
1046 return 2;
1047 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1048 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1049 return 1;
1050 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1051 return inst->mlen;
1052 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1053 return inst->mlen;
1054 case SHADER_OPCODE_UNTYPED_ATOMIC:
1055 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1056 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1057 case SHADER_OPCODE_TYPED_ATOMIC:
1058 case SHADER_OPCODE_TYPED_SURFACE_READ:
1059 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1060 case SHADER_OPCODE_URB_WRITE_SIMD8:
1061 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1062 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1063 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1064 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1065 return 0;
1066 default:
1067 unreachable("not reached");
1068 }
1069 }
1070
1071 fs_reg
1072 fs_visitor::vgrf(const glsl_type *const type)
1073 {
1074 int reg_width = dispatch_width / 8;
1075 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1076 brw_type_for_base_type(type), dispatch_width);
1077 }
1078
1079 fs_reg
1080 fs_visitor::vgrf(int num_components)
1081 {
1082 int reg_width = dispatch_width / 8;
1083 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1084 BRW_REGISTER_TYPE_F, dispatch_width);
1085 }
1086
1087 /** Fixed HW reg constructor. */
1088 fs_reg::fs_reg(enum register_file file, int reg)
1089 {
1090 init();
1091 this->file = file;
1092 this->reg = reg;
1093 this->type = BRW_REGISTER_TYPE_F;
1094
1095 switch (file) {
1096 case UNIFORM:
1097 this->width = 1;
1098 break;
1099 default:
1100 this->width = 8;
1101 }
1102 }
1103
1104 /** Fixed HW reg constructor. */
1105 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1106 {
1107 init();
1108 this->file = file;
1109 this->reg = reg;
1110 this->type = type;
1111
1112 switch (file) {
1113 case UNIFORM:
1114 this->width = 1;
1115 break;
1116 default:
1117 this->width = 8;
1118 }
1119 }
1120
1121 /** Fixed HW reg constructor. */
1122 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1123 uint8_t width)
1124 {
1125 init();
1126 this->file = file;
1127 this->reg = reg;
1128 this->type = type;
1129 this->width = width;
1130 }
1131
1132 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1133 * This brings in those uniform definitions.
1134 */
1135 void
1136 fs_visitor::import_uniforms(fs_visitor *v)
1137 {
1138 this->push_constant_loc = v->push_constant_loc;
1139 this->pull_constant_loc = v->pull_constant_loc;
1140 this->uniforms = v->uniforms;
1141 this->param_size = v->param_size;
1142 }
1143
1144 fs_reg *
1145 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1146 bool origin_upper_left)
1147 {
1148 assert(stage == MESA_SHADER_FRAGMENT);
1149 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1150 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1151 fs_reg wpos = *reg;
1152 bool flip = !origin_upper_left ^ key->render_to_fbo;
1153
1154 /* gl_FragCoord.x */
1155 if (pixel_center_integer) {
1156 emit(MOV(wpos, this->pixel_x));
1157 } else {
1158 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1159 }
1160 wpos = offset(wpos, 1);
1161
1162 /* gl_FragCoord.y */
1163 if (!flip && pixel_center_integer) {
1164 emit(MOV(wpos, this->pixel_y));
1165 } else {
1166 fs_reg pixel_y = this->pixel_y;
1167 float offset = (pixel_center_integer ? 0.0 : 0.5);
1168
1169 if (flip) {
1170 pixel_y.negate = true;
1171 offset += key->drawable_height - 1.0;
1172 }
1173
1174 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1175 }
1176 wpos = offset(wpos, 1);
1177
1178 /* gl_FragCoord.z */
1179 if (devinfo->gen >= 6) {
1180 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1181 } else {
1182 emit(FS_OPCODE_LINTERP, wpos,
1183 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1184 interp_reg(VARYING_SLOT_POS, 2));
1185 }
1186 wpos = offset(wpos, 1);
1187
1188 /* gl_FragCoord.w: Already set up in emit_interpolation */
1189 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1190
1191 return reg;
1192 }
1193
1194 fs_inst *
1195 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1196 glsl_interp_qualifier interpolation_mode,
1197 bool is_centroid, bool is_sample)
1198 {
1199 brw_wm_barycentric_interp_mode barycoord_mode;
1200 if (devinfo->gen >= 6) {
1201 if (is_centroid) {
1202 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1203 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1204 else
1205 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1206 } else if (is_sample) {
1207 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1208 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1209 else
1210 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1211 } else {
1212 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1213 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1214 else
1215 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1216 }
1217 } else {
1218 /* On Ironlake and below, there is only one interpolation mode.
1219 * Centroid interpolation doesn't mean anything on this hardware --
1220 * there is no multisampling.
1221 */
1222 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1223 }
1224 return emit(FS_OPCODE_LINTERP, attr,
1225 this->delta_xy[barycoord_mode], interp);
1226 }
1227
1228 void
1229 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1230 const glsl_type *type,
1231 glsl_interp_qualifier interpolation_mode,
1232 int location, bool mod_centroid,
1233 bool mod_sample)
1234 {
1235 attr.type = brw_type_for_base_type(type->get_scalar_type());
1236
1237 assert(stage == MESA_SHADER_FRAGMENT);
1238 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1239 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1240
1241 unsigned int array_elements;
1242
1243 if (type->is_array()) {
1244 array_elements = type->length;
1245 if (array_elements == 0) {
1246 fail("dereferenced array '%s' has length 0\n", name);
1247 }
1248 type = type->fields.array;
1249 } else {
1250 array_elements = 1;
1251 }
1252
1253 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1254 bool is_gl_Color =
1255 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1256 if (key->flat_shade && is_gl_Color) {
1257 interpolation_mode = INTERP_QUALIFIER_FLAT;
1258 } else {
1259 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1260 }
1261 }
1262
1263 for (unsigned int i = 0; i < array_elements; i++) {
1264 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1265 if (prog_data->urb_setup[location] == -1) {
1266 /* If there's no incoming setup data for this slot, don't
1267 * emit interpolation for it.
1268 */
1269 attr = offset(attr, type->vector_elements);
1270 location++;
1271 continue;
1272 }
1273
1274 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1275 /* Constant interpolation (flat shading) case. The SF has
1276 * handed us defined values in only the constant offset
1277 * field of the setup reg.
1278 */
1279 for (unsigned int k = 0; k < type->vector_elements; k++) {
1280 struct brw_reg interp = interp_reg(location, k);
1281 interp = suboffset(interp, 3);
1282 interp.type = attr.type;
1283 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1284 attr = offset(attr, 1);
1285 }
1286 } else {
1287 /* Smooth/noperspective interpolation case. */
1288 for (unsigned int k = 0; k < type->vector_elements; k++) {
1289 struct brw_reg interp = interp_reg(location, k);
1290 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1291 /* Get the pixel/sample mask into f0 so that we know
1292 * which pixels are lit. Then, for each channel that is
1293 * unlit, replace the centroid data with non-centroid
1294 * data.
1295 */
1296 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1297
1298 fs_inst *inst;
1299 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1300 false, false);
1301 inst->predicate = BRW_PREDICATE_NORMAL;
1302 inst->predicate_inverse = true;
1303 if (devinfo->has_pln)
1304 inst->no_dd_clear = true;
1305
1306 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1307 mod_centroid && !key->persample_shading,
1308 mod_sample || key->persample_shading);
1309 inst->predicate = BRW_PREDICATE_NORMAL;
1310 inst->predicate_inverse = false;
1311 if (devinfo->has_pln)
1312 inst->no_dd_check = true;
1313
1314 } else {
1315 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1316 mod_centroid && !key->persample_shading,
1317 mod_sample || key->persample_shading);
1318 }
1319 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1320 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1321 }
1322 attr = offset(attr, 1);
1323 }
1324
1325 }
1326 location++;
1327 }
1328 }
1329 }
1330
1331 fs_reg *
1332 fs_visitor::emit_frontfacing_interpolation()
1333 {
1334 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1335
1336 if (devinfo->gen >= 6) {
1337 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1338 * a boolean result from this (~0/true or 0/false).
1339 *
1340 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1341 * this task in only one instruction:
1342 * - a negation source modifier will flip the bit; and
1343 * - a W -> D type conversion will sign extend the bit into the high
1344 * word of the destination.
1345 *
1346 * An ASR 15 fills the low word of the destination.
1347 */
1348 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1349 g0.negate = true;
1350
1351 emit(ASR(*reg, g0, fs_reg(15)));
1352 } else {
1353 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1354 * a boolean result from this (1/true or 0/false).
1355 *
1356 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1357 * the negation source modifier to flip it. Unfortunately the SHR
1358 * instruction only operates on UD (or D with an abs source modifier)
1359 * sources without negation.
1360 *
1361 * Instead, use ASR (which will give ~0/true or 0/false).
1362 */
1363 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1364 g1_6.negate = true;
1365
1366 emit(ASR(*reg, g1_6, fs_reg(31)));
1367 }
1368
1369 return reg;
1370 }
1371
1372 void
1373 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1374 {
1375 assert(stage == MESA_SHADER_FRAGMENT);
1376 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1377 assert(dst.type == BRW_REGISTER_TYPE_F);
1378
1379 if (key->compute_pos_offset) {
1380 /* Convert int_sample_pos to floating point */
1381 emit(MOV(dst, int_sample_pos));
1382 /* Scale to the range [0, 1] */
1383 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1384 }
1385 else {
1386 /* From ARB_sample_shading specification:
1387 * "When rendering to a non-multisample buffer, or if multisample
1388 * rasterization is disabled, gl_SamplePosition will always be
1389 * (0.5, 0.5)."
1390 */
1391 emit(MOV(dst, fs_reg(0.5f)));
1392 }
1393 }
1394
1395 fs_reg *
1396 fs_visitor::emit_samplepos_setup()
1397 {
1398 assert(devinfo->gen >= 6);
1399
1400 this->current_annotation = "compute sample position";
1401 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1402 fs_reg pos = *reg;
1403 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1404 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1405
1406 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1407 * mode will be enabled.
1408 *
1409 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1410 * R31.1:0 Position Offset X/Y for Slot[3:0]
1411 * R31.3:2 Position Offset X/Y for Slot[7:4]
1412 * .....
1413 *
1414 * The X, Y sample positions come in as bytes in thread payload. So, read
1415 * the positions using vstride=16, width=8, hstride=2.
1416 */
1417 struct brw_reg sample_pos_reg =
1418 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1419 BRW_REGISTER_TYPE_B), 16, 8, 2);
1420
1421 if (dispatch_width == 8) {
1422 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1423 } else {
1424 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1425 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1426 ->force_sechalf = true;
1427 }
1428 /* Compute gl_SamplePosition.x */
1429 compute_sample_position(pos, int_sample_x);
1430 pos = offset(pos, 1);
1431 if (dispatch_width == 8) {
1432 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1433 } else {
1434 emit(MOV(half(int_sample_y, 0),
1435 fs_reg(suboffset(sample_pos_reg, 1))));
1436 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1437 ->force_sechalf = true;
1438 }
1439 /* Compute gl_SamplePosition.y */
1440 compute_sample_position(pos, int_sample_y);
1441 return reg;
1442 }
1443
1444 fs_reg *
1445 fs_visitor::emit_sampleid_setup()
1446 {
1447 assert(stage == MESA_SHADER_FRAGMENT);
1448 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1449 assert(devinfo->gen >= 6);
1450
1451 this->current_annotation = "compute sample id";
1452 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1453
1454 if (key->compute_sample_id) {
1455 fs_reg t1 = vgrf(glsl_type::int_type);
1456 fs_reg t2 = vgrf(glsl_type::int_type);
1457 t2.type = BRW_REGISTER_TYPE_UW;
1458
1459 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1460 * 8x multisampling, subspan 0 will represent sample N (where N
1461 * is 0, 2, 4 or 6), subspan 1 will represent sample N + 1 (1, 3, 5 or
1462 * 7). We can find the value of N by looking at R0.0 bits 7:6
1463 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1464 * (since samples are always delivered in pairs). That is, we
1465 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1466 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1467 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1468 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1469 * populating a temporary variable with the sequence (0, 1, 2, 3),
1470 * and then reading from it using vstride=1, width=4, hstride=0.
1471 * These computations hold good for 4x multisampling as well.
1472 *
1473 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1474 * the first four slots are sample 0 of subspan 0; the next four
1475 * are sample 1 of subspan 0; the third group is sample 0 of
1476 * subspan 1, and finally sample 1 of subspan 1.
1477 */
1478 fs_inst *inst;
1479 inst = emit(BRW_OPCODE_AND, t1,
1480 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1481 fs_reg(0xc0));
1482 inst->force_writemask_all = true;
1483 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1484 inst->force_writemask_all = true;
1485 /* This works for both SIMD8 and SIMD16 */
1486 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1487 inst->force_writemask_all = true;
1488 /* This special instruction takes care of setting vstride=1,
1489 * width=4, hstride=0 of t2 during an ADD instruction.
1490 */
1491 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1492 } else {
1493 /* As per GL_ARB_sample_shading specification:
1494 * "When rendering to a non-multisample buffer, or if multisample
1495 * rasterization is disabled, gl_SampleID will always be zero."
1496 */
1497 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1498 }
1499
1500 return reg;
1501 }
1502
1503 void
1504 fs_visitor::resolve_source_modifiers(fs_reg *src)
1505 {
1506 if (!src->abs && !src->negate)
1507 return;
1508
1509 fs_reg temp = retype(vgrf(1), src->type);
1510 emit(MOV(temp, *src));
1511 *src = temp;
1512 }
1513
1514 fs_reg
1515 fs_visitor::fix_math_operand(fs_reg src)
1516 {
1517 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1518 * might be able to do better by doing execsize = 1 math and then
1519 * expanding that result out, but we would need to be careful with
1520 * masking.
1521 *
1522 * The hardware ignores source modifiers (negate and abs) on math
1523 * instructions, so we also move to a temp to set those up.
1524 */
1525 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1526 !src.abs && !src.negate)
1527 return src;
1528
1529 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1530 * operands to math
1531 */
1532 if (devinfo->gen >= 7 && src.file != IMM)
1533 return src;
1534
1535 fs_reg expanded = vgrf(glsl_type::float_type);
1536 expanded.type = src.type;
1537 emit(BRW_OPCODE_MOV, expanded, src);
1538 return expanded;
1539 }
1540
1541 fs_inst *
1542 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1543 {
1544 switch (opcode) {
1545 case SHADER_OPCODE_RCP:
1546 case SHADER_OPCODE_RSQ:
1547 case SHADER_OPCODE_SQRT:
1548 case SHADER_OPCODE_EXP2:
1549 case SHADER_OPCODE_LOG2:
1550 case SHADER_OPCODE_SIN:
1551 case SHADER_OPCODE_COS:
1552 break;
1553 default:
1554 unreachable("not reached: bad math opcode");
1555 }
1556
1557 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1558 * might be able to do better by doing execsize = 1 math and then
1559 * expanding that result out, but we would need to be careful with
1560 * masking.
1561 *
1562 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1563 * instructions, so we also move to a temp to set those up.
1564 */
1565 if (devinfo->gen == 6 || devinfo->gen == 7)
1566 src = fix_math_operand(src);
1567
1568 fs_inst *inst = emit(opcode, dst, src);
1569
1570 if (devinfo->gen < 6) {
1571 inst->base_mrf = 2;
1572 inst->mlen = dispatch_width / 8;
1573 }
1574
1575 return inst;
1576 }
1577
1578 fs_inst *
1579 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1580 {
1581 int base_mrf = 2;
1582 fs_inst *inst;
1583
1584 if (devinfo->gen >= 8) {
1585 inst = emit(opcode, dst, src0, src1);
1586 } else if (devinfo->gen >= 6) {
1587 src0 = fix_math_operand(src0);
1588 src1 = fix_math_operand(src1);
1589
1590 inst = emit(opcode, dst, src0, src1);
1591 } else {
1592 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1593 * "Message Payload":
1594 *
1595 * "Operand0[7]. For the INT DIV functions, this operand is the
1596 * denominator."
1597 * ...
1598 * "Operand1[7]. For the INT DIV functions, this operand is the
1599 * numerator."
1600 */
1601 bool is_int_div = opcode != SHADER_OPCODE_POW;
1602 fs_reg &op0 = is_int_div ? src1 : src0;
1603 fs_reg &op1 = is_int_div ? src0 : src1;
1604
1605 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1606 inst = emit(opcode, dst, op0, reg_null_f);
1607
1608 inst->base_mrf = base_mrf;
1609 inst->mlen = 2 * dispatch_width / 8;
1610 }
1611 return inst;
1612 }
1613
1614 void
1615 fs_visitor::emit_discard_jump()
1616 {
1617 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1618
1619 /* For performance, after a discard, jump to the end of the
1620 * shader if all relevant channels have been discarded.
1621 */
1622 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1623 discard_jump->flag_subreg = 1;
1624
1625 discard_jump->predicate = (dispatch_width == 8)
1626 ? BRW_PREDICATE_ALIGN1_ANY8H
1627 : BRW_PREDICATE_ALIGN1_ANY16H;
1628 discard_jump->predicate_inverse = true;
1629 }
1630
1631 void
1632 fs_visitor::assign_curb_setup()
1633 {
1634 if (dispatch_width == 8) {
1635 prog_data->dispatch_grf_start_reg = payload.num_regs;
1636 } else {
1637 if (stage == MESA_SHADER_FRAGMENT) {
1638 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1639 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1640 } else if (stage == MESA_SHADER_COMPUTE) {
1641 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1642 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1643 } else {
1644 unreachable("Unsupported shader type!");
1645 }
1646 }
1647
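/* Push constants are uploaded in whole registers (8 dwords each), so the
 * CURBE read length is the parameter count rounded up to a multiple of 8
 * and divided by 8.
 */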
1648 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1649
1650 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1651 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1652 for (unsigned int i = 0; i < inst->sources; i++) {
1653 if (inst->src[i].file == UNIFORM) {
1654 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1655 int constant_nr;
1656 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1657 constant_nr = push_constant_loc[uniform_nr];
1658 } else {
1659 /* Section 5.11 of the OpenGL 4.1 spec says:
1660 * "Out-of-bounds reads return undefined values, which include
1661 * values from other variables of the active program or zero."
1662 * Just return the first push constant.
1663 */
1664 constant_nr = 0;
1665 }
1666
1667 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1668 constant_nr / 8,
1669 constant_nr % 8);
1670
1671 inst->src[i].file = HW_REG;
1672 inst->src[i].fixed_hw_reg = byte_offset(
1673 retype(brw_reg, inst->src[i].type),
1674 inst->src[i].subreg_offset);
1675 }
1676 }
1677 }
1678 }
1679
1680 void
1681 fs_visitor::calculate_urb_setup()
1682 {
1683 assert(stage == MESA_SHADER_FRAGMENT);
1684 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1685 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1686
1687 memset(prog_data->urb_setup, -1,
1688 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1689
1690 int urb_next = 0;
1691 /* Figure out where each of the incoming setup attributes lands. */
1692 if (devinfo->gen >= 6) {
1693 if (_mesa_bitcount_64(prog->InputsRead &
1694 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1695 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1696 * first 16 varying inputs, so we can put them wherever we want.
1697 * Just put them in order.
1698 *
1699 * This is useful because it means that (a) inputs not used by the
1700 * fragment shader won't take up valuable register space, and (b) we
1701 * won't have to recompile the fragment shader if it gets paired with
1702 * a different vertex (or geometry) shader.
1703 */
1704 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1705 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1706 BITFIELD64_BIT(i)) {
1707 prog_data->urb_setup[i] = urb_next++;
1708 }
1709 }
1710 } else {
1711 /* We have enough input varyings that the SF/SBE pipeline stage can't
1712 * arbitrarily rearrange them to suit our whim; we have to put them
1713 * in an order that matches the output of the previous pipeline stage
1714 * (geometry or vertex shader).
1715 */
1716 struct brw_vue_map prev_stage_vue_map;
1717 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1718 key->input_slots_valid);
1719 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1720 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1721 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1722 slot++) {
1723 int varying = prev_stage_vue_map.slot_to_varying[slot];
1724 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1725 * unused.
1726 */
1727 if (varying != BRW_VARYING_SLOT_COUNT &&
1728 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1729 BITFIELD64_BIT(varying))) {
1730 prog_data->urb_setup[varying] = slot - first_slot;
1731 }
1732 }
1733 urb_next = prev_stage_vue_map.num_slots - first_slot;
1734 }
1735 } else {
1736 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1737 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1738 /* Point size is packed into the header, not as a general attribute */
1739 if (i == VARYING_SLOT_PSIZ)
1740 continue;
1741
1742 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1743 /* The back color slot is skipped when the front color is
1744 * also written to. In addition, some slots can be
1745 * written in the vertex shader and not read in the
1746 * fragment shader. So the register number must always be
1747 * incremented, mapped or not.
1748 */
1749 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1750 prog_data->urb_setup[i] = urb_next;
1751 urb_next++;
1752 }
1753 }
1754
1755 /*
1756 * It's an FS-only attribute, and we did interpolation for this attribute
1757 * in the SF thread. So count it here, too.
1758 *
1759 * See compile_sf_prog() for more info.
1760 */
1761 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1762 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1763 }
1764
1765 prog_data->num_varying_inputs = urb_next;
1766 }
1767
1768 void
1769 fs_visitor::assign_urb_setup()
1770 {
1771 assert(stage == MESA_SHADER_FRAGMENT);
1772 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1773
1774 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1775
1776 /* Offset all the urb_setup[] indices by the actual position of the
1777 * setup regs, now that the location of the constants has been chosen.
1778 */
1779 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1780 if (inst->opcode == FS_OPCODE_LINTERP) {
1781 assert(inst->src[1].file == HW_REG);
1782 inst->src[1].fixed_hw_reg.nr += urb_start;
1783 }
1784
1785 if (inst->opcode == FS_OPCODE_CINTERP) {
1786 assert(inst->src[0].file == HW_REG);
1787 inst->src[0].fixed_hw_reg.nr += urb_start;
1788 }
1789 }
1790
1791 /* Each attribute is 4 setup channels, each of which is half a reg. */
1792 this->first_non_payload_grf =
1793 urb_start + prog_data->num_varying_inputs * 2;
1794 }
1795
1796 void
1797 fs_visitor::assign_vs_urb_setup()
1798 {
1799 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1800 int grf, count, slot, channel, attr;
1801
1802 assert(stage == MESA_SHADER_VERTEX);
1803 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1804 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1805 count++;
1806
1807 /* Each attribute is 4 regs. */
1808 this->first_non_payload_grf =
1809 payload.num_regs + prog_data->curb_read_length + count * 4;
1810
1811 unsigned vue_entries =
1812 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1813
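/* The URB entry size is allocated in 4-slot (64 byte) units and the VS URB
 * read length counts pairs of slots (32 bytes), hence the rounding and
 * divides below.
 */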
1814 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1815 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1816
1817 assert(vs_prog_data->base.urb_read_length <= 15);
1818
1819 /* Rewrite all ATTR file references to the hw grf that they land in. */
1820 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1821 for (int i = 0; i < inst->sources; i++) {
1822 if (inst->src[i].file == ATTR) {
1823
1824 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1825 slot = count - 1;
1826 } else {
1827 * Attributes come in as a contiguous block, ordered by their
1828 * gl_vert_attrib value. That means we can compute the slot
1829 * number for an attribute by masking out the enabled
1830 * attributes before it and counting the bits.
1831 */
1832 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1833 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1834 BITFIELD64_MASK(attr));
1835 }
1836
1837 channel = inst->src[i].reg_offset & 3;
1838
1839 grf = payload.num_regs +
1840 prog_data->curb_read_length +
1841 slot * 4 + channel;
1842
1843 inst->src[i].file = HW_REG;
1844 inst->src[i].fixed_hw_reg =
1845 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1846 }
1847 }
1848 }
1849 }
1850
1851 /**
1852 * Split large virtual GRFs into separate components if we can.
1853 *
1854 * This is mostly duplicated with what brw_fs_vector_splitting does,
1855 * but that's really conservative because it's afraid of doing
1856 * splitting that doesn't result in real progress after the rest of
1857 * the optimization phases, which would cause infinite looping in
1858 * optimization. We can do it once here, safely. This also has the
1859 * opportunity to split interpolated values, or maybe even uniforms,
1860 * which we don't have at the IR level.
1861 *
1862 * We want to split, because virtual GRFs are what we register
1863 * allocate and spill (due to contiguousness requirements for some
1864 * instructions), and they're what we naturally generate in the
1865 * codegen process, but most virtual GRFs don't actually need to be
1866 * contiguous sets of GRFs. If we split, we'll end up with reduced
1867 * live intervals and better dead code elimination and coalescing.
1868 */
1869 void
1870 fs_visitor::split_virtual_grfs()
1871 {
1872 int num_vars = this->alloc.count;
1873
1874 /* Count the total number of registers */
1875 int reg_count = 0;
1876 int vgrf_to_reg[num_vars];
1877 for (int i = 0; i < num_vars; i++) {
1878 vgrf_to_reg[i] = reg_count;
1879 reg_count += alloc.sizes[i];
1880 }
1881
1882 /* An array of "split points". For each register slot, this indicates
1883 * if this slot can be separated from the previous slot. Every time an
1884 * instruction uses multiple elements of a register (as a source or
1885 * destination), we mark the used slots as inseparable. Then we go
1886 * through and split the registers into the smallest pieces we can.
1887 */
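/* Illustrative example: a size-4 VGRF whose registers are only ever accessed
 * one at a time keeps split points at slots 1, 2 and 3 and is broken into
 * four size-1 VGRFs.  If some instruction reads or writes two registers
 * starting at slot 2, the split point at slot 3 is cleared, so slots 2 and 3
 * stay together and the result is pieces of sizes 1, 1 and 2.
 */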
1888 bool split_points[reg_count];
1889 memset(split_points, 0, sizeof(split_points));
1890
1891 /* Mark all used registers as fully splittable */
1892 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1893 if (inst->dst.file == GRF) {
1894 int reg = vgrf_to_reg[inst->dst.reg];
1895 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1896 split_points[reg + j] = true;
1897 }
1898
1899 for (int i = 0; i < inst->sources; i++) {
1900 if (inst->src[i].file == GRF) {
1901 int reg = vgrf_to_reg[inst->src[i].reg];
1902 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1903 split_points[reg + j] = true;
1904 }
1905 }
1906 }
1907
1908 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1909 if (inst->dst.file == GRF) {
1910 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1911 for (int j = 1; j < inst->regs_written; j++)
1912 split_points[reg + j] = false;
1913 }
1914 for (int i = 0; i < inst->sources; i++) {
1915 if (inst->src[i].file == GRF) {
1916 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1917 for (int j = 1; j < inst->regs_read(i); j++)
1918 split_points[reg + j] = false;
1919 }
1920 }
1921 }
1922
1923 int new_virtual_grf[reg_count];
1924 int new_reg_offset[reg_count];
1925
1926 int reg = 0;
1927 for (int i = 0; i < num_vars; i++) {
1928 /* The first one should always be 0 as a quick sanity check. */
1929 assert(split_points[reg] == false);
1930
1931 /* j = 0 case */
1932 new_reg_offset[reg] = 0;
1933 reg++;
1934 int offset = 1;
1935
1936 /* j > 0 case */
1937 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1938 /* If this is a split point, reset the offset to 0 and allocate a
1939 * new virtual GRF for the preceding `offset` registers.
1940 */
1941 if (split_points[reg]) {
1942 assert(offset <= MAX_VGRF_SIZE);
1943 int grf = alloc.allocate(offset);
1944 for (int k = reg - offset; k < reg; k++)
1945 new_virtual_grf[k] = grf;
1946 offset = 0;
1947 }
1948 new_reg_offset[reg] = offset;
1949 offset++;
1950 reg++;
1951 }
1952
1953 /* The last one gets the original register number */
1954 assert(offset <= MAX_VGRF_SIZE);
1955 alloc.sizes[i] = offset;
1956 for (int k = reg - offset; k < reg; k++)
1957 new_virtual_grf[k] = i;
1958 }
1959 assert(reg == reg_count);
1960
1961 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1962 if (inst->dst.file == GRF) {
1963 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1964 inst->dst.reg = new_virtual_grf[reg];
1965 inst->dst.reg_offset = new_reg_offset[reg];
1966 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1967 }
1968 for (int i = 0; i < inst->sources; i++) {
1969 if (inst->src[i].file == GRF) {
1970 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1971 inst->src[i].reg = new_virtual_grf[reg];
1972 inst->src[i].reg_offset = new_reg_offset[reg];
1973 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1974 }
1975 }
1976 }
1977 invalidate_live_intervals();
1978 }
1979
1980 /**
1981 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1982 *
1983 * During code generation, we create tons of temporary variables, many of
1984 * which get immediately killed and are never used again. Yet, in later
1985 * optimization and analysis passes, such as compute_live_intervals, we need
1986 * to loop over all the virtual GRFs. Compacting them can save a lot of
1987 * overhead.
1988 */
1989 bool
1990 fs_visitor::compact_virtual_grfs()
1991 {
1992 bool progress = false;
1993 int remap_table[this->alloc.count];
1994 memset(remap_table, -1, sizeof(remap_table));
1995
1996 /* Mark which virtual GRFs are used. */
1997 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1998 if (inst->dst.file == GRF)
1999 remap_table[inst->dst.reg] = 0;
2000
2001 for (int i = 0; i < inst->sources; i++) {
2002 if (inst->src[i].file == GRF)
2003 remap_table[inst->src[i].reg] = 0;
2004 }
2005 }
2006
2007 /* Compact the GRF arrays. */
2008 int new_index = 0;
2009 for (unsigned i = 0; i < this->alloc.count; i++) {
2010 if (remap_table[i] == -1) {
2011 /* We just found an unused register. This means that we are
2012 * actually going to compact something.
2013 */
2014 progress = true;
2015 } else {
2016 remap_table[i] = new_index;
2017 alloc.sizes[new_index] = alloc.sizes[i];
2018 invalidate_live_intervals();
2019 ++new_index;
2020 }
2021 }
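/* Illustrative example: if VGRFs 0, 2 and 3 are used but VGRF 1 is dead,
 * remap_table ends up as { 0, -1, 1, 2 } and alloc.count shrinks from 4 to
 * 3; the loop below then renumbers every GRF reference through remap_table.
 */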
2022
2023 this->alloc.count = new_index;
2024
2025 /* Patch all the instructions to use the newly renumbered registers */
2026 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2027 if (inst->dst.file == GRF)
2028 inst->dst.reg = remap_table[inst->dst.reg];
2029
2030 for (int i = 0; i < inst->sources; i++) {
2031 if (inst->src[i].file == GRF)
2032 inst->src[i].reg = remap_table[inst->src[i].reg];
2033 }
2034 }
2035
2036 /* Patch all the references to delta_xy, since they're used in register
2037 * allocation. If they're unused, switch them to BAD_FILE so we don't
2038 * think some random VGRF is delta_xy.
2039 */
2040 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2041 if (delta_xy[i].file == GRF) {
2042 if (remap_table[delta_xy[i].reg] != -1) {
2043 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2044 } else {
2045 delta_xy[i].file = BAD_FILE;
2046 }
2047 }
2048 }
2049
2050 return progress;
2051 }
2052
2053 /**
2054 * Implements array access of uniforms by inserting a
2055 * PULL_CONSTANT_LOAD instruction.
2056 *
2057 * Unlike temporary GRF array access (where we don't support it due to
2058 * the difficulty of doing relative addressing on instruction
2059 * destinations), we could potentially do array access of uniforms
2060 * that were loaded in GRF space as push constants. In real-world
2061 * usage we've seen, though, the arrays being used are always larger
2062 * than we could load as push constants, so just always move all
2063 * uniform array access out to a pull constant buffer.
2064 */
2065 void
2066 fs_visitor::move_uniform_array_access_to_pull_constants()
2067 {
2068 if (dispatch_width != 8)
2069 return;
2070
2071 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2072 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2073
2074 /* Walk through and find array access of uniforms. Put a copy of that
2075 * uniform in the pull constant buffer.
2076 *
2077 * Note that we don't move constant-indexed accesses to arrays. No
2078 * testing has been done of the performance impact of this choice.
2079 */
2080 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2081 for (int i = 0 ; i < inst->sources; i++) {
2082 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2083 continue;
2084
2085 int uniform = inst->src[i].reg;
2086
2087 /* If this array isn't already present in the pull constant buffer,
2088 * add it.
2089 */
2090 if (pull_constant_loc[uniform] == -1) {
2091 const gl_constant_value **values = &stage_prog_data->param[uniform];
2092
2093 assert(param_size[uniform]);
2094
2095 for (int j = 0; j < param_size[uniform]; j++) {
2096 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2097
2098 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2099 values[j];
2100 }
2101 }
2102 }
2103 }
2104 }
2105
2106 /**
2107 * Assign UNIFORM file registers to either push constants or pull constants.
2108 *
2109 * We allow a fragment shader to have more uniform components than the
2110 * GL-specified minimum value of the maximum (64).  If there are too many
2111 * of them, they would fill up all of register space, so this pass pushes
2112 * some of them out to the pull constant buffer and updates the program to
2113 * load them from there.
2114 */
2115 void
2116 fs_visitor::assign_constant_locations()
2117 {
2118 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2119 if (dispatch_width != 8)
2120 return;
2121
2122 /* Find which UNIFORM registers are still in use. */
2123 bool is_live[uniforms];
2124 for (unsigned int i = 0; i < uniforms; i++) {
2125 is_live[i] = false;
2126 }
2127
2128 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2129 for (int i = 0; i < inst->sources; i++) {
2130 if (inst->src[i].file != UNIFORM)
2131 continue;
2132
2133 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2134 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2135 is_live[constant_nr] = true;
2136 }
2137 }
2138
2139 /* Only allow 16 registers (128 uniform components) as push constants.
2140 *
2141 * Just demote the end of the list. We could probably do better
2142 * here, demoting things that are rarely used in the program first.
2143 *
2144 * If changing this value, note the limitation about total_regs in
2145 * brw_curbe.c.
2146 */
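/* Illustrative example: a shader with 200 live uniform components that have
 * not already been pulled keeps the first 128 of them as push constants and
 * demotes the remaining 72 to the pull constant buffer below.
 */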
2147 unsigned int max_push_components = 16 * 8;
2148 unsigned int num_push_constants = 0;
2149
2150 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2151
2152 for (unsigned int i = 0; i < uniforms; i++) {
2153 if (!is_live[i] || pull_constant_loc[i] != -1) {
2154 /* This UNIFORM register is either dead, or has already been demoted
2155 * to a pull const. Mark it as no longer living in the param[] array.
2156 */
2157 push_constant_loc[i] = -1;
2158 continue;
2159 }
2160
2161 if (num_push_constants < max_push_components) {
2162 /* Retain as a push constant. Record the location in the params[]
2163 * array.
2164 */
2165 push_constant_loc[i] = num_push_constants++;
2166 } else {
2167 /* Demote to a pull constant. */
2168 push_constant_loc[i] = -1;
2169
2170 int pull_index = stage_prog_data->nr_pull_params++;
2171 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2172 pull_constant_loc[i] = pull_index;
2173 }
2174 }
2175
2176 stage_prog_data->nr_params = num_push_constants;
2177
2178 /* Up until now, the param[] array has been indexed by reg + reg_offset
2179 * of UNIFORM registers. Condense it to only contain the uniforms we
2180 * chose to upload as push constants.
2181 */
2182 for (unsigned int i = 0; i < uniforms; i++) {
2183 int remapped = push_constant_loc[i];
2184
2185 if (remapped == -1)
2186 continue;
2187
2188 assert(remapped <= (int)i);
2189 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2190 }
2191 }
2192
2193 /**
2194 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2195 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2196 */
2197 void
2198 fs_visitor::demote_pull_constants()
2199 {
2200 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2201 for (int i = 0; i < inst->sources; i++) {
2202 if (inst->src[i].file != UNIFORM)
2203 continue;
2204
2205 int pull_index;
2206 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2207 if (location >= uniforms) /* Out of bounds access */
2208 pull_index = -1;
2209 else
2210 pull_index = pull_constant_loc[location];
2211
2212 if (pull_index == -1)
2213 continue;
2214
2215 /* Set up the annotation tracking for newly generated instructions. */
2216 base_ir = inst->ir;
2217 current_annotation = inst->annotation;
2218
2219 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2220 fs_reg dst = vgrf(glsl_type::float_type);
2221
2222 /* Generate a pull load into dst. */
2223 if (inst->src[i].reladdr) {
2224 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2225 surf_index,
2226 *inst->src[i].reladdr,
2227 pull_index);
2228 inst->insert_before(block, &list);
2229 inst->src[i].reladdr = NULL;
2230 } else {
2231 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2232 fs_inst *pull =
2233 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2234 dst, surf_index, offset);
2235 inst->insert_before(block, pull);
2236 inst->src[i].set_smear(pull_index & 3);
2237 }
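/* Illustrative example for the non-reladdr case above: pull_index == 6 gives
 * a byte offset of 24 & ~15 == 16 (the containing vec4) and a smear of
 * 6 & 3 == 2, selecting the third dword of the loaded register.
 */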
2238
2239 /* Rewrite the instruction to use the temporary VGRF. */
2240 inst->src[i].file = GRF;
2241 inst->src[i].reg = dst.reg;
2242 inst->src[i].reg_offset = 0;
2243 inst->src[i].width = dispatch_width;
2244 }
2245 }
2246 invalidate_live_intervals();
2247 }
2248
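/**
 * Perform simple algebraic simplifications on individual instructions, for
 * example (illustrative pseudo-IR):
 *
 *    mul dst, a, 1.0F     ->  mov dst, a
 *    mul dst, a, 0.0F     ->  mov dst, 0.0F
 *    add dst, a, 0.0F     ->  mov dst, a
 *    sel dst, a, a        ->  mov dst, a
 *
 * along with a handful of CMP/LRP/MAD/BROADCAST special cases below.
 */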
2249 bool
2250 fs_visitor::opt_algebraic()
2251 {
2252 bool progress = false;
2253
2254 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2255 switch (inst->opcode) {
2256 case BRW_OPCODE_MOV:
2257 if (inst->src[0].file != IMM)
2258 break;
2259
2260 if (inst->saturate) {
2261 if (inst->dst.type != inst->src[0].type)
2262 assert(!"unimplemented: saturate mixed types");
2263
2264 if (brw_saturate_immediate(inst->dst.type,
2265 &inst->src[0].fixed_hw_reg)) {
2266 inst->saturate = false;
2267 progress = true;
2268 }
2269 }
2270 break;
2271
2272 case BRW_OPCODE_MUL:
2273 if (inst->src[1].file != IMM)
2274 continue;
2275
2276 /* a * 1.0 = a */
2277 if (inst->src[1].is_one()) {
2278 inst->opcode = BRW_OPCODE_MOV;
2279 inst->src[1] = reg_undef;
2280 progress = true;
2281 break;
2282 }
2283
2284 /* a * -1.0 = -a */
2285 if (inst->src[1].is_negative_one()) {
2286 inst->opcode = BRW_OPCODE_MOV;
2287 inst->src[0].negate = !inst->src[0].negate;
2288 inst->src[1] = reg_undef;
2289 progress = true;
2290 break;
2291 }
2292
2293 /* a * 0.0 = 0.0 */
2294 if (inst->src[1].is_zero()) {
2295 inst->opcode = BRW_OPCODE_MOV;
2296 inst->src[0] = inst->src[1];
2297 inst->src[1] = reg_undef;
2298 progress = true;
2299 break;
2300 }
2301
2302 if (inst->src[0].file == IMM) {
2303 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2304 inst->opcode = BRW_OPCODE_MOV;
2305 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2306 inst->src[1] = reg_undef;
2307 progress = true;
2308 break;
2309 }
2310 break;
2311 case BRW_OPCODE_ADD:
2312 if (inst->src[1].file != IMM)
2313 continue;
2314
2315 /* a + 0.0 = a */
2316 if (inst->src[1].is_zero()) {
2317 inst->opcode = BRW_OPCODE_MOV;
2318 inst->src[1] = reg_undef;
2319 progress = true;
2320 break;
2321 }
2322
2323 if (inst->src[0].file == IMM) {
2324 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2325 inst->opcode = BRW_OPCODE_MOV;
2326 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2327 inst->src[1] = reg_undef;
2328 progress = true;
2329 break;
2330 }
2331 break;
2332 case BRW_OPCODE_OR:
2333 if (inst->src[0].equals(inst->src[1])) {
2334 inst->opcode = BRW_OPCODE_MOV;
2335 inst->src[1] = reg_undef;
2336 progress = true;
2337 break;
2338 }
2339 break;
2340 case BRW_OPCODE_LRP:
2341 if (inst->src[1].equals(inst->src[2])) {
2342 inst->opcode = BRW_OPCODE_MOV;
2343 inst->src[0] = inst->src[1];
2344 inst->src[1] = reg_undef;
2345 inst->src[2] = reg_undef;
2346 progress = true;
2347 break;
2348 }
2349 break;
2350 case BRW_OPCODE_CMP:
2351 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2352 inst->src[0].abs &&
2353 inst->src[0].negate &&
2354 inst->src[1].is_zero()) {
2355 inst->src[0].abs = false;
2356 inst->src[0].negate = false;
2357 inst->conditional_mod = BRW_CONDITIONAL_Z;
2358 progress = true;
2359 break;
2360 }
2361 break;
2362 case BRW_OPCODE_SEL:
2363 if (inst->src[0].equals(inst->src[1])) {
2364 inst->opcode = BRW_OPCODE_MOV;
2365 inst->src[1] = reg_undef;
2366 inst->predicate = BRW_PREDICATE_NONE;
2367 inst->predicate_inverse = false;
2368 progress = true;
2369 } else if (inst->saturate && inst->src[1].file == IMM) {
2370 switch (inst->conditional_mod) {
2371 case BRW_CONDITIONAL_LE:
2372 case BRW_CONDITIONAL_L:
2373 switch (inst->src[1].type) {
2374 case BRW_REGISTER_TYPE_F:
2375 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2376 inst->opcode = BRW_OPCODE_MOV;
2377 inst->src[1] = reg_undef;
2378 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2379 progress = true;
2380 }
2381 break;
2382 default:
2383 break;
2384 }
2385 break;
2386 case BRW_CONDITIONAL_GE:
2387 case BRW_CONDITIONAL_G:
2388 switch (inst->src[1].type) {
2389 case BRW_REGISTER_TYPE_F:
2390 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2391 inst->opcode = BRW_OPCODE_MOV;
2392 inst->src[1] = reg_undef;
2393 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2394 progress = true;
2395 }
2396 break;
2397 default:
2398 break;
2399 }
2400 default:
2401 break;
2402 }
2403 }
2404 break;
2405 case BRW_OPCODE_MAD:
2406 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2407 inst->opcode = BRW_OPCODE_MOV;
2408 inst->src[1] = reg_undef;
2409 inst->src[2] = reg_undef;
2410 progress = true;
2411 } else if (inst->src[0].is_zero()) {
2412 inst->opcode = BRW_OPCODE_MUL;
2413 inst->src[0] = inst->src[2];
2414 inst->src[2] = reg_undef;
2415 progress = true;
2416 } else if (inst->src[1].is_one()) {
2417 inst->opcode = BRW_OPCODE_ADD;
2418 inst->src[1] = inst->src[2];
2419 inst->src[2] = reg_undef;
2420 progress = true;
2421 } else if (inst->src[2].is_one()) {
2422 inst->opcode = BRW_OPCODE_ADD;
2423 inst->src[2] = reg_undef;
2424 progress = true;
2425 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2426 inst->opcode = BRW_OPCODE_ADD;
2427 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2428 inst->src[2] = reg_undef;
2429 progress = true;
2430 }
2431 break;
2432 case SHADER_OPCODE_RCP: {
2433 fs_inst *prev = (fs_inst *)inst->prev;
2434 if (prev->opcode == SHADER_OPCODE_SQRT) {
2435 if (inst->src[0].equals(prev->dst)) {
2436 inst->opcode = SHADER_OPCODE_RSQ;
2437 inst->src[0] = prev->src[0];
2438 progress = true;
2439 }
2440 }
2441 break;
2442 }
2443 case SHADER_OPCODE_BROADCAST:
2444 if (is_uniform(inst->src[0])) {
2445 inst->opcode = BRW_OPCODE_MOV;
2446 inst->sources = 1;
2447 inst->force_writemask_all = true;
2448 progress = true;
2449 } else if (inst->src[1].file == IMM) {
2450 inst->opcode = BRW_OPCODE_MOV;
2451 inst->src[0] = component(inst->src[0],
2452 inst->src[1].fixed_hw_reg.dw1.ud);
2453 inst->sources = 1;
2454 inst->force_writemask_all = true;
2455 progress = true;
2456 }
2457 break;
2458
2459 default:
2460 break;
2461 }
2462
2463 /* If the simplification left an immediate in src[0] of a commutative operation, swap it into src[1]; immediates are only allowed as the second source. */
2464 if (progress && inst->is_commutative()) {
2465 if (inst->src[0].file == IMM) {
2466 fs_reg tmp = inst->src[1];
2467 inst->src[1] = inst->src[0];
2468 inst->src[0] = tmp;
2469 }
2470 }
2471 }
2472 return progress;
2473 }
2474
2475 /**
2476 * Optimize sample messages that have constant zero values for the trailing
2477 * texture coordinates. We can just reduce the message length for these
2478 * instructions instead of reserving a register for it. Trailing parameters
2479 * that aren't sent default to zero anyway. This will cause the dead code
2480 * eliminator to remove the MOV instruction that would otherwise be emitted to
2481 * set up the zero value.
2482 */
2483 bool
2484 fs_visitor::opt_zero_samples()
2485 {
2486 /* Gen4 infers the texturing opcode based on the message length so we can't
2487 * change it.
2488 */
2489 if (devinfo->gen < 5)
2490 return false;
2491
2492 bool progress = false;
2493
2494 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2495 if (!inst->is_tex())
2496 continue;
2497
2498 fs_inst *load_payload = (fs_inst *) inst->prev;
2499
2500 if (load_payload->is_head_sentinel() ||
2501 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2502 continue;
2503
2504 /* We don't want to remove the message header or the first parameter.
2505 * Removing the first parameter is not allowed, see the Haswell PRM
2506 * volume 7, page 149:
2507 *
2508 * "Parameter 0 is required except for the sampleinfo message, which
2509 * has no parameter 0"
2510 */
2511 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2512 load_payload->src[(inst->mlen - inst->header_size) /
2513 (dispatch_width / 8) +
2514 inst->header_size - 1].is_zero()) {
2515 inst->mlen -= dispatch_width / 8;
2516 progress = true;
2517 }
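/* Illustrative example: in SIMD8 each parameter occupies one register, so a
 * sample message with header_size == 1 and mlen == 5 whose last two
 * parameters are constant zero (and the ones before them are not) is trimmed
 * to mlen == 3; the header and parameter 0 are always kept.
 */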
2518 }
2519
2520 if (progress)
2521 invalidate_live_intervals();
2522
2523 return progress;
2524 }
2525
2526 /**
2527 * Optimize sample messages which are followed by the final RT write.
2528 *
2529 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2530 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2531 * final texturing results copied to the framebuffer write payload and modify
2532 * them to write to the framebuffer directly.
2533 */
2534 bool
2535 fs_visitor::opt_sampler_eot()
2536 {
2537 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2538
2539 if (stage != MESA_SHADER_FRAGMENT)
2540 return false;
2541
2542 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2543 return false;
2544
2545 /* FINISHME: It should be possible to implement this optimization when there
2546 * are multiple drawbuffers.
2547 */
2548 if (key->nr_color_regions != 1)
2549 return false;
2550
2551 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2552 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2553 assert(fb_write->eot);
2554 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2555
2556 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2557
2558 /* There wasn't one; nothing to do. */
2559 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2560 return false;
2561
2562 /* This optimization doesn't seem to work for textureGather for some
2563 * reason. I can't find any documentation or known workarounds to indicate
2564 * that this is expected, but considering that it is probably pretty
2565 * unlikely that a shader would directly write out the results from
2566 * textureGather we might as well just disable it.
2567 */
2568 if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2569 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2570 return false;
2571
2572 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2573 * It's very likely to be the previous instruction.
2574 */
2575 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2576 if (load_payload->is_head_sentinel() ||
2577 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2578 return false;
2579
2580 assert(!tex_inst->eot); /* We can't get here twice */
2581 assert((tex_inst->offset & (0xff << 24)) == 0);
2582
2583 tex_inst->offset |= fb_write->target << 24;
2584 tex_inst->eot = true;
2585 tex_inst->dst = reg_null_ud;
2586 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2587
2588 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2589 * to create a new LOAD_PAYLOAD command with the same sources and a space
2590 * saved for the header. Using a new destination register not only makes sure
2591 * we have enough space, but it will make sure the dead code eliminator kills
2592 * the instruction that this will replace.
2593 */
2594 if (tex_inst->header_size != 0)
2595 return true;
2596
2597 fs_reg send_header = vgrf(load_payload->sources + 1);
2598 fs_reg *new_sources =
2599 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2600
2601 new_sources[0] = fs_reg();
2602 for (int i = 0; i < load_payload->sources; i++)
2603 new_sources[i+1] = load_payload->src[i];
2604
2605 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2606 * requires a lot of information about the sources to figure out how many
2607 * registers need to be used.  Given this stage in our
2608 * optimization, we may not have the appropriate GRFs required by
2609 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2610 * manually emit the instruction.
2611 */
2612 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2613 load_payload->exec_size,
2614 send_header,
2615 new_sources,
2616 load_payload->sources + 1);
2617
2618 new_load_payload->regs_written = load_payload->regs_written + 1;
2619 new_load_payload->header_size = 1;
2620 tex_inst->mlen++;
2621 tex_inst->header_size = 1;
2622 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2623 tex_inst->src[0] = send_header;
2624
2625 return true;
2626 }
2627
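/**
 * Rename each complete, top-level (outside of control flow) redefinition of a
 * virtual GRF to a freshly allocated VGRF and rewrite its subsequent uses to
 * match, so that unrelated reuses of the same register number get independent
 * live ranges.
 */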
2628 bool
2629 fs_visitor::opt_register_renaming()
2630 {
2631 bool progress = false;
2632 int depth = 0;
2633
2634 int remap[alloc.count];
2635 memset(remap, -1, sizeof(int) * alloc.count);
2636
2637 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2638 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2639 depth++;
2640 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2641 inst->opcode == BRW_OPCODE_WHILE) {
2642 depth--;
2643 }
2644
2645 /* Rewrite instruction sources. */
2646 for (int i = 0; i < inst->sources; i++) {
2647 if (inst->src[i].file == GRF &&
2648 remap[inst->src[i].reg] != -1 &&
2649 remap[inst->src[i].reg] != inst->src[i].reg) {
2650 inst->src[i].reg = remap[inst->src[i].reg];
2651 progress = true;
2652 }
2653 }
2654
2655 const int dst = inst->dst.reg;
2656
2657 if (depth == 0 &&
2658 inst->dst.file == GRF &&
2659 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2660 !inst->is_partial_write()) {
2661 if (remap[dst] == -1) {
2662 remap[dst] = dst;
2663 } else {
2664 remap[dst] = alloc.allocate(inst->dst.width / 8);
2665 inst->dst.reg = remap[dst];
2666 progress = true;
2667 }
2668 } else if (inst->dst.file == GRF &&
2669 remap[dst] != -1 &&
2670 remap[dst] != dst) {
2671 inst->dst.reg = remap[dst];
2672 progress = true;
2673 }
2674 }
2675
2676 if (progress) {
2677 invalidate_live_intervals();
2678
2679 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2680 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2681 delta_xy[i].reg = remap[delta_xy[i].reg];
2682 }
2683 }
2684 }
2685
2686 return progress;
2687 }
2688
2689 /**
2690 * Remove redundant or useless discard jumps.
2691 *
2692 * For example, we can eliminate jumps in the following sequence:
2693 *
2694 * discard-jump (redundant with the next jump)
2695 * discard-jump (useless; jumps to the next instruction)
2696 * placeholder-halt
2697 */
2698 bool
2699 fs_visitor::opt_redundant_discard_jumps()
2700 {
2701 bool progress = false;
2702
2703 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2704
2705 fs_inst *placeholder_halt = NULL;
2706 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2707 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2708 placeholder_halt = inst;
2709 break;
2710 }
2711 }
2712
2713 if (!placeholder_halt)
2714 return false;
2715
2716 /* Delete any HALTs immediately before the placeholder halt. */
2717 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2718 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2719 prev = (fs_inst *) placeholder_halt->prev) {
2720 prev->remove(last_bblock);
2721 progress = true;
2722 }
2723
2724 if (progress)
2725 invalidate_live_intervals();
2726
2727 return progress;
2728 }
2729
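/**
 * Rewrite MOVs from a GRF into an MRF so that the instruction which computed
 * the GRF value writes the MRF directly, e.g. (illustrative):
 *
 *    add vgrf4, vgrf2, vgrf3        add m2, vgrf2, vgrf3
 *    mov m2, vgrf4             ->    (the MOV is removed)
 *
 * provided nothing reads the GRF afterwards.  Only relevant on Gen4-6, where
 * message payloads live in MRFs.
 */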
2730 bool
2731 fs_visitor::compute_to_mrf()
2732 {
2733 bool progress = false;
2734 int next_ip = 0;
2735
2736 /* No MRFs on Gen >= 7. */
2737 if (devinfo->gen >= 7)
2738 return false;
2739
2740 calculate_live_intervals();
2741
2742 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2743 int ip = next_ip;
2744 next_ip++;
2745
2746 if (inst->opcode != BRW_OPCODE_MOV ||
2747 inst->is_partial_write() ||
2748 inst->dst.file != MRF || inst->src[0].file != GRF ||
2749 inst->dst.type != inst->src[0].type ||
2750 inst->src[0].abs || inst->src[0].negate ||
2751 !inst->src[0].is_contiguous() ||
2752 inst->src[0].subreg_offset)
2753 continue;
2754
2755 /* Work out which hardware MRF registers are written by this
2756 * instruction.
2757 */
2758 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2759 int mrf_high;
2760 if (inst->dst.reg & BRW_MRF_COMPR4) {
2761 mrf_high = mrf_low + 4;
2762 } else if (inst->exec_size == 16) {
2763 mrf_high = mrf_low + 1;
2764 } else {
2765 mrf_high = mrf_low;
2766 }
2767
2768 /* Can't compute-to-MRF this GRF if someone else was going to
2769 * read it later.
2770 */
2771 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2772 continue;
2773
2774 /* Found a move of a GRF to a MRF. Let's see if we can go
2775 * rewrite the thing that made this GRF to write into the MRF.
2776 */
2777 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2778 if (scan_inst->dst.file == GRF &&
2779 scan_inst->dst.reg == inst->src[0].reg) {
2780 /* Found the last thing to write our reg we want to turn
2781 * into a compute-to-MRF.
2782 */
2783
2784 /* If this one instruction didn't populate all the
2785 * channels, bail. We might be able to rewrite everything
2786 * that writes that reg, but it would require smarter
2787 * tracking to delay the rewriting until complete success.
2788 */
2789 if (scan_inst->is_partial_write())
2790 break;
2791
2792 /* Instructions that write more than one register would require us to
2793 * understand coalescing out more than one MOV at a time.
2794 */
2795 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2796 break;
2797
2798 /* SEND instructions can't have MRF as a destination. */
2799 if (scan_inst->mlen)
2800 break;
2801
2802 if (devinfo->gen == 6) {
2803 /* gen6 math instructions must have the destination be
2804 * GRF, so no compute-to-MRF for them.
2805 */
2806 if (scan_inst->is_math()) {
2807 break;
2808 }
2809 }
2810
2811 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2812 /* Found the creator of our MRF's source value. */
2813 scan_inst->dst.file = MRF;
2814 scan_inst->dst.reg = inst->dst.reg;
2815 scan_inst->saturate |= inst->saturate;
2816 inst->remove(block);
2817 progress = true;
2818 }
2819 break;
2820 }
2821
2822 /* We don't handle control flow here. Most computation of
2823 * values that end up in MRFs happens shortly before the MRF
2824 * write anyway.
2825 */
2826 if (block->start() == scan_inst)
2827 break;
2828
2829 /* You can't read from an MRF, so if someone else reads our
2830 * MRF's source GRF that we wanted to rewrite, that stops us.
2831 */
2832 bool interfered = false;
2833 for (int i = 0; i < scan_inst->sources; i++) {
2834 if (scan_inst->src[i].file == GRF &&
2835 scan_inst->src[i].reg == inst->src[0].reg &&
2836 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2837 interfered = true;
2838 }
2839 }
2840 if (interfered)
2841 break;
2842
2843 if (scan_inst->dst.file == MRF) {
2844 /* If somebody else writes our MRF here, we can't
2845 * compute-to-MRF before that.
2846 */
2847 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2848 int scan_mrf_high;
2849
2850 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2851 scan_mrf_high = scan_mrf_low + 4;
2852 } else if (scan_inst->exec_size == 16) {
2853 scan_mrf_high = scan_mrf_low + 1;
2854 } else {
2855 scan_mrf_high = scan_mrf_low;
2856 }
2857
2858 if (mrf_low == scan_mrf_low ||
2859 mrf_low == scan_mrf_high ||
2860 mrf_high == scan_mrf_low ||
2861 mrf_high == scan_mrf_high) {
2862 break;
2863 }
2864 }
2865
2866 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2867 /* Found a SEND instruction, which means that there are
2868 * live values in MRFs from base_mrf to base_mrf +
2869 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2870 * above it.
2871 */
2872 if (mrf_low >= scan_inst->base_mrf &&
2873 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2874 break;
2875 }
2876 if (mrf_high >= scan_inst->base_mrf &&
2877 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2878 break;
2879 }
2880 }
2881 }
2882 }
2883
2884 if (progress)
2885 invalidate_live_intervals();
2886
2887 return progress;
2888 }
2889
2890 /**
2891 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2892 * flow. We could probably do better here with some form of divergence
2893 * analysis.
2894 */
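/* Outside of control flow (and before any discard), the pass below assumes
 * all dispatched channels are still enabled, so FIND_LIVE_CHANNEL trivially
 * resolves to channel 0 and becomes an immediate move (illustrative):
 *
 *    find_live_channel dst   ->   mov dst, 0d
 */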
2895 bool
2896 fs_visitor::eliminate_find_live_channel()
2897 {
2898 bool progress = false;
2899 unsigned depth = 0;
2900
2901 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2902 switch (inst->opcode) {
2903 case BRW_OPCODE_IF:
2904 case BRW_OPCODE_DO:
2905 depth++;
2906 break;
2907
2908 case BRW_OPCODE_ENDIF:
2909 case BRW_OPCODE_WHILE:
2910 depth--;
2911 break;
2912
2913 case FS_OPCODE_DISCARD_JUMP:
2914 /* This can potentially make control flow non-uniform until the end
2915 * of the program.
2916 */
2917 return progress;
2918
2919 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2920 if (depth == 0) {
2921 inst->opcode = BRW_OPCODE_MOV;
2922 inst->src[0] = fs_reg(0);
2923 inst->sources = 1;
2924 inst->force_writemask_all = true;
2925 progress = true;
2926 }
2927 break;
2928
2929 default:
2930 break;
2931 }
2932 }
2933
2934 return progress;
2935 }
2936
2937 /**
2938 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2939 * instructions to FS_OPCODE_REP_FB_WRITE.
2940 */
2941 void
2942 fs_visitor::emit_repclear_shader()
2943 {
2944 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2945 int base_mrf = 1;
2946 int color_mrf = base_mrf + 2;
2947
2948 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2949 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2950 mov->force_writemask_all = true;
2951
2952 fs_inst *write;
2953 if (key->nr_color_regions == 1) {
2954 write = emit(FS_OPCODE_REP_FB_WRITE);
2955 write->saturate = key->clamp_fragment_color;
2956 write->base_mrf = color_mrf;
2957 write->target = 0;
2958 write->header_size = 0;
2959 write->mlen = 1;
2960 } else {
2961 assume(key->nr_color_regions > 0);
2962 for (int i = 0; i < key->nr_color_regions; ++i) {
2963 write = emit(FS_OPCODE_REP_FB_WRITE);
2964 write->saturate = key->clamp_fragment_color;
2965 write->base_mrf = base_mrf;
2966 write->target = i;
2967 write->header_size = 2;
2968 write->mlen = 3;
2969 }
2970 }
2971 write->eot = true;
2972
2973 calculate_cfg();
2974
2975 assign_constant_locations();
2976 assign_curb_setup();
2977
2978 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2979 assert(mov->src[0].file == HW_REG);
2980 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2981 }
2982
2983 /**
2984 * Walks through basic blocks, looking for repeated MRF writes and
2985 * removing the later ones.
2986 */
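/* Illustrative example: given two identical "mov m3, vgrf7" instructions with
 * no intervening write to m3 or vgrf7 and no control flow between them, the
 * second MOV is removed.
 */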
2987 bool
2988 fs_visitor::remove_duplicate_mrf_writes()
2989 {
2990 fs_inst *last_mrf_move[16];
2991 bool progress = false;
2992
2993 /* Bail on SIMD16: this pass would need to update the MRF tracking for compressed instructions. */
2994 if (dispatch_width == 16)
2995 return false;
2996
2997 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2998
2999 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3000 if (inst->is_control_flow()) {
3001 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3002 }
3003
3004 if (inst->opcode == BRW_OPCODE_MOV &&
3005 inst->dst.file == MRF) {
3006 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3007 if (prev_inst && inst->equals(prev_inst)) {
3008 inst->remove(block);
3009 progress = true;
3010 continue;
3011 }
3012 }
3013
3014 /* Clear out the last-write records for MRFs that were overwritten. */
3015 if (inst->dst.file == MRF) {
3016 last_mrf_move[inst->dst.reg] = NULL;
3017 }
3018
3019 if (inst->mlen > 0 && inst->base_mrf != -1) {
3020 /* Found a SEND instruction, which will include two or fewer
3021 * implied MRF writes. We could do better here.
3022 */
3023 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3024 last_mrf_move[inst->base_mrf + i] = NULL;
3025 }
3026 }
3027
3028 /* Clear out any MRF move records whose sources got overwritten. */
3029 if (inst->dst.file == GRF) {
3030 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3031 if (last_mrf_move[i] &&
3032 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3033 last_mrf_move[i] = NULL;
3034 }
3035 }
3036 }
3037
3038 if (inst->opcode == BRW_OPCODE_MOV &&
3039 inst->dst.file == MRF &&
3040 inst->src[0].file == GRF &&
3041 !inst->is_partial_write()) {
3042 last_mrf_move[inst->dst.reg] = inst;
3043 }
3044 }
3045
3046 if (progress)
3047 invalidate_live_intervals();
3048
3049 return progress;
3050 }
3051
3052 static void
3053 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3054 {
3055 /* Clear the flag for registers that actually got read (as expected). */
3056 for (int i = 0; i < inst->sources; i++) {
3057 int grf;
3058 if (inst->src[i].file == GRF) {
3059 grf = inst->src[i].reg;
3060 } else if (inst->src[i].file == HW_REG &&
3061 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3062 grf = inst->src[i].fixed_hw_reg.nr;
3063 } else {
3064 continue;
3065 }
3066
3067 if (grf >= first_grf &&
3068 grf < first_grf + grf_len) {
3069 deps[grf - first_grf] = false;
3070 if (inst->exec_size == 16)
3071 deps[grf - first_grf + 1] = false;
3072 }
3073 }
3074 }
3075
3076 /**
3077 * Implements this workaround for the original 965:
3078 *
3079 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3080 * check for post destination dependencies on this instruction, software
3081 * must ensure that there is no destination hazard for the case of ‘write
3082 * followed by a posted write’ shown in the following example.
3083 *
3084 * 1. mov r3 0
3085 * 2. send r3.xy <rest of send instruction>
3086 * 3. mov r2 r3
3087 *
3088 * Due to no post-destination dependency check on the ‘send’, the above
3089 * code sequence could have two instructions (1 and 2) in flight at the
3090 * same time that both consider ‘r3’ as the target of their final writes.
3091 */
3092 void
3093 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3094 fs_inst *inst)
3095 {
3096 int write_len = inst->regs_written;
3097 int first_write_grf = inst->dst.reg;
3098 bool needs_dep[BRW_MAX_MRF];
3099 assert(write_len < (int)sizeof(needs_dep) - 1);
3100
3101 memset(needs_dep, false, sizeof(needs_dep));
3102 memset(needs_dep, true, write_len);
3103
3104 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3105
3106 /* Walk backwards looking for writes to registers we're writing which
3107 * aren't read since being written. If we hit the start of the program,
3108 * we assume that there are no outstanding dependencies on entry to the
3109 * program.
3110 */
3111 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3112 /* If we hit control flow, assume that there *are* outstanding
3113 * dependencies, and force their cleanup before our instruction.
3114 */
3115 if (block->start() == scan_inst) {
3116 for (int i = 0; i < write_len; i++) {
3117 if (needs_dep[i]) {
3118 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3119 }
3120 }
3121 return;
3122 }
3123
3124 /* We insert our reads as late as possible on the assumption that any
3125 * instruction but a MOV that might have left us an outstanding
3126 * dependency has more latency than a MOV.
3127 */
3128 if (scan_inst->dst.file == GRF) {
3129 for (int i = 0; i < scan_inst->regs_written; i++) {
3130 int reg = scan_inst->dst.reg + i;
3131
3132 if (reg >= first_write_grf &&
3133 reg < first_write_grf + write_len &&
3134 needs_dep[reg - first_write_grf]) {
3135 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3136 needs_dep[reg - first_write_grf] = false;
3137 if (scan_inst->exec_size == 16)
3138 needs_dep[reg - first_write_grf + 1] = false;
3139 }
3140 }
3141 }
3142
3143 /* Clear the flag for registers that actually got read (as expected). */
3144 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3145
3146 /* Continue the loop only if we haven't resolved all the dependencies */
3147 int i;
3148 for (i = 0; i < write_len; i++) {
3149 if (needs_dep[i])
3150 break;
3151 }
3152 if (i == write_len)
3153 return;
3154 }
3155 }
3156
3157 /**
3158 * Implements this workaround for the original 965:
3159 *
3160 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3161 * used as a destination register until after it has been sourced by an
3162 * instruction with a different destination register."
3163 */
3164 void
3165 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3166 {
3167 int write_len = inst->regs_written;
3168 int first_write_grf = inst->dst.reg;
3169 bool needs_dep[BRW_MAX_MRF];
3170 assert(write_len < (int)sizeof(needs_dep) - 1);
3171
3172 memset(needs_dep, false, sizeof(needs_dep));
3173 memset(needs_dep, true, write_len);
3174 /* Walk forwards looking for writes to registers we're writing which aren't
3175 * read before being written.
3176 */
3177 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3178 /* If we hit control flow, force resolve all remaining dependencies. */
3179 if (block->end() == scan_inst) {
3180 for (int i = 0; i < write_len; i++) {
3181 if (needs_dep[i])
3182 scan_inst->insert_before(block,
3183 DEP_RESOLVE_MOV(first_write_grf + i));
3184 }
3185 return;
3186 }
3187
3188 /* Clear the flag for registers that actually got read (as expected). */
3189 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3190
3191 /* We insert our reads as late as possible since they're reading the
3192 * result of a SEND, which has massive latency.
3193 */
3194 if (scan_inst->dst.file == GRF &&
3195 scan_inst->dst.reg >= first_write_grf &&
3196 scan_inst->dst.reg < first_write_grf + write_len &&
3197 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3198 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3199 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3200 }
3201
3202 /* Continue the loop only if we haven't resolved all the dependencies */
3203 int i;
3204 for (i = 0; i < write_len; i++) {
3205 if (needs_dep[i])
3206 break;
3207 }
3208 if (i == write_len)
3209 return;
3210 }
3211 }
3212
3213 void
3214 fs_visitor::insert_gen4_send_dependency_workarounds()
3215 {
3216 if (devinfo->gen != 4 || devinfo->is_g4x)
3217 return;
3218
3219 bool progress = false;
3220
3221 /* Note that we're done with register allocation, so GRF fs_regs always
3222 * have a .reg_offset of 0.
3223 */
3224
3225 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3226 if (inst->mlen != 0 && inst->dst.file == GRF) {
3227 insert_gen4_pre_send_dependency_workarounds(block, inst);
3228 insert_gen4_post_send_dependency_workarounds(block, inst);
3229 progress = true;
3230 }
3231 }
3232
3233 if (progress)
3234 invalidate_live_intervals();
3235 }
3236
3237 /**
3238 * Turns the generic expression-style uniform pull constant load instruction
3239 * into a hardware-specific series of instructions for loading a pull
3240 * constant.
3241 *
3242 * The expression style allows the CSE pass before this to optimize out
3243 * repeated loads from the same offset, and gives the pre-register-allocation
3244 * scheduling full flexibility, while the conversion to native instructions
3245 * allows the post-register-allocation scheduler the best information
3246 * possible.
3247 *
3248 * Note that execution masking for setting up pull constant loads is special:
3249 * the channels that need to be written are unrelated to the current execution
3250 * mask, since a later instruction will use one of the result channels as a
3251 * source operand for all 8 or 16 of its channels.
3252 */
3253 void
3254 fs_visitor::lower_uniform_pull_constant_loads()
3255 {
3256 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3257 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3258 continue;
3259
3260 if (devinfo->gen >= 7) {
3261 /* The offset arg before was a vec4-aligned byte offset. We need to
3262 * turn it into a dword offset.
3263 */
3264 fs_reg const_offset_reg = inst->src[1];
3265 assert(const_offset_reg.file == IMM &&
3266 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3267 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3268 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3269
3270 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3271 * Reserve space for the register.
3272 */
3273 if (devinfo->gen >= 9) {
3274 payload.reg_offset++;
3275 alloc.sizes[payload.reg] = 2;
3276 }
3277
3278 /* This is actually going to be a MOV, but since only the first dword
3279 * is accessed, we have a special opcode to do just that one. Note
3280 * that this needs to be an operation that will be considered a def
3281 * by live variable analysis, or register allocation will explode.
3282 */
3283 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3284 8, payload, const_offset_reg);
3285 setup->force_writemask_all = true;
3286
3287 setup->ir = inst->ir;
3288 setup->annotation = inst->annotation;
3289 inst->insert_before(block, setup);
3290
3291 /* Similarly, this will only populate the first 4 channels of the
3292 * result register (since we only use smear values from 0-3), but we
3293 * don't tell the optimizer.
3294 */
3295 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3296 inst->src[1] = payload;
3297
3298 invalidate_live_intervals();
3299 } else {
3300 /* Before register allocation, we didn't tell the scheduler about the
3301 * MRF we use. We know it's safe to use this MRF because nothing
3302 * else does except for register spill/unspill, which generates and
3303 * uses its MRF within a single IR instruction.
3304 */
3305 inst->base_mrf = 14;
3306 inst->mlen = 1;
3307 }
3308 }
3309 }
3310
3311 bool
3312 fs_visitor::lower_load_payload()
3313 {
3314 bool progress = false;
3315
3316 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3317 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3318 continue;
3319
3320 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3321 assert(inst->saturate == false);
3322
3323 fs_reg dst = inst->dst;
3324
3325 /* Get rid of COMPR4. We'll add it back in if we need it */
3326 if (dst.file == MRF)
3327 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3328
3329 dst.width = 8;
3330 for (uint8_t i = 0; i < inst->header_size; i++) {
3331 if (inst->src[i].file != BAD_FILE) {
3332 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3333 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3334 mov_src.width = 8;
3335 fs_inst *mov = MOV(mov_dst, mov_src);
3336 mov->force_writemask_all = true;
3337 inst->insert_before(block, mov);
3338 }
3339 dst = offset(dst, 1);
3340 }
3341
3342 dst.width = inst->exec_size;
3343 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3344 inst->exec_size > 8) {
3345 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3346 * a straightforward copy. Instead, the result of the
3347 * LOAD_PAYLOAD is treated as interleaved and the first four
3348 * non-header sources are unpacked as:
3349 *
3350 * m + 0: r0
3351 * m + 1: g0
3352 * m + 2: b0
3353 * m + 3: a0
3354 * m + 4: r1
3355 * m + 5: g1
3356 * m + 6: b1
3357 * m + 7: a1
3358 *
3359 * This is used for gen <= 5 fb writes.
3360 */
3361 assert(inst->exec_size == 16);
3362 assert(inst->header_size + 4 <= inst->sources);
3363 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3364 if (inst->src[i].file != BAD_FILE) {
3365 if (devinfo->has_compr4) {
3366 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3367 compr4_dst.reg |= BRW_MRF_COMPR4;
3368
3369 fs_inst *mov = MOV(compr4_dst, inst->src[i]);
3370 mov->force_writemask_all = inst->force_writemask_all;
3371 inst->insert_before(block, mov);
3372 } else {
3373 /* Platform doesn't have COMPR4. We have to fake it */
3374 fs_reg mov_dst = retype(dst, inst->src[i].type);
3375 mov_dst.width = 8;
3376
3377 fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
3378 mov->force_writemask_all = inst->force_writemask_all;
3379 inst->insert_before(block, mov);
3380
3381 mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3382 mov->force_writemask_all = inst->force_writemask_all;
3383 mov->force_sechalf = true;
3384 inst->insert_before(block, mov);
3385 }
3386 }
3387
3388 dst.reg++;
3389 }
3390
3391 /* The loop above only ever incremented us through the first set
3392 * of 4 registers. However, thanks to the magic of COMPR4, we
3393 * actually wrote to the first 8 registers, so we need to take
3394 * that into account now.
3395 */
3396 dst.reg += 4;
3397
3398 /* The COMPR4 code took care of the first 4 sources. We'll let
3399 * the regular path handle any remaining sources. Yes, we are
3400 * modifying the instruction but we're about to delete it so
3401 * this really doesn't hurt anything.
3402 */
3403 inst->header_size += 4;
3404 }
3405
3406 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3407 if (inst->src[i].file != BAD_FILE) {
3408 fs_inst *mov = MOV(retype(dst, inst->src[i].type),
3409 inst->src[i]);
3410 mov->force_writemask_all = inst->force_writemask_all;
3411 mov->force_sechalf = inst->force_sechalf;
3412 inst->insert_before(block, mov);
3413 }
3414 dst = offset(dst, 1);
3415 }
3416
3417 inst->remove(block);
3418 progress = true;
3419 }
3420
3421 if (progress)
3422 invalidate_live_intervals();
3423
3424 return progress;
3425 }
3426
3427 bool
3428 fs_visitor::lower_integer_multiplication()
3429 {
3430 bool progress = false;
3431
3432 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3433 * directly, but Cherryview cannot.
3434 */
3435 if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3436 return false;
3437
3438 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3439 if (inst->opcode != BRW_OPCODE_MUL ||
3440 inst->dst.is_accumulator() ||
3441 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3442 inst->dst.type != BRW_REGISTER_TYPE_UD))
3443 continue;
3444
3445 #define insert(instr) inst->insert_before(block, instr)
3446
3447 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3448 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3449 * src1 are used.
3450 *
3451 * If multiplying by an immediate value that fits in 16-bits, do a
3452 * single MUL instruction with that value in the proper location.
3453 */
3454 if (inst->src[1].file == IMM &&
3455 inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3456 if (devinfo->gen < 7) {
3457 fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3458 inst->dst.type, dispatch_width);
3459 insert(MOV(imm, inst->src[1]));
3460 insert(MUL(inst->dst, imm, inst->src[0]));
3461 } else {
3462 insert(MUL(inst->dst, inst->src[0], inst->src[1]));
3463 }
3464 } else {
3465 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3466 * do 32-bit integer multiplication in one instruction, but instead
3467 * must do a sequence (which actually calculates a 64-bit result):
3468 *
3469 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3470 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3471 * mov(8) g2<1>D acc0<8,8,1>D
3472 *
3473 * But on Gen > 6, the ability to use the second accumulator register
3474 * (acc1) for non-float data types was removed, preventing a simple
3475 * implementation in SIMD16. A 16-channel result can be calculated by
3476 * executing the three instructions twice in SIMD8, once with quarter
3477 * control of 1Q for the first eight channels and again with 2Q for
3478 * the second eight channels.
3479 *
3480 * Which accumulator register is implicitly accessed (by AccWrEnable
3481 * for instance) is determined by the quarter control. Unfortunately
3482 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3483 * implicit accumulator access by an instruction with 2Q will access
3484 * acc1 regardless of whether the data type is usable in acc1.
3485 *
3486 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3487 * integer data types.
3488 *
3489 * Since we only want the low 32-bits of the result, we can do two
3490 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3491 * adjust the high result and add them (like the mach is doing):
3492 *
3493 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3494 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3495 * shl(8) g9<1>D g8<8,8,1>D 16D
3496 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3497 *
3498 * We avoid the shl instruction by realizing that we only want to add
3499 * the low 16-bits of the "high" result to the high 16-bits of the
3500 * "low" result and using proper regioning on the add:
3501 *
3502 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3503 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3504 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3505 *
3506 * Since it does not use the (single) accumulator register, we can
3507 * schedule multi-component multiplications much better.
3508 */
3509
3510 if (inst->conditional_mod && inst->dst.is_null()) {
3511 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3512 inst->dst.type, dispatch_width);
3513 }
3514 fs_reg low = inst->dst;
3515 fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3516 inst->dst.type, dispatch_width);
3517
3518 if (devinfo->gen >= 7) {
3519 fs_reg src1_0_w = inst->src[1];
3520 fs_reg src1_1_w = inst->src[1];
3521
3522 if (inst->src[1].file == IMM) {
3523 src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3524 src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3525 } else {
3526 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3527 src1_0_w.stride = 2;
3528
3529 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3530 src1_1_w.stride = 2;
3531 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3532 }
3533 insert(MUL(low, inst->src[0], src1_0_w));
3534 insert(MUL(high, inst->src[0], src1_1_w));
3535 } else {
3536 fs_reg src0_0_w = inst->src[0];
3537 fs_reg src0_1_w = inst->src[0];
3538
3539 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3540 src0_0_w.stride = 2;
3541
3542 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3543 src0_1_w.stride = 2;
3544 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3545
3546 insert(MUL(low, src0_0_w, inst->src[1]));
3547 insert(MUL(high, src0_1_w, inst->src[1]));
3548 }
3549
3550 fs_reg dst = inst->dst;
3551 dst.type = BRW_REGISTER_TYPE_UW;
3552 dst.subreg_offset = 2;
3553 dst.stride = 2;
3554
3555 high.type = BRW_REGISTER_TYPE_UW;
3556 high.stride = 2;
3557
3558 low.type = BRW_REGISTER_TYPE_UW;
3559 low.subreg_offset = 2;
3560 low.stride = 2;
3561
3562 insert(ADD(dst, low, high));
3563
3564 if (inst->conditional_mod) {
3565 fs_reg null(retype(brw_null_reg(), inst->dst.type));
3566 fs_inst *mov = MOV(null, inst->dst);
3567 mov->conditional_mod = inst->conditional_mod;
3568 insert(mov);
3569 }
3570 }
3571 #undef insert
3572
3573 inst->remove(block);
3574 progress = true;
3575 }
3576
3577 if (progress)
3578 invalidate_live_intervals();
3579
3580 return progress;
3581 }
3582
3583 void
3584 fs_visitor::dump_instructions()
3585 {
3586 dump_instructions(NULL);
3587 }
3588
3589 void
3590 fs_visitor::dump_instructions(const char *name)
3591 {
3592 FILE *file = stderr;
3593 if (name && geteuid() != 0) {
3594 file = fopen(name, "w");
3595 if (!file)
3596 file = stderr;
3597 }
3598
3599 if (cfg) {
3600 calculate_register_pressure();
3601 int ip = 0, max_pressure = 0;
3602 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3603 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3604 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3605 dump_instruction(inst, file);
3606 ip++;
3607 }
3608 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3609 } else {
3610 int ip = 0;
3611 foreach_in_list(backend_instruction, inst, &instructions) {
3612 fprintf(file, "%4d: ", ip++);
3613 dump_instruction(inst, file);
3614 }
3615 }
3616
3617 if (file != stderr) {
3618 fclose(file);
3619 }
3620 }
3621
3622 void
3623 fs_visitor::dump_instruction(backend_instruction *be_inst)
3624 {
3625 dump_instruction(be_inst, stderr);
3626 }
3627
3628 void
3629 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3630 {
3631 fs_inst *inst = (fs_inst *)be_inst;
3632
3633 if (inst->predicate) {
3634 fprintf(file, "(%cf0.%d) ",
3635 inst->predicate_inverse ? '-' : '+',
3636 inst->flag_subreg);
3637 }
3638
3639 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3640 if (inst->saturate)
3641 fprintf(file, ".sat");
3642 if (inst->conditional_mod) {
3643 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3644 if (!inst->predicate &&
3645 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3646 inst->opcode != BRW_OPCODE_IF &&
3647 inst->opcode != BRW_OPCODE_WHILE))) {
3648 fprintf(file, ".f0.%d", inst->flag_subreg);
3649 }
3650 }
3651 fprintf(file, "(%d) ", inst->exec_size);
3652
3653 if (inst->mlen) {
3654 fprintf(file, "(mlen: %d) ", inst->mlen);
3655 }
3656
3657 switch (inst->dst.file) {
3658 case GRF:
3659 fprintf(file, "vgrf%d", inst->dst.reg);
3660 if (inst->dst.width != dispatch_width)
3661 fprintf(file, "@%d", inst->dst.width);
3662 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3663 inst->dst.subreg_offset)
3664 fprintf(file, "+%d.%d",
3665 inst->dst.reg_offset, inst->dst.subreg_offset);
3666 break;
3667 case MRF:
3668 fprintf(file, "m%d", inst->dst.reg);
3669 break;
3670 case BAD_FILE:
3671 fprintf(file, "(null)");
3672 break;
3673 case UNIFORM:
3674 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3675 break;
3676 case ATTR:
3677 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3678 break;
3679 case HW_REG:
3680 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3681 switch (inst->dst.fixed_hw_reg.nr) {
3682 case BRW_ARF_NULL:
3683 fprintf(file, "null");
3684 break;
3685 case BRW_ARF_ADDRESS:
3686 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3687 break;
3688 case BRW_ARF_ACCUMULATOR:
3689 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3690 break;
3691 case BRW_ARF_FLAG:
3692 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3693 inst->dst.fixed_hw_reg.subnr);
3694 break;
3695 default:
3696 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3697 inst->dst.fixed_hw_reg.subnr);
3698 break;
3699 }
3700 } else {
3701 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3702 }
3703 if (inst->dst.fixed_hw_reg.subnr)
3704 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3705 break;
3706 default:
3707 fprintf(file, "???");
3708 break;
3709 }
3710 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3711
3712 for (int i = 0; i < inst->sources; i++) {
3713 if (inst->src[i].negate)
3714 fprintf(file, "-");
3715 if (inst->src[i].abs)
3716 fprintf(file, "|");
3717 switch (inst->src[i].file) {
3718 case GRF:
3719 fprintf(file, "vgrf%d", inst->src[i].reg);
3720 if (inst->src[i].width != dispatch_width)
3721 fprintf(file, "@%d", inst->src[i].width);
3722 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3723 inst->src[i].subreg_offset)
3724 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3725 inst->src[i].subreg_offset);
3726 break;
3727 case MRF:
3728 fprintf(file, "***m%d***", inst->src[i].reg);
3729 break;
3730 case ATTR:
3731 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3732 break;
3733 case UNIFORM:
3734 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3735 if (inst->src[i].reladdr) {
3736 fprintf(file, "+reladdr");
3737 } else if (inst->src[i].subreg_offset) {
3738 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3739 inst->src[i].subreg_offset);
3740 }
3741 break;
3742 case BAD_FILE:
3743 fprintf(file, "(null)");
3744 break;
3745 case IMM:
3746 switch (inst->src[i].type) {
3747 case BRW_REGISTER_TYPE_F:
3748 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3749 break;
3750 case BRW_REGISTER_TYPE_W:
3751 case BRW_REGISTER_TYPE_D:
3752 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3753 break;
3754 case BRW_REGISTER_TYPE_UW:
3755 case BRW_REGISTER_TYPE_UD:
3756 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3757 break;
3758 case BRW_REGISTER_TYPE_VF:
3759 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3760 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3761 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3762 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3763 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3764 break;
3765 default:
3766 fprintf(file, "???");
3767 break;
3768 }
3769 break;
3770 case HW_REG:
3771 if (inst->src[i].fixed_hw_reg.negate)
3772 fprintf(file, "-");
3773 if (inst->src[i].fixed_hw_reg.abs)
3774 fprintf(file, "|");
3775 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3776 switch (inst->src[i].fixed_hw_reg.nr) {
3777 case BRW_ARF_NULL:
3778 fprintf(file, "null");
3779 break;
3780 case BRW_ARF_ADDRESS:
3781 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3782 break;
3783 case BRW_ARF_ACCUMULATOR:
3784 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3785 break;
3786 case BRW_ARF_FLAG:
3787 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3788 inst->src[i].fixed_hw_reg.subnr);
3789 break;
3790 default:
3791 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3792 inst->src[i].fixed_hw_reg.subnr);
3793 break;
3794 }
3795 } else {
3796 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3797 }
3798 if (inst->src[i].fixed_hw_reg.subnr)
3799 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3800 if (inst->src[i].fixed_hw_reg.abs)
3801 fprintf(file, "|");
3802 break;
3803 default:
3804 fprintf(file, "???");
3805 break;
3806 }
3807 if (inst->src[i].abs)
3808 fprintf(file, "|");
3809
3810 if (inst->src[i].file != IMM) {
3811 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3812 }
3813
3814 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3815 fprintf(file, ", ");
3816 }
3817
3818 fprintf(file, " ");
3819
3820 if (dispatch_width == 16 && inst->exec_size == 8) {
3821 if (inst->force_sechalf)
3822 fprintf(file, "2ndhalf ");
3823 else
3824 fprintf(file, "1sthalf ");
3825 }
3826
3827 fprintf(file, "\n");
3828 }
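
/* As a rough illustration (not verbatim driver output), a predicated SIMD16
 * add would be printed by dump_instruction() along the lines of:
 *
 *    (+f0.0) add(16) vgrf7:F, vgrf3:F, u1:F
 *
 * i.e. predicate, opcode (plus ".sat"/conditional modifier if present),
 * execution size, then the destination and each source followed by a type
 * letter.
 */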
3829
3830 /**
3831 * Possibly returns an instruction that set up @param reg.
3832 *
3833 * Sometimes we want to take the result of some expression/variable
3834 * dereference tree and rewrite the instruction generating the result
3835 * of the tree. When processing the tree, we know that the
3836 * instructions generated are all writing temporaries that are dead
3837 * outside of this tree. So, if we have some instructions that write
3838 * a temporary, we're free to point that temp write somewhere else.
3839 *
3840 * Note that this doesn't guarantee that the returned instruction wrote
3841 * only reg -- it might be the size=4 destination of a texture instruction.
3842 */
3843 fs_inst *
3844 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3845 fs_inst *end,
3846 const fs_reg &reg)
3847 {
3848 if (end == start ||
3849 end->is_partial_write() ||
3850 reg.reladdr ||
3851 !reg.equals(end->dst)) {
3852 return NULL;
3853 } else {
3854 return end;
3855 }
3856 }
3857
3858 void
3859 fs_visitor::setup_payload_gen6()
3860 {
3861 bool uses_depth =
3862 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3863 unsigned barycentric_interp_modes =
3864 (stage == MESA_SHADER_FRAGMENT) ?
3865 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3866
3867 assert(devinfo->gen >= 6);
3868
3869 /* R0-1: masks, pixel X/Y coordinates. */
3870 payload.num_regs = 2;
3871 /* R2: only for 32-pixel dispatch. */
3872
3873 /* R3-26: barycentric interpolation coordinates. These appear in the
3874 * same order that they appear in the brw_wm_barycentric_interp_mode
3875 * enum. Each set of coordinates occupies 2 registers if dispatch width
3876 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3877 * appear if they were enabled using the "Barycentric Interpolation
3878 * Mode" bits in WM_STATE.
3879 */
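   /* Note that the packing below is tight: for example, with a single
    * barycentric mode enabled in SIMD8 its coordinates land at payload
    * registers 2-3 (2-5 in SIMD16), immediately after the R0-1 header.
    */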
3880 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3881 if (barycentric_interp_modes & (1 << i)) {
3882 payload.barycentric_coord_reg[i] = payload.num_regs;
3883 payload.num_regs += 2;
3884 if (dispatch_width == 16) {
3885 payload.num_regs += 2;
3886 }
3887 }
3888 }
3889
3890 /* R27: interpolated depth if uses source depth */
3891 if (uses_depth) {
3892 payload.source_depth_reg = payload.num_regs;
3893 payload.num_regs++;
3894 if (dispatch_width == 16) {
3895 /* R28: interpolated depth if not SIMD8. */
3896 payload.num_regs++;
3897 }
3898 }
3899 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3900 if (uses_depth) {
3901 payload.source_w_reg = payload.num_regs;
3902 payload.num_regs++;
3903 if (dispatch_width == 16) {
3904 /* R30: interpolated W if not SIMD8. */
3905 payload.num_regs++;
3906 }
3907 }
3908
3909 if (stage == MESA_SHADER_FRAGMENT) {
3910 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3911 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3912 prog_data->uses_pos_offset = key->compute_pos_offset;
3913 /* R31: MSAA position offsets. */
3914 if (prog_data->uses_pos_offset) {
3915 payload.sample_pos_reg = payload.num_regs;
3916 payload.num_regs++;
3917 }
3918 }
3919
3920 /* R32: MSAA input coverage mask */
3921 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3922 assert(devinfo->gen >= 7);
3923 payload.sample_mask_in_reg = payload.num_regs;
3924 payload.num_regs++;
3925 if (dispatch_width == 16) {
3926 /* R33: input coverage mask if not SIMD8. */
3927 payload.num_regs++;
3928 }
3929 }
3930
3931 /* R34-: bary for 32-pixel. */
3932 /* R58-59: interp W for 32-pixel. */
3933
3934 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3935 source_depth_to_render_target = true;
3936 }
3937 }
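
/* A worked example of the accounting above (illustrative only): a SIMD8
 * fragment shader using one barycentric mode plus source depth and W ends up
 * with num_regs = 2 (header) + 2 (barycentrics) + 1 (depth) + 1 (W) = 6,
 * i.e. source_depth_reg = 4 and source_w_reg = 5.
 */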
3938
3939 void
3940 fs_visitor::setup_vs_payload()
3941 {
3942 /* R0: thread header, R1: urb handles */
3943 payload.num_regs = 2;
3944 }
3945
3946 void
3947 fs_visitor::setup_cs_payload()
3948 {
3949 assert(brw->gen >= 7);
3950
3951 payload.num_regs = 1;
3952 }
3953
3954 void
3955 fs_visitor::assign_binding_table_offsets()
3956 {
3957 assert(stage == MESA_SHADER_FRAGMENT);
3958 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3959 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3960 uint32_t next_binding_table_offset = 0;
3961
3962 /* If there are no color regions, we still perform an FB write to a null
3963 * renderbuffer, which we place at surface index 0.
3964 */
3965 prog_data->binding_table.render_target_start = next_binding_table_offset;
3966 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3967
3968 assign_common_binding_table_offsets(next_binding_table_offset);
3969 }
3970
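/**
 * Estimate register pressure: for each instruction IP, sum the GRF sizes of
 * all virtual registers whose live interval covers that IP and record the
 * total in regs_live_at_ip[].
 */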
3971 void
3972 fs_visitor::calculate_register_pressure()
3973 {
3974 invalidate_live_intervals();
3975 calculate_live_intervals();
3976
3977 unsigned num_instructions = 0;
3978 foreach_block(block, cfg)
3979 num_instructions += block->instructions.length();
3980
3981 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3982
3983 for (unsigned reg = 0; reg < alloc.count; reg++) {
3984 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3985 regs_live_at_ip[ip] += alloc.sizes[reg];
3986 }
3987 }
3988
3989 void
3990 fs_visitor::optimize()
3991 {
3992 /* bld is the common builder object pointing at the end of the program we
3993 * used to translate it into i965 IR. For the optimization and lowering
3994 * passes coming next, any code added after the end of the program without
3995 * having explicitly called fs_builder::at() clearly points at a mistake.
3996 * Ideally optimization passes wouldn't be part of the visitor so they
3997 * wouldn't have access to bld at all, but they do, so just in case some
3998 * pass forgets to ask for a location explicitly, set bld to NULL here
3999 * so that any such use trips.
4000 */
4001 bld = bld.at(NULL, NULL);
4002
4003 split_virtual_grfs();
4004
4005 move_uniform_array_access_to_pull_constants();
4006 assign_constant_locations();
4007 demote_pull_constants();
4008
4009 #define OPT(pass, args...) ({ \
4010 pass_num++; \
4011 bool this_progress = pass(args); \
4012 \
4013 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
4014 char filename[64]; \
4015 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
4016 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
4017 \
4018 backend_shader::dump_instructions(filename); \
4019 } \
4020 \
4021 progress = progress || this_progress; \
4022 this_progress; \
4023 })
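
/* When DEBUG_OPTIMIZER is set in INTEL_DEBUG, every pass that makes progress
 * dumps the IR to a file named from the format string above, e.g.
 * (hypothetical program name and counters) "FS8-0003-01-02-opt_copy_propagate":
 * stage abbreviation and dispatch width, GL program name, iteration, pass
 * number, pass name.
 */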
4024
4025 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
4026 char filename[64];
4027 snprintf(filename, 64, "%s%d-%04d-00-start",
4028 stage_abbrev, dispatch_width,
4029 shader_prog ? shader_prog->Name : 0);
4030
4031 backend_shader::dump_instructions(filename);
4032 }
4033
4034 bool progress;
4035 int iteration = 0;
4036 int pass_num = 0;
4037 do {
4038 progress = false;
4039 pass_num = 0;
4040 iteration++;
4041
4042 OPT(remove_duplicate_mrf_writes);
4043
4044 OPT(opt_algebraic);
4045 OPT(opt_cse);
4046 OPT(opt_copy_propagate);
4047 OPT(opt_peephole_predicated_break);
4048 OPT(opt_cmod_propagation);
4049 OPT(dead_code_eliminate);
4050 OPT(opt_peephole_sel);
4051 OPT(dead_control_flow_eliminate, this);
4052 OPT(opt_register_renaming);
4053 OPT(opt_redundant_discard_jumps);
4054 OPT(opt_saturate_propagation);
4055 OPT(opt_zero_samples);
4056 OPT(register_coalesce);
4057 OPT(compute_to_mrf);
4058 OPT(eliminate_find_live_channel);
4059
4060 OPT(compact_virtual_grfs);
4061 } while (progress);
4062
4063 pass_num = 0;
4064
4065 OPT(opt_sampler_eot);
4066
4067 if (OPT(lower_load_payload)) {
4068 split_virtual_grfs();
4069 OPT(register_coalesce);
4070 OPT(compute_to_mrf);
4071 OPT(dead_code_eliminate);
4072 }
4073
4074 OPT(opt_combine_constants);
4075 OPT(lower_integer_multiplication);
4076
4077 lower_uniform_pull_constant_loads();
4078 }
4079
4080 /**
4081 * A three-source instruction must have a GRF/MRF destination register.
4082 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
4083 */
4084 void
4085 fs_visitor::fixup_3src_null_dest()
4086 {
4087 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4088 if (inst->is_3src() && inst->dst.is_null()) {
4089 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4090 inst->dst.type);
4091 }
4092 }
4093 }
4094
4095 void
4096 fs_visitor::allocate_registers()
4097 {
4098 bool allocated_without_spills;
4099
4100 static const enum instruction_scheduler_mode pre_modes[] = {
4101 SCHEDULE_PRE,
4102 SCHEDULE_PRE_NON_LIFO,
4103 SCHEDULE_PRE_LIFO,
4104 };
4105
4106 /* Try each scheduling heuristic to see if it can successfully register
4107 * allocate without spilling. They should be ordered by decreasing
4108 * performance but increasing likelihood of allocating.
4109 */
4110 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4111 schedule_instructions(pre_modes[i]);
4112
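      /* Debugging escape hatch: flip this to take the trivial register
       * assignment path instead of the real allocator.
       */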
4113 if (0) {
4114 assign_regs_trivial();
4115 allocated_without_spills = true;
4116 } else {
4117 allocated_without_spills = assign_regs(false);
4118 }
4119 if (allocated_without_spills)
4120 break;
4121 }
4122
4123 if (!allocated_without_spills) {
4124 /* We assume that any spilling is worse than just dropping back to
4125 * SIMD8.  There's probably some intermediate point where SIMD16 with
4126 * a couple of spills is still better.
4127 */
4128 if (dispatch_width == 16) {
4129 fail("Failure to register allocate. Reduce number of "
4130 "live scalar values to avoid this.");
4131 } else {
4132 perf_debug("%s shader triggered register spilling. "
4133 "Try reducing the number of live scalar values to "
4134 "improve performance.\n", stage_name);
4135 }
4136
4137 /* Since we're out of heuristics, just go spill registers until we
4138 * get an allocation.
4139 */
4140 while (!assign_regs(true)) {
4141 if (failed)
4142 break;
4143 }
4144 }
4145
4146 /* This must come after all optimization and register allocation, since
4147 * it inserts dead code that happens to have side effects, and it does
4148 * so based on the actual physical registers in use.
4149 */
4150 insert_gen4_send_dependency_workarounds();
4151
4152 if (failed)
4153 return;
4154
4155 if (!allocated_without_spills)
4156 schedule_instructions(SCHEDULE_POST);
4157
4158 if (last_scratch > 0)
4159 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4160 }
4161
4162 bool
4163 fs_visitor::run_vs()
4164 {
4165 assert(stage == MESA_SHADER_VERTEX);
4166
4167 assign_common_binding_table_offsets(0);
4168 setup_vs_payload();
4169
4170 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4171 emit_shader_time_begin();
4172
4173 emit_nir_code();
4174
4175 if (failed)
4176 return false;
4177
4178 emit_urb_writes();
4179
4180 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4181 emit_shader_time_end();
4182
4183 calculate_cfg();
4184
4185 optimize();
4186
4187 assign_curb_setup();
4188 assign_vs_urb_setup();
4189
4190 fixup_3src_null_dest();
4191 allocate_registers();
4192
4193 return !failed;
4194 }
4195
4196 bool
4197 fs_visitor::run_fs()
4198 {
4199 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4200 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4201
4202 assert(stage == MESA_SHADER_FRAGMENT);
4203
4204 sanity_param_count = prog->Parameters->NumParameters;
4205
4206 assign_binding_table_offsets();
4207
4208 if (devinfo->gen >= 6)
4209 setup_payload_gen6();
4210 else
4211 setup_payload_gen4();
4212
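   /* Debugging escape hatch: flip this to emit a trivial shader instead of
    * compiling the real program.
    */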
4213 if (0) {
4214 emit_dummy_fs();
4215 } else if (brw->use_rep_send && dispatch_width == 16) {
4216 emit_repclear_shader();
4217 } else {
4218 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4219 emit_shader_time_begin();
4220
4221 calculate_urb_setup();
4222 if (prog->InputsRead > 0) {
4223 if (devinfo->gen < 6)
4224 emit_interpolation_setup_gen4();
4225 else
4226 emit_interpolation_setup_gen6();
4227 }
4228
4229 /* We handle discards by keeping track of the still-live pixels in f0.1.
4230 * Initialize it with the dispatched pixels.
4231 */
4232 if (wm_prog_data->uses_kill) {
4233 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4234 discard_init->flag_subreg = 1;
4235 }
4236
4237 /* Generate FS IR for main(). (the visitor only descends into
4238 * functions called "main").
4239 */
4240 emit_nir_code();
4241
4242 if (failed)
4243 return false;
4244
4245 if (wm_prog_data->uses_kill)
4246 emit(FS_OPCODE_PLACEHOLDER_HALT);
4247
4248 if (wm_key->alpha_test_func)
4249 emit_alpha_test();
4250
4251 emit_fb_writes();
4252
4253 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4254 emit_shader_time_end();
4255
4256 calculate_cfg();
4257
4258 optimize();
4259
4260 assign_curb_setup();
4261 assign_urb_setup();
4262
4263 fixup_3src_null_dest();
4264 allocate_registers();
4265
4266 if (failed)
4267 return false;
4268 }
4269
4270 if (dispatch_width == 8)
4271 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4272 else
4273 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4274
4275 /* If any state parameters were appended, then ParameterValues could have
4276 * been realloced, in which case the driver uniform storage set up by
4277 * _mesa_associate_uniform_storage() would point to freed memory. Make
4278 * sure that didn't happen.
4279 */
4280 assert(sanity_param_count == prog->Parameters->NumParameters);
4281
4282 return !failed;
4283 }
4284
4285 bool
4286 fs_visitor::run_cs()
4287 {
4288 assert(stage == MESA_SHADER_COMPUTE);
4289 assert(shader);
4290
4291 sanity_param_count = prog->Parameters->NumParameters;
4292
4293 assign_common_binding_table_offsets(0);
4294
4295 setup_cs_payload();
4296
4297 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4298 emit_shader_time_begin();
4299
4300 emit_nir_code();
4301
4302 if (failed)
4303 return false;
4304
4305 emit_cs_terminate();
4306
4307 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4308 emit_shader_time_end();
4309
4310 calculate_cfg();
4311
4312 optimize();
4313
4314 assign_curb_setup();
4315
4316 fixup_3src_null_dest();
4317 allocate_registers();
4318
4319 if (failed)
4320 return false;
4321
4322 /* If any state parameters were appended, then ParameterValues could have
4323 * been realloced, in which case the driver uniform storage set up by
4324 * _mesa_associate_uniform_storage() would point to freed memory. Make
4325 * sure that didn't happen.
4326 */
4327 assert(sanity_param_count == prog->Parameters->NumParameters);
4328
4329 return !failed;
4330 }
4331
4332 const unsigned *
4333 brw_wm_fs_emit(struct brw_context *brw,
4334 void *mem_ctx,
4335 const struct brw_wm_prog_key *key,
4336 struct brw_wm_prog_data *prog_data,
4337 struct gl_fragment_program *fp,
4338 struct gl_shader_program *prog,
4339 unsigned *final_assembly_size)
4340 {
4341 bool start_busy = false;
4342 double start_time = 0;
4343
4344 if (unlikely(brw->perf_debug)) {
4345 start_busy = (brw->batch.last_bo &&
4346 drm_intel_bo_busy(brw->batch.last_bo));
4347 start_time = get_time();
4348 }
4349
4350 struct brw_shader *shader = NULL;
4351 if (prog)
4352 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4353
4354 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4355 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4356
4357 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4358 */
4359 fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4360 prog, &fp->Base, 8);
4361 if (!v.run_fs()) {
4362 if (prog) {
4363 prog->LinkStatus = false;
4364 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4365 }
4366
4367 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4368 v.fail_msg);
4369
4370 return NULL;
4371 }
4372
4373 cfg_t *simd16_cfg = NULL;
4374 fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4375 prog, &fp->Base, 16);
4376 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4377 if (!v.simd16_unsupported) {
4378 /* Try a SIMD16 compile */
4379 v2.import_uniforms(&v);
4380 if (!v2.run_fs()) {
4381 perf_debug("SIMD16 shader failed to compile, falling back to "
4382 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4383 } else {
4384 simd16_cfg = v2.cfg;
4385 }
4386 } else {
4387 perf_debug("SIMD16 shader unsupported, falling back to "
4388 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4389 }
4390 }
4391
4392 cfg_t *simd8_cfg;
4393 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4394 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4395 simd8_cfg = NULL;
4396 prog_data->no_8 = true;
4397 } else {
4398 simd8_cfg = v.cfg;
4399 prog_data->no_8 = false;
4400 }
4401
4402 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4403 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4404
4405 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4406 char *name;
4407 if (prog)
4408 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4409 prog->Label ? prog->Label : "unnamed",
4410 prog->Name);
4411 else
4412 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4413
4414 g.enable_debug(name);
4415 }
4416
4417 if (simd8_cfg)
4418 g.generate_code(simd8_cfg, 8);
4419 if (simd16_cfg)
4420 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4421
4422 if (unlikely(brw->perf_debug) && shader) {
4423 if (shader->compiled_once)
4424 brw_wm_debug_recompile(brw, prog, key);
4425 shader->compiled_once = true;
4426
4427 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4428 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4429 (get_time() - start_time) * 1000);
4430 }
4431 }
4432
4433 return g.get_assembly(final_assembly_size);
4434 }
4435
4436 extern "C" bool
4437 brw_fs_precompile(struct gl_context *ctx,
4438 struct gl_shader_program *shader_prog,
4439 struct gl_program *prog)
4440 {
4441 struct brw_context *brw = brw_context(ctx);
4442 struct brw_wm_prog_key key;
4443
4444 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4445 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4446 bool program_uses_dfdy = fp->UsesDFdy;
4447
4448 memset(&key, 0, sizeof(key));
4449
4450 if (brw->gen < 6) {
4451 if (fp->UsesKill)
4452 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4453
4454 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4455 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4456
4457 /* Just assume depth testing. */
4458 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4459 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4460 }
4461
4462 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4463 BRW_FS_VARYING_INPUT_MASK) > 16)
4464 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4465
4466 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4467
4468 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4469 key.drawable_height = ctx->DrawBuffer->Height;
4470 }
4471
4472 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4473 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4474 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4475
4476 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4477 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4478 key.nr_color_regions > 1;
4479 }
4480
4481 key.program_string_id = bfp->id;
4482
4483 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4484 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4485
4486 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4487
4488 brw->wm.base.prog_offset = old_prog_offset;
4489 brw->wm.prog_data = old_prog_data;
4490
4491 return success;
4492 }
4493
4494 void
4495 brw_setup_tex_for_precompile(struct brw_context *brw,
4496 struct brw_sampler_prog_key_data *tex,
4497 struct gl_program *prog)
4498 {
4499 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4500 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4501 for (unsigned i = 0; i < sampler_count; i++) {
4502 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4503 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4504 tex->swizzles[i] =
4505 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4506 } else {
4507 /* Color sampler: assume no swizzling. */
4508 tex->swizzles[i] = SWIZZLE_XYZW;
4509 }
4510 }
4511 }