i965/fs: Migrate lower_load_payload to the IR builder.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 using namespace brw;
53
54 void
55 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
56 const fs_reg *src, unsigned sources)
57 {
58 memset(this, 0, sizeof(*this));
59
60 this->src = new fs_reg[MAX2(sources, 3)];
61 for (unsigned i = 0; i < sources; i++)
62 this->src[i] = src[i];
63
64 this->opcode = opcode;
65 this->dst = dst;
66 this->sources = sources;
67 this->exec_size = exec_size;
68
69 assert(dst.file != IMM && dst.file != UNIFORM);
70
71 /* If exec_size == 0, try to guess it from the registers. Since all
72 * manner of things may use hardware registers, we first try to guess
73 * based on GRF registers. If this fails, we will go ahead and take the
74 * width from the destination register.
75 */
76 if (this->exec_size == 0) {
77 if (dst.file == GRF) {
78 this->exec_size = dst.width;
79 } else {
80 for (unsigned i = 0; i < sources; ++i) {
81 if (src[i].file != GRF && src[i].file != ATTR)
82 continue;
83
84 if (this->exec_size <= 1)
85 this->exec_size = src[i].width;
86 assert(src[i].width == 1 || src[i].width == this->exec_size);
87 }
88 }
89
90 if (this->exec_size == 0 && dst.file != BAD_FILE)
91 this->exec_size = dst.width;
92 }
93 assert(this->exec_size != 0);
94
95 this->conditional_mod = BRW_CONDITIONAL_NONE;
96
97 /* This will be the case for almost all instructions. */
98 switch (dst.file) {
99 case GRF:
100 case HW_REG:
101 case MRF:
102 case ATTR:
103 this->regs_written =
104 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
105 break;
106 case BAD_FILE:
107 this->regs_written = 0;
108 break;
109 case IMM:
110 case UNIFORM:
111 unreachable("Invalid destination register file");
112 default:
113 unreachable("Invalid register file");
114 }
115
116 this->writes_accumulator = false;
117 }
118
119 fs_inst::fs_inst()
120 {
121 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
122 }
123
124 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
125 {
126 init(opcode, exec_size, reg_undef, NULL, 0);
127 }
128
129 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
130 {
131 init(opcode, 0, dst, NULL, 0);
132 }
133
134 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
135 const fs_reg &src0)
136 {
137 const fs_reg src[1] = { src0 };
138 init(opcode, exec_size, dst, src, 1);
139 }
140
141 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
142 {
143 const fs_reg src[1] = { src0 };
144 init(opcode, 0, dst, src, 1);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
148 const fs_reg &src0, const fs_reg &src1)
149 {
150 const fs_reg src[2] = { src0, src1 };
151 init(opcode, exec_size, dst, src, 2);
152 }
153
154 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
155 const fs_reg &src1)
156 {
157 const fs_reg src[2] = { src0, src1 };
158 init(opcode, 0, dst, src, 2);
159 }
160
161 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
162 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
163 {
164 const fs_reg src[3] = { src0, src1, src2 };
165 init(opcode, exec_size, dst, src, 3);
166 }
167
168 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
169 const fs_reg &src1, const fs_reg &src2)
170 {
171 const fs_reg src[3] = { src0, src1, src2 };
172 init(opcode, 0, dst, src, 3);
173 }
174
175 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
176 const fs_reg src[], unsigned sources)
177 {
178 init(opcode, 0, dst, src, sources);
179 }
180
181 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
182 const fs_reg src[], unsigned sources)
183 {
184 init(opcode, exec_width, dst, src, sources);
185 }
186
187 fs_inst::fs_inst(const fs_inst &that)
188 {
189 memcpy(this, &that, sizeof(that));
190
191 this->src = new fs_reg[MAX2(that.sources, 3)];
192
193 for (unsigned i = 0; i < that.sources; i++)
194 this->src[i] = that.src[i];
195 }
196
197 fs_inst::~fs_inst()
198 {
199 delete[] this->src;
200 }
201
202 void
203 fs_inst::resize_sources(uint8_t num_sources)
204 {
205 if (this->sources != num_sources) {
206 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
207
208 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
209 src[i] = this->src[i];
210
211 delete[] this->src;
212 this->src = src;
213 this->sources = num_sources;
214 }
215 }
216
217 #define ALU1(op) \
218 fs_inst * \
219 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
220 { \
221 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
222 }
223
224 #define ALU2(op) \
225 fs_inst * \
226 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
227 const fs_reg &src1) \
228 { \
229 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
230 }
231
232 #define ALU2_ACC(op) \
233 fs_inst * \
234 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
235 const fs_reg &src1) \
236 { \
237 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
238 inst->writes_accumulator = true; \
239 return inst; \
240 }
241
242 #define ALU3(op) \
243 fs_inst * \
244 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
245 const fs_reg &src1, const fs_reg &src2) \
246 { \
247 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
248 }
249
250 ALU1(NOT)
251 ALU1(MOV)
252 ALU1(FRC)
253 ALU1(RNDD)
254 ALU1(RNDE)
255 ALU1(RNDZ)
256 ALU2(ADD)
257 ALU2(MUL)
258 ALU2_ACC(MACH)
259 ALU2(AND)
260 ALU2(OR)
261 ALU2(XOR)
262 ALU2(SHL)
263 ALU2(SHR)
264 ALU2(ASR)
265 ALU3(LRP)
266 ALU1(BFREV)
267 ALU3(BFE)
268 ALU2(BFI1)
269 ALU3(BFI2)
270 ALU1(FBH)
271 ALU1(FBL)
272 ALU1(CBIT)
273 ALU3(MAD)
274 ALU2_ACC(ADDC)
275 ALU2_ACC(SUBB)
276 ALU2(SEL)
277 ALU2(MAC)
278
279 /** Gen4 predicated IF. */
280 fs_inst *
281 fs_visitor::IF(enum brw_predicate predicate)
282 {
283 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
284 inst->predicate = predicate;
285 return inst;
286 }
287
288 /** Gen6 IF with embedded comparison. */
289 fs_inst *
290 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
291 enum brw_conditional_mod condition)
292 {
293 assert(devinfo->gen == 6);
294 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
295 reg_null_d, src0, src1);
296 inst->conditional_mod = condition;
297 return inst;
298 }
299
300 /**
301 * CMP: Sets the low bit of the destination channels with the result
302 * of the comparison, while the upper bits are undefined, and updates
303 * the flag register with the packed 16 bits of the result.
304 */
305 fs_inst *
306 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
307 enum brw_conditional_mod condition)
308 {
309 fs_inst *inst;
310
311 /* Take the instruction:
312 *
313 * CMP null<d> src0<f> src1<f>
314 *
315 * Original gen4 does type conversion to the destination type before
316 * comparison, producing garbage results for floating point comparisons.
317 *
318 * The destination type doesn't matter on newer generations, so we set the
319 * type to match src0 so we can compact the instruction.
320 */
321 dst.type = src0.type;
322 if (dst.file == HW_REG)
323 dst.fixed_hw_reg.type = dst.type;
324
325 resolve_ud_negate(&src0);
326 resolve_ud_negate(&src1);
327
328 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
329 inst->conditional_mod = condition;
330
331 return inst;
332 }
333
334 fs_inst *
335 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
336 int header_size)
337 {
338 assert(dst.width % 8 == 0);
339 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
340 dst, src, sources);
341 inst->header_size = header_size;
342
343 for (int i = 0; i < header_size; i++)
344 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
345 inst->regs_written = header_size;
346
347 for (int i = header_size; i < sources; ++i)
348 assert(src[i].file != GRF || src[i].width == dst.width);
349 inst->regs_written += (sources - header_size) * (dst.width / 8);
350
351 return inst;
352 }
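/* Illustrative sketch (not part of the original file): how the regs_written
 * arithmetic in LOAD_PAYLOAD above works out for a hypothetical SIMD16
 * payload with one header register and two per-channel sources:
 *
 *    header_size = 1, sources = 3, dst.width = 16
 *    regs_written = 1 + (3 - 1) * (16 / 8) = 5 GRFs
 *
 * i.e. one GRF for the header plus two GRFs for each full-width source.
 */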
353
354 exec_list
355 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
356 const fs_reg &surf_index,
357 const fs_reg &varying_offset,
358 uint32_t const_offset)
359 {
360 exec_list instructions;
361 fs_inst *inst;
362
363 /* We have our constant surface use a pitch of 4 bytes, so our index can
364 * be any component of a vector, and then we load 4 contiguous
365 * components starting from that.
366 *
367 * We break down the const_offset to a portion added to the variable
368 * offset and a portion done using reg_offset, which means that if you
369 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
370 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
371 * CSE can later notice that those loads are all the same and eliminate
372 * the redundant ones.
373 */
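   /* A hedged worked example of the decomposition above (the values are made
    * up): with const_offset = 22, the ADD below folds 22 & ~3 = 20 into
    * vec4_offset, and the final MOV at the end of this function reads
    * component (22 & 3) * scale = 2 * scale of the vec4 that comes back, so
    * the same vec4 load can be CSE'd across a[5].x .. a[5].w style accesses.
    */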
374 fs_reg vec4_offset = vgrf(glsl_type::int_type);
375 instructions.push_tail(ADD(vec4_offset,
376 varying_offset, fs_reg(const_offset & ~3)));
377
378 int scale = 1;
379 if (devinfo->gen == 4 && dst.width == 8) {
380 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
381 * u, v, r) as parameters, or we can just use the SIMD16 message
382 * consisting of (header, u). We choose the second, at the cost of a
383 * longer return length.
384 */
385 scale = 2;
386 }
387
388 enum opcode op;
389 if (devinfo->gen >= 7)
390 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
391 else
392 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
393
394 assert(dst.width % 8 == 0);
395 int regs_written = 4 * (dst.width / 8) * scale;
396 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
397 dst.type, dst.width);
398 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
399 inst->regs_written = regs_written;
400 instructions.push_tail(inst);
401
402 if (devinfo->gen < 7) {
403 inst->base_mrf = 13;
404 inst->header_size = 1;
405 if (devinfo->gen == 4)
406 inst->mlen = 3;
407 else
408 inst->mlen = 1 + dispatch_width / 8;
409 }
410
411 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
412 instructions.push_tail(MOV(dst, result));
413
414 return instructions;
415 }
416
417 /**
418 * A helper for MOV generation for fixing up broken hardware SEND dependency
419 * handling.
420 */
421 fs_inst *
422 fs_visitor::DEP_RESOLVE_MOV(int grf)
423 {
424 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
425
426 inst->ir = NULL;
427 inst->annotation = "send dependency resolve";
428
429 /* The caller always wants uncompressed to emit the minimal extra
430 * dependencies, and to avoid having to deal with aligning its regs to 2.
431 */
432 inst->exec_size = 8;
433
434 return inst;
435 }
436
437 bool
438 fs_inst::equals(fs_inst *inst) const
439 {
440 return (opcode == inst->opcode &&
441 dst.equals(inst->dst) &&
442 src[0].equals(inst->src[0]) &&
443 src[1].equals(inst->src[1]) &&
444 src[2].equals(inst->src[2]) &&
445 saturate == inst->saturate &&
446 predicate == inst->predicate &&
447 conditional_mod == inst->conditional_mod &&
448 mlen == inst->mlen &&
449 base_mrf == inst->base_mrf &&
450 target == inst->target &&
451 eot == inst->eot &&
452 header_size == inst->header_size &&
453 shadow_compare == inst->shadow_compare &&
454 exec_size == inst->exec_size &&
455 offset == inst->offset);
456 }
457
458 bool
459 fs_inst::overwrites_reg(const fs_reg &reg) const
460 {
461 return reg.in_range(dst, regs_written);
462 }
463
464 bool
465 fs_inst::is_send_from_grf() const
466 {
467 switch (opcode) {
468 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
469 case SHADER_OPCODE_SHADER_TIME_ADD:
470 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
471 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
472 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
473 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
474 case SHADER_OPCODE_UNTYPED_ATOMIC:
475 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
476 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
477 case SHADER_OPCODE_TYPED_ATOMIC:
478 case SHADER_OPCODE_TYPED_SURFACE_READ:
479 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
480 case SHADER_OPCODE_URB_WRITE_SIMD8:
481 return true;
482 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
483 return src[1].file == GRF;
484 case FS_OPCODE_FB_WRITE:
485 return src[0].file == GRF;
486 default:
487 if (is_tex())
488 return src[0].file == GRF;
489
490 return false;
491 }
492 }
493
494 bool
495 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
496 {
497 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
498 return false;
499
500 fs_reg reg = this->src[0];
501 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
502 return false;
503
504 if (grf_alloc.sizes[reg.reg] != this->regs_written)
505 return false;
506
507 for (int i = 0; i < this->sources; i++) {
508 reg.type = this->src[i].type;
509 reg.width = this->src[i].width;
510 if (!this->src[i].equals(reg))
511 return false;
512 reg = ::offset(reg, 1);
513 }
514
515 return true;
516 }
517
518 bool
519 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
520 {
521 if (devinfo->gen == 6 && is_math())
522 return false;
523
524 if (is_send_from_grf())
525 return false;
526
527 if (!backend_instruction::can_do_source_mods())
528 return false;
529
530 return true;
531 }
532
533 bool
534 fs_inst::has_side_effects() const
535 {
536 return this->eot || backend_instruction::has_side_effects();
537 }
538
539 void
540 fs_reg::init()
541 {
542 memset(this, 0, sizeof(*this));
543 stride = 1;
544 }
545
546 /** Generic unset register constructor. */
547 fs_reg::fs_reg()
548 {
549 init();
550 this->file = BAD_FILE;
551 }
552
553 /** Immediate value constructor. */
554 fs_reg::fs_reg(float f)
555 {
556 init();
557 this->file = IMM;
558 this->type = BRW_REGISTER_TYPE_F;
559 this->fixed_hw_reg.dw1.f = f;
560 this->width = 1;
561 }
562
563 /** Immediate value constructor. */
564 fs_reg::fs_reg(int32_t i)
565 {
566 init();
567 this->file = IMM;
568 this->type = BRW_REGISTER_TYPE_D;
569 this->fixed_hw_reg.dw1.d = i;
570 this->width = 1;
571 }
572
573 /** Immediate value constructor. */
574 fs_reg::fs_reg(uint32_t u)
575 {
576 init();
577 this->file = IMM;
578 this->type = BRW_REGISTER_TYPE_UD;
579 this->fixed_hw_reg.dw1.ud = u;
580 this->width = 1;
581 }
582
583 /** Vector float immediate value constructor. */
584 fs_reg::fs_reg(uint8_t vf[4])
585 {
586 init();
587 this->file = IMM;
588 this->type = BRW_REGISTER_TYPE_VF;
589 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
590 }
591
592 /** Vector float immediate value constructor. */
593 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
594 {
595 init();
596 this->file = IMM;
597 this->type = BRW_REGISTER_TYPE_VF;
598 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
599 (vf1 << 8) |
600 (vf2 << 16) |
601 (vf3 << 24);
602 }
603
604 /** Fixed brw_reg. */
605 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
606 {
607 init();
608 this->file = HW_REG;
609 this->fixed_hw_reg = fixed_hw_reg;
610 this->type = fixed_hw_reg.type;
611 this->width = 1 << fixed_hw_reg.width;
612 }
613
614 bool
615 fs_reg::equals(const fs_reg &r) const
616 {
617 return (file == r.file &&
618 reg == r.reg &&
619 reg_offset == r.reg_offset &&
620 subreg_offset == r.subreg_offset &&
621 type == r.type &&
622 negate == r.negate &&
623 abs == r.abs &&
624 !reladdr && !r.reladdr &&
625 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
626 width == r.width &&
627 stride == r.stride);
628 }
629
630 fs_reg &
631 fs_reg::set_smear(unsigned subreg)
632 {
633 assert(file != HW_REG && file != IMM);
634 subreg_offset = subreg * type_sz(type);
635 stride = 0;
636 return *this;
637 }
638
639 bool
640 fs_reg::is_contiguous() const
641 {
642 return stride == 1;
643 }
644
645 int
646 fs_visitor::type_size(const struct glsl_type *type)
647 {
648 unsigned int size, i;
649
650 switch (type->base_type) {
651 case GLSL_TYPE_UINT:
652 case GLSL_TYPE_INT:
653 case GLSL_TYPE_FLOAT:
654 case GLSL_TYPE_BOOL:
655 return type->components();
656 case GLSL_TYPE_ARRAY:
657 return type_size(type->fields.array) * type->length;
658 case GLSL_TYPE_STRUCT:
659 size = 0;
660 for (i = 0; i < type->length; i++) {
661 size += type_size(type->fields.structure[i].type);
662 }
663 return size;
664 case GLSL_TYPE_SAMPLER:
665 /* Samplers take up no register space, since they're baked in at
666 * link time.
667 */
668 return 0;
669 case GLSL_TYPE_ATOMIC_UINT:
670 return 0;
671 case GLSL_TYPE_IMAGE:
672 case GLSL_TYPE_VOID:
673 case GLSL_TYPE_ERROR:
674 case GLSL_TYPE_INTERFACE:
675 case GLSL_TYPE_DOUBLE:
676 unreachable("not reached");
677 }
678
679 return 0;
680 }
681
682 /**
683 * Create a MOV to read the timestamp register.
684 *
685 * The caller is responsible for emitting the MOV. The return value is
686 * the destination of the MOV, with extra parameters set.
687 */
688 fs_reg
689 fs_visitor::get_timestamp(fs_inst **out_mov)
690 {
691 assert(devinfo->gen >= 7);
692
693 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
694 BRW_ARF_TIMESTAMP,
695 0),
696 BRW_REGISTER_TYPE_UD));
697
698 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
699
700 fs_inst *mov = MOV(dst, ts);
 701    /* We want to read the 3 fields we care about even for channels that are
 702     * not enabled in the dispatch.
703 */
704 mov->force_writemask_all = true;
705
706 /* The caller wants the low 32 bits of the timestamp. Since it's running
 707     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
708 * which is plenty of time for our purposes. It is identical across the
709 * EUs, but since it's tracking GPU core speed it will increment at a
710 * varying rate as render P-states change.
711 *
712 * The caller could also check if render P-states have changed (or anything
713 * else that might disrupt timing) by setting smear to 2 and checking if
714 * that field is != 0.
715 */
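   /* Rough arithmetic behind the "~3 seconds" figure above (approximation,
    * not from the original source): a 32-bit counter ticking at ~1.2 GHz
    * wraps after about 2^32 / 1.2e9 ≈ 3.6 seconds.
    */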
716 dst.set_smear(0);
717
718 *out_mov = mov;
719 return dst;
720 }
721
722 void
723 fs_visitor::emit_shader_time_begin()
724 {
725 current_annotation = "shader time start";
726 fs_inst *mov;
727 shader_start_time = get_timestamp(&mov);
728 emit(mov);
729 }
730
731 void
732 fs_visitor::emit_shader_time_end()
733 {
734 current_annotation = "shader time end";
735
736 enum shader_time_shader_type type, written_type, reset_type;
737 switch (stage) {
738 case MESA_SHADER_VERTEX:
739 type = ST_VS;
740 written_type = ST_VS_WRITTEN;
741 reset_type = ST_VS_RESET;
742 break;
743 case MESA_SHADER_GEOMETRY:
744 type = ST_GS;
745 written_type = ST_GS_WRITTEN;
746 reset_type = ST_GS_RESET;
747 break;
748 case MESA_SHADER_FRAGMENT:
749 if (dispatch_width == 8) {
750 type = ST_FS8;
751 written_type = ST_FS8_WRITTEN;
752 reset_type = ST_FS8_RESET;
753 } else {
754 assert(dispatch_width == 16);
755 type = ST_FS16;
756 written_type = ST_FS16_WRITTEN;
757 reset_type = ST_FS16_RESET;
758 }
759 break;
760 case MESA_SHADER_COMPUTE:
761 type = ST_CS;
762 written_type = ST_CS_WRITTEN;
763 reset_type = ST_CS_RESET;
764 break;
765 default:
766 unreachable("fs_visitor::emit_shader_time_end missing code");
767 }
768
769 /* Insert our code just before the final SEND with EOT. */
770 exec_node *end = this->instructions.get_tail();
771 assert(end && ((fs_inst *) end)->eot);
772
773 fs_inst *tm_read;
774 fs_reg shader_end_time = get_timestamp(&tm_read);
775 end->insert_before(tm_read);
776
777 /* Check that there weren't any timestamp reset events (assuming these
778 * were the only two timestamp reads that happened).
779 */
780 fs_reg reset = shader_end_time;
781 reset.set_smear(2);
782 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
783 test->conditional_mod = BRW_CONDITIONAL_Z;
784 test->force_writemask_all = true;
785 end->insert_before(test);
786 end->insert_before(IF(BRW_PREDICATE_NORMAL));
787
788 fs_reg start = shader_start_time;
789 start.negate = true;
790 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
791 diff.set_smear(0);
792 fs_inst *add = ADD(diff, start, shader_end_time);
793 add->force_writemask_all = true;
794 end->insert_before(add);
795
796 /* If there were no instructions between the two timestamp gets, the diff
797 * is 2 cycles. Remove that overhead, so I can forget about that when
798 * trying to determine the time taken for single instructions.
799 */
800 add = ADD(diff, diff, fs_reg(-2u));
801 add->force_writemask_all = true;
802 end->insert_before(add);
803
804 end->insert_before(SHADER_TIME_ADD(type, diff));
805 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
806 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
807 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
808 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
809 }
810
811 fs_inst *
812 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
813 {
814 int shader_time_index =
815 brw_get_shader_time_index(brw, shader_prog, prog, type);
816 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
817
818 fs_reg payload;
819 if (dispatch_width == 8)
820 payload = vgrf(glsl_type::uvec2_type);
821 else
822 payload = vgrf(glsl_type::uint_type);
823
824 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
825 fs_reg(), payload, offset, value);
826 }
827
828 void
829 fs_visitor::vfail(const char *format, va_list va)
830 {
831 char *msg;
832
833 if (failed)
834 return;
835
836 failed = true;
837
838 msg = ralloc_vasprintf(mem_ctx, format, va);
839 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
840
841 this->fail_msg = msg;
842
843 if (debug_enabled) {
844 fprintf(stderr, "%s", msg);
845 }
846 }
847
848 void
849 fs_visitor::fail(const char *format, ...)
850 {
851 va_list va;
852
853 va_start(va, format);
854 vfail(format, va);
855 va_end(va);
856 }
857
858 /**
859 * Mark this program as impossible to compile in SIMD16 mode.
860 *
861 * During the SIMD8 compile (which happens first), we can detect and flag
862 * things that are unsupported in SIMD16 mode, so the compiler can skip
863 * the SIMD16 compile altogether.
864 *
865 * During a SIMD16 compile (if one happens anyway), this just calls fail().
866 */
867 void
868 fs_visitor::no16(const char *format, ...)
869 {
870 va_list va;
871
872 va_start(va, format);
873
874 if (dispatch_width == 16) {
875 vfail(format, va);
876 } else {
877 simd16_unsupported = true;
878
879 if (brw->perf_debug) {
880 if (no16_msg)
881 ralloc_vasprintf_append(&no16_msg, format, va);
882 else
883 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
884 }
885 }
886
887 va_end(va);
888 }
889
890 fs_inst *
891 fs_visitor::emit(enum opcode opcode)
892 {
893 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
894 }
895
896 fs_inst *
897 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
898 {
899 return emit(new(mem_ctx) fs_inst(opcode, dst));
900 }
901
902 fs_inst *
903 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
904 {
905 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
906 }
907
908 fs_inst *
909 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
910 const fs_reg &src1)
911 {
912 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
913 }
914
915 fs_inst *
916 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
917 const fs_reg &src1, const fs_reg &src2)
918 {
919 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
920 }
921
922 fs_inst *
923 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
924 fs_reg src[], int sources)
925 {
926 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
927 }
928
929 /**
930 * Returns true if the instruction has a flag that means it won't
931 * update an entire destination register.
932 *
933 * For example, dead code elimination and live variable analysis want to know
934 * when a write to a variable screens off any preceding values that were in
935 * it.
936 */
937 bool
938 fs_inst::is_partial_write() const
939 {
940 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
941 (this->dst.width * type_sz(this->dst.type)) < 32 ||
942 !this->dst.is_contiguous());
943 }
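/* For instance (illustration only, restating the predicate above): a
 * predicated instruction that isn't a SEL, a SIMD8 write of a 16-bit type
 * (8 channels * 2 bytes = 16 bytes < 32), or a strided (non-contiguous)
 * destination all make is_partial_write() return true.
 */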
944
945 int
946 fs_inst::regs_read(int arg) const
947 {
948 if (is_tex() && arg == 0 && src[0].file == GRF) {
949 return mlen;
950 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
951 return mlen;
952 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
953 return mlen;
954 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
955 return mlen;
956 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
957 return mlen;
958 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
959 return mlen;
960 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
961 return mlen;
962 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
963 return mlen;
964 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
965 return mlen;
966 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
967 return mlen;
968 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
969 return exec_size / 4;
970 }
971
972 switch (src[arg].file) {
973 case BAD_FILE:
974 case UNIFORM:
975 case IMM:
976 return 1;
977 case GRF:
978 case HW_REG:
979 if (src[arg].stride == 0) {
980 return 1;
981 } else {
982 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
983 return (size + 31) / 32;
984 }
985 case MRF:
986 unreachable("MRF registers are not allowed as sources");
987 default:
988 unreachable("Invalid register file");
989 }
990 }
991
992 bool
993 fs_inst::reads_flag() const
994 {
995 return predicate;
996 }
997
998 bool
999 fs_inst::writes_flag() const
1000 {
1001 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1002 opcode != BRW_OPCODE_IF &&
1003 opcode != BRW_OPCODE_WHILE)) ||
1004 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
1005 }
1006
1007 /**
1008 * Returns how many MRFs an FS opcode will write over.
1009 *
1010 * Note that this is not the 0 or 1 implied writes in an actual gen
1011 * instruction -- the FS opcodes often generate MOVs in addition.
1012 */
1013 int
1014 fs_visitor::implied_mrf_writes(fs_inst *inst)
1015 {
1016 if (inst->mlen == 0)
1017 return 0;
1018
1019 if (inst->base_mrf == -1)
1020 return 0;
1021
1022 switch (inst->opcode) {
1023 case SHADER_OPCODE_RCP:
1024 case SHADER_OPCODE_RSQ:
1025 case SHADER_OPCODE_SQRT:
1026 case SHADER_OPCODE_EXP2:
1027 case SHADER_OPCODE_LOG2:
1028 case SHADER_OPCODE_SIN:
1029 case SHADER_OPCODE_COS:
1030 return 1 * dispatch_width / 8;
1031 case SHADER_OPCODE_POW:
1032 case SHADER_OPCODE_INT_QUOTIENT:
1033 case SHADER_OPCODE_INT_REMAINDER:
1034 return 2 * dispatch_width / 8;
1035 case SHADER_OPCODE_TEX:
1036 case FS_OPCODE_TXB:
1037 case SHADER_OPCODE_TXD:
1038 case SHADER_OPCODE_TXF:
1039 case SHADER_OPCODE_TXF_CMS:
1040 case SHADER_OPCODE_TXF_MCS:
1041 case SHADER_OPCODE_TG4:
1042 case SHADER_OPCODE_TG4_OFFSET:
1043 case SHADER_OPCODE_TXL:
1044 case SHADER_OPCODE_TXS:
1045 case SHADER_OPCODE_LOD:
1046 return 1;
1047 case FS_OPCODE_FB_WRITE:
1048 return 2;
1049 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1050 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1051 return 1;
1052 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1053 return inst->mlen;
1054 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1055 return inst->mlen;
1056 case SHADER_OPCODE_UNTYPED_ATOMIC:
1057 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1058 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1059 case SHADER_OPCODE_TYPED_ATOMIC:
1060 case SHADER_OPCODE_TYPED_SURFACE_READ:
1061 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1062 case SHADER_OPCODE_URB_WRITE_SIMD8:
1063 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1064 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1065 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1066 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1067 return 0;
1068 default:
1069 unreachable("not reached");
1070 }
1071 }
1072
1073 fs_reg
1074 fs_visitor::vgrf(const glsl_type *const type)
1075 {
1076 int reg_width = dispatch_width / 8;
1077 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1078 brw_type_for_base_type(type), dispatch_width);
1079 }
1080
1081 fs_reg
1082 fs_visitor::vgrf(int num_components)
1083 {
1084 int reg_width = dispatch_width / 8;
1085 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1086 BRW_REGISTER_TYPE_F, dispatch_width);
1087 }
1088
1089 /** Fixed HW reg constructor. */
1090 fs_reg::fs_reg(enum register_file file, int reg)
1091 {
1092 init();
1093 this->file = file;
1094 this->reg = reg;
1095 this->type = BRW_REGISTER_TYPE_F;
1096
1097 switch (file) {
1098 case UNIFORM:
1099 this->width = 1;
1100 break;
1101 default:
1102 this->width = 8;
1103 }
1104 }
1105
1106 /** Fixed HW reg constructor. */
1107 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1108 {
1109 init();
1110 this->file = file;
1111 this->reg = reg;
1112 this->type = type;
1113
1114 switch (file) {
1115 case UNIFORM:
1116 this->width = 1;
1117 break;
1118 default:
1119 this->width = 8;
1120 }
1121 }
1122
1123 /** Fixed HW reg constructor. */
1124 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1125 uint8_t width)
1126 {
1127 init();
1128 this->file = file;
1129 this->reg = reg;
1130 this->type = type;
1131 this->width = width;
1132 }
1133
 1134 /* For SIMD16, we need to reuse the uniform setup of the SIMD8 dispatch.
 1135  * This brings in those uniform definitions.
1136 */
1137 void
1138 fs_visitor::import_uniforms(fs_visitor *v)
1139 {
1140 this->push_constant_loc = v->push_constant_loc;
1141 this->pull_constant_loc = v->pull_constant_loc;
1142 this->uniforms = v->uniforms;
1143 this->param_size = v->param_size;
1144 }
1145
1146 fs_reg *
1147 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1148 bool origin_upper_left)
1149 {
1150 assert(stage == MESA_SHADER_FRAGMENT);
1151 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1152 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1153 fs_reg wpos = *reg;
1154 bool flip = !origin_upper_left ^ key->render_to_fbo;
1155
1156 /* gl_FragCoord.x */
1157 if (pixel_center_integer) {
1158 emit(MOV(wpos, this->pixel_x));
1159 } else {
1160 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1161 }
1162 wpos = offset(wpos, 1);
1163
1164 /* gl_FragCoord.y */
1165 if (!flip && pixel_center_integer) {
1166 emit(MOV(wpos, this->pixel_y));
1167 } else {
1168 fs_reg pixel_y = this->pixel_y;
1169 float offset = (pixel_center_integer ? 0.0 : 0.5);
1170
1171 if (flip) {
1172 pixel_y.negate = true;
1173 offset += key->drawable_height - 1.0;
1174 }
1175
1176 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1177 }
1178 wpos = offset(wpos, 1);
1179
1180 /* gl_FragCoord.z */
1181 if (devinfo->gen >= 6) {
1182 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1183 } else {
1184 emit(FS_OPCODE_LINTERP, wpos,
1185 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1186 interp_reg(VARYING_SLOT_POS, 2));
1187 }
1188 wpos = offset(wpos, 1);
1189
1190 /* gl_FragCoord.w: Already set up in emit_interpolation */
1191 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1192
1193 return reg;
1194 }
1195
1196 fs_inst *
1197 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1198 glsl_interp_qualifier interpolation_mode,
1199 bool is_centroid, bool is_sample)
1200 {
1201 brw_wm_barycentric_interp_mode barycoord_mode;
1202 if (devinfo->gen >= 6) {
1203 if (is_centroid) {
1204 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1205 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1206 else
1207 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1208 } else if (is_sample) {
1209 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1210 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1211 else
1212 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1213 } else {
1214 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1215 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1216 else
1217 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1218 }
1219 } else {
1220 /* On Ironlake and below, there is only one interpolation mode.
1221 * Centroid interpolation doesn't mean anything on this hardware --
1222 * there is no multisampling.
1223 */
1224 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1225 }
1226 return emit(FS_OPCODE_LINTERP, attr,
1227 this->delta_xy[barycoord_mode], interp);
1228 }
1229
1230 void
1231 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1232 const glsl_type *type,
1233 glsl_interp_qualifier interpolation_mode,
1234 int location, bool mod_centroid,
1235 bool mod_sample)
1236 {
1237 attr.type = brw_type_for_base_type(type->get_scalar_type());
1238
1239 assert(stage == MESA_SHADER_FRAGMENT);
1240 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1241 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1242
1243 unsigned int array_elements;
1244
1245 if (type->is_array()) {
1246 array_elements = type->length;
1247 if (array_elements == 0) {
1248 fail("dereferenced array '%s' has length 0\n", name);
1249 }
1250 type = type->fields.array;
1251 } else {
1252 array_elements = 1;
1253 }
1254
1255 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1256 bool is_gl_Color =
1257 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1258 if (key->flat_shade && is_gl_Color) {
1259 interpolation_mode = INTERP_QUALIFIER_FLAT;
1260 } else {
1261 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1262 }
1263 }
1264
1265 for (unsigned int i = 0; i < array_elements; i++) {
1266 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1267 if (prog_data->urb_setup[location] == -1) {
1268 /* If there's no incoming setup data for this slot, don't
1269 * emit interpolation for it.
1270 */
1271 attr = offset(attr, type->vector_elements);
1272 location++;
1273 continue;
1274 }
1275
1276 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1277 /* Constant interpolation (flat shading) case. The SF has
1278 * handed us defined values in only the constant offset
1279 * field of the setup reg.
1280 */
1281 for (unsigned int k = 0; k < type->vector_elements; k++) {
1282 struct brw_reg interp = interp_reg(location, k);
1283 interp = suboffset(interp, 3);
1284 interp.type = attr.type;
1285 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1286 attr = offset(attr, 1);
1287 }
1288 } else {
1289 /* Smooth/noperspective interpolation case. */
1290 for (unsigned int k = 0; k < type->vector_elements; k++) {
1291 struct brw_reg interp = interp_reg(location, k);
1292 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1293 /* Get the pixel/sample mask into f0 so that we know
1294 * which pixels are lit. Then, for each channel that is
1295 * unlit, replace the centroid data with non-centroid
1296 * data.
1297 */
1298 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1299
1300 fs_inst *inst;
1301 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1302 false, false);
1303 inst->predicate = BRW_PREDICATE_NORMAL;
1304 inst->predicate_inverse = true;
1305 if (devinfo->has_pln)
1306 inst->no_dd_clear = true;
1307
1308 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1309 mod_centroid && !key->persample_shading,
1310 mod_sample || key->persample_shading);
1311 inst->predicate = BRW_PREDICATE_NORMAL;
1312 inst->predicate_inverse = false;
1313 if (devinfo->has_pln)
1314 inst->no_dd_check = true;
1315
1316 } else {
1317 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1318 mod_centroid && !key->persample_shading,
1319 mod_sample || key->persample_shading);
1320 }
1321 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1322 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1323 }
1324 attr = offset(attr, 1);
1325 }
1326
1327 }
1328 location++;
1329 }
1330 }
1331 }
1332
1333 fs_reg *
1334 fs_visitor::emit_frontfacing_interpolation()
1335 {
1336 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1337
1338 if (devinfo->gen >= 6) {
1339 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1340 * a boolean result from this (~0/true or 0/false).
1341 *
1342 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1343 * this task in only one instruction:
1344 * - a negation source modifier will flip the bit; and
1345 * - a W -> D type conversion will sign extend the bit into the high
1346 * word of the destination.
1347 *
1348 * An ASR 15 fills the low word of the destination.
1349 */
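      /* Sketch of the sequence above for a front-facing polygon (illustrative
       * only): bit 15 of g0.0:W is 0, the negate modifier flips it to 1, the
       * W -> D conversion sign-extends that bit through the high word, and
       * ASR 15 then smears it across the low word, yielding ~0 (true).  A
       * back-facing polygon ends up as 0 (false).
       */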
1350 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1351 g0.negate = true;
1352
1353 emit(ASR(*reg, g0, fs_reg(15)));
1354 } else {
1355 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1356 * a boolean result from this (1/true or 0/false).
1357 *
1358 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1359 * the negation source modifier to flip it. Unfortunately the SHR
1360 * instruction only operates on UD (or D with an abs source modifier)
1361 * sources without negation.
1362 *
1363 * Instead, use ASR (which will give ~0/true or 0/false).
1364 */
1365 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1366 g1_6.negate = true;
1367
1368 emit(ASR(*reg, g1_6, fs_reg(31)));
1369 }
1370
1371 return reg;
1372 }
1373
1374 void
1375 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1376 {
1377 assert(stage == MESA_SHADER_FRAGMENT);
1378 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1379 assert(dst.type == BRW_REGISTER_TYPE_F);
1380
1381 if (key->compute_pos_offset) {
1382 /* Convert int_sample_pos to floating point */
1383 emit(MOV(dst, int_sample_pos));
1384 /* Scale to the range [0, 1] */
1385 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1386 }
1387 else {
1388 /* From ARB_sample_shading specification:
1389 * "When rendering to a non-multisample buffer, or if multisample
1390 * rasterization is disabled, gl_SamplePosition will always be
 1391        *  (0.5, 0.5)."
1392 */
1393 emit(MOV(dst, fs_reg(0.5f)));
1394 }
1395 }
1396
1397 fs_reg *
1398 fs_visitor::emit_samplepos_setup()
1399 {
1400 assert(devinfo->gen >= 6);
1401
1402 this->current_annotation = "compute sample position";
1403 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1404 fs_reg pos = *reg;
1405 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1406 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1407
1408 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1409 * mode will be enabled.
1410 *
1411 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1412 * R31.1:0 Position Offset X/Y for Slot[3:0]
1413 * R31.3:2 Position Offset X/Y for Slot[7:4]
1414 * .....
1415 *
 1416     * The X, Y sample positions come in as bytes in the thread payload. So, read
1417 * the positions using vstride=16, width=8, hstride=2.
1418 */
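   /* As a rough picture of the region above (assumed layout, following the
    * PRM quote): the payload bytes are packed as x0, y0, x1, y1, ..., so
    * reading with vstride=16, width=8, hstride=2 picks out every other byte
    * (the X offsets), and the suboffset-by-1 reads below pick out the Y
    * offsets.
    */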
1419 struct brw_reg sample_pos_reg =
1420 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1421 BRW_REGISTER_TYPE_B), 16, 8, 2);
1422
1423 if (dispatch_width == 8) {
1424 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1425 } else {
1426 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1427 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1428 ->force_sechalf = true;
1429 }
1430 /* Compute gl_SamplePosition.x */
1431 compute_sample_position(pos, int_sample_x);
1432 pos = offset(pos, 1);
1433 if (dispatch_width == 8) {
1434 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1435 } else {
1436 emit(MOV(half(int_sample_y, 0),
1437 fs_reg(suboffset(sample_pos_reg, 1))));
1438 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1439 ->force_sechalf = true;
1440 }
1441 /* Compute gl_SamplePosition.y */
1442 compute_sample_position(pos, int_sample_y);
1443 return reg;
1444 }
1445
1446 fs_reg *
1447 fs_visitor::emit_sampleid_setup()
1448 {
1449 assert(stage == MESA_SHADER_FRAGMENT);
1450 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1451 assert(devinfo->gen >= 6);
1452
1453 this->current_annotation = "compute sample id";
1454 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1455
1456 if (key->compute_sample_id) {
1457 fs_reg t1 = vgrf(glsl_type::int_type);
1458 fs_reg t2 = vgrf(glsl_type::int_type);
1459 t2.type = BRW_REGISTER_TYPE_UW;
1460
1461 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1462 * 8x multisampling, subspan 0 will represent sample N (where N
1463 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1464 * 7. We can find the value of N by looking at R0.0 bits 7:6
1465 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1466 * (since samples are always delivered in pairs). That is, we
1467 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1468 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1469 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1470 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1471 * populating a temporary variable with the sequence (0, 1, 2, 3),
1472 * and then reading from it using vstride=1, width=4, hstride=0.
1473 * These computations hold good for 4x multisampling as well.
1474 *
1475 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1476 * the first four slots are sample 0 of subspan 0; the next four
1477 * are sample 1 of subspan 0; the third group is sample 0 of
1478 * subspan 1, and finally sample 1 of subspan 1.
1479 */
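      /* A worked example of the math described above (made-up register
       * value): if R0.0 bits 7:6 read back as 0b10, then
       * (R0.0 & 0xc0) >> 5 = 4, so N = 4; adding the SIMD8 sequence
       * (0, 0, 0, 0, 1, 1, 1, 1) gives per-channel sample IDs
       * (4, 4, 4, 4, 5, 5, 5, 5).
       */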
1480 fs_inst *inst;
1481 inst = emit(BRW_OPCODE_AND, t1,
1482 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1483 fs_reg(0xc0));
1484 inst->force_writemask_all = true;
1485 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1486 inst->force_writemask_all = true;
1487 /* This works for both SIMD8 and SIMD16 */
1488 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1489 inst->force_writemask_all = true;
1490 /* This special instruction takes care of setting vstride=1,
1491 * width=4, hstride=0 of t2 during an ADD instruction.
1492 */
1493 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1494 } else {
1495 /* As per GL_ARB_sample_shading specification:
1496 * "When rendering to a non-multisample buffer, or if multisample
1497 * rasterization is disabled, gl_SampleID will always be zero."
1498 */
1499 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1500 }
1501
1502 return reg;
1503 }
1504
1505 void
1506 fs_visitor::resolve_source_modifiers(fs_reg *src)
1507 {
1508 if (!src->abs && !src->negate)
1509 return;
1510
1511 fs_reg temp = retype(vgrf(1), src->type);
1512 emit(MOV(temp, *src));
1513 *src = temp;
1514 }
1515
1516 fs_reg
1517 fs_visitor::fix_math_operand(fs_reg src)
1518 {
1519 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1520 * might be able to do better by doing execsize = 1 math and then
1521 * expanding that result out, but we would need to be careful with
1522 * masking.
1523 *
1524 * The hardware ignores source modifiers (negate and abs) on math
1525 * instructions, so we also move to a temp to set those up.
1526 */
1527 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1528 !src.abs && !src.negate)
1529 return src;
1530
1531 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1532 * operands to math
1533 */
1534 if (devinfo->gen >= 7 && src.file != IMM)
1535 return src;
1536
1537 fs_reg expanded = vgrf(glsl_type::float_type);
1538 expanded.type = src.type;
1539 emit(BRW_OPCODE_MOV, expanded, src);
1540 return expanded;
1541 }
1542
1543 fs_inst *
1544 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1545 {
1546 switch (opcode) {
1547 case SHADER_OPCODE_RCP:
1548 case SHADER_OPCODE_RSQ:
1549 case SHADER_OPCODE_SQRT:
1550 case SHADER_OPCODE_EXP2:
1551 case SHADER_OPCODE_LOG2:
1552 case SHADER_OPCODE_SIN:
1553 case SHADER_OPCODE_COS:
1554 break;
1555 default:
1556 unreachable("not reached: bad math opcode");
1557 }
1558
1559 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1560 * might be able to do better by doing execsize = 1 math and then
1561 * expanding that result out, but we would need to be careful with
1562 * masking.
1563 *
1564 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1565 * instructions, so we also move to a temp to set those up.
1566 */
1567 if (devinfo->gen == 6 || devinfo->gen == 7)
1568 src = fix_math_operand(src);
1569
1570 fs_inst *inst = emit(opcode, dst, src);
1571
1572 if (devinfo->gen < 6) {
1573 inst->base_mrf = 2;
1574 inst->mlen = dispatch_width / 8;
1575 }
1576
1577 return inst;
1578 }
1579
1580 fs_inst *
1581 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1582 {
1583 int base_mrf = 2;
1584 fs_inst *inst;
1585
1586 if (devinfo->gen >= 8) {
1587 inst = emit(opcode, dst, src0, src1);
1588 } else if (devinfo->gen >= 6) {
1589 src0 = fix_math_operand(src0);
1590 src1 = fix_math_operand(src1);
1591
1592 inst = emit(opcode, dst, src0, src1);
1593 } else {
1594 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1595 * "Message Payload":
1596 *
1597 * "Operand0[7]. For the INT DIV functions, this operand is the
1598 * denominator."
1599 * ...
1600 * "Operand1[7]. For the INT DIV functions, this operand is the
1601 * numerator."
1602 */
1603 bool is_int_div = opcode != SHADER_OPCODE_POW;
1604 fs_reg &op0 = is_int_div ? src1 : src0;
1605 fs_reg &op1 = is_int_div ? src0 : src1;
1606
1607 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1608 inst = emit(opcode, dst, op0, reg_null_f);
1609
1610 inst->base_mrf = base_mrf;
1611 inst->mlen = 2 * dispatch_width / 8;
1612 }
1613 return inst;
1614 }
1615
1616 void
1617 fs_visitor::emit_discard_jump()
1618 {
1619 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1620
1621 /* For performance, after a discard, jump to the end of the
1622 * shader if all relevant channels have been discarded.
1623 */
1624 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1625 discard_jump->flag_subreg = 1;
1626
1627 discard_jump->predicate = (dispatch_width == 8)
1628 ? BRW_PREDICATE_ALIGN1_ANY8H
1629 : BRW_PREDICATE_ALIGN1_ANY16H;
1630 discard_jump->predicate_inverse = true;
1631 }
1632
1633 void
1634 fs_visitor::assign_curb_setup()
1635 {
1636 if (dispatch_width == 8) {
1637 prog_data->dispatch_grf_start_reg = payload.num_regs;
1638 } else {
1639 if (stage == MESA_SHADER_FRAGMENT) {
1640 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1641 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1642 } else if (stage == MESA_SHADER_COMPUTE) {
1643 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1644 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1645 } else {
1646 unreachable("Unsupported shader type!");
1647 }
1648 }
1649
1650 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1651
1652 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1653 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1654 for (unsigned int i = 0; i < inst->sources; i++) {
1655 if (inst->src[i].file == UNIFORM) {
1656 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1657 int constant_nr;
1658 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1659 constant_nr = push_constant_loc[uniform_nr];
1660 } else {
1661 /* Section 5.11 of the OpenGL 4.1 spec says:
1662 * "Out-of-bounds reads return undefined values, which include
1663 * values from other variables of the active program or zero."
1664 * Just return the first push constant.
1665 */
1666 constant_nr = 0;
1667 }
1668
1669 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1670 constant_nr / 8,
1671 constant_nr % 8);
1672
1673 inst->src[i].file = HW_REG;
1674 inst->src[i].fixed_hw_reg = byte_offset(
1675 retype(brw_reg, inst->src[i].type),
1676 inst->src[i].subreg_offset);
1677 }
1678 }
1679 }
1680 }
1681
1682 void
1683 fs_visitor::calculate_urb_setup()
1684 {
1685 assert(stage == MESA_SHADER_FRAGMENT);
1686 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1687 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1688
1689 memset(prog_data->urb_setup, -1,
1690 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1691
1692 int urb_next = 0;
1693 /* Figure out where each of the incoming setup attributes lands. */
1694 if (devinfo->gen >= 6) {
1695 if (_mesa_bitcount_64(prog->InputsRead &
1696 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1697 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1698 * first 16 varying inputs, so we can put them wherever we want.
1699 * Just put them in order.
1700 *
1701 * This is useful because it means that (a) inputs not used by the
1702 * fragment shader won't take up valuable register space, and (b) we
1703 * won't have to recompile the fragment shader if it gets paired with
1704 * a different vertex (or geometry) shader.
1705 */
1706 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1707 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1708 BITFIELD64_BIT(i)) {
1709 prog_data->urb_setup[i] = urb_next++;
1710 }
1711 }
1712 } else {
1713 /* We have enough input varyings that the SF/SBE pipeline stage can't
1714 * arbitrarily rearrange them to suit our whim; we have to put them
1715 * in an order that matches the output of the previous pipeline stage
1716 * (geometry or vertex shader).
1717 */
1718 struct brw_vue_map prev_stage_vue_map;
1719 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1720 key->input_slots_valid);
1721 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1722 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1723 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1724 slot++) {
1725 int varying = prev_stage_vue_map.slot_to_varying[slot];
1726 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1727 * unused.
1728 */
1729 if (varying != BRW_VARYING_SLOT_COUNT &&
1730 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1731 BITFIELD64_BIT(varying))) {
1732 prog_data->urb_setup[varying] = slot - first_slot;
1733 }
1734 }
1735 urb_next = prev_stage_vue_map.num_slots - first_slot;
1736 }
1737 } else {
1738 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1739 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1740 /* Point size is packed into the header, not as a general attribute */
1741 if (i == VARYING_SLOT_PSIZ)
1742 continue;
1743
1744 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1745 /* The back color slot is skipped when the front color is
1746 * also written to. In addition, some slots can be
1747 * written in the vertex shader and not read in the
1748 * fragment shader. So the register number must always be
1749 * incremented, mapped or not.
1750 */
1751 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1752 prog_data->urb_setup[i] = urb_next;
1753 urb_next++;
1754 }
1755 }
1756
1757 /*
 1758        * It's an FS-only attribute, and we did interpolation for this attribute
 1759        * in the SF thread.  So count it here, too.
1760 *
1761 * See compile_sf_prog() for more info.
1762 */
1763 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1764 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1765 }
1766
1767 prog_data->num_varying_inputs = urb_next;
1768 }
1769
1770 void
1771 fs_visitor::assign_urb_setup()
1772 {
1773 assert(stage == MESA_SHADER_FRAGMENT);
1774 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1775
1776 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1777
1778 /* Offset all the urb_setup[] index by the actual position of the
1779 * setup regs, now that the location of the constants has been chosen.
1780 */
1781 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1782 if (inst->opcode == FS_OPCODE_LINTERP) {
1783 assert(inst->src[1].file == HW_REG);
1784 inst->src[1].fixed_hw_reg.nr += urb_start;
1785 }
1786
1787 if (inst->opcode == FS_OPCODE_CINTERP) {
1788 assert(inst->src[0].file == HW_REG);
1789 inst->src[0].fixed_hw_reg.nr += urb_start;
1790 }
1791 }
1792
1793 /* Each attribute is 4 setup channels, each of which is half a reg. */
1794 this->first_non_payload_grf =
1795 urb_start + prog_data->num_varying_inputs * 2;
1796 }
1797
1798 void
1799 fs_visitor::assign_vs_urb_setup()
1800 {
1801 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1802 int grf, count, slot, channel, attr;
1803
1804 assert(stage == MESA_SHADER_VERTEX);
1805 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1806 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1807 count++;
1808
1809 /* Each attribute is 4 regs. */
1810 this->first_non_payload_grf =
1811 payload.num_regs + prog_data->curb_read_length + count * 4;
1812
1813 unsigned vue_entries =
1814 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1815
1816 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1817 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1818
1819 assert(vs_prog_data->base.urb_read_length <= 15);
1820
1821 /* Rewrite all ATTR file references to the hw grf that they land in. */
1822 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1823 for (int i = 0; i < inst->sources; i++) {
1824 if (inst->src[i].file == ATTR) {
1825
1826 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1827 slot = count - 1;
1828 } else {
 1829             /* Attributes arrive in a contiguous block, ordered by their
1830 * gl_vert_attrib value. That means we can compute the slot
1831 * number for an attribute by masking out the enabled
1832 * attributes before it and counting the bits.
1833 */
1834 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1835 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1836 BITFIELD64_MASK(attr));
1837 }
1838
1839 channel = inst->src[i].reg_offset & 3;
1840
1841 grf = payload.num_regs +
1842 prog_data->curb_read_length +
1843 slot * 4 + channel;
1844
1845 inst->src[i].file = HW_REG;
1846 inst->src[i].fixed_hw_reg =
1847 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1848 }
1849 }
1850 }
1851 }
1852
1853 /**
1854 * Split large virtual GRFs into separate components if we can.
1855 *
1856 * This is mostly duplicated with what brw_fs_vector_splitting does,
1857 * but that's really conservative because it's afraid of doing
1858 * splitting that doesn't result in real progress after the rest of
1859 * the optimization phases, which would cause infinite looping in
1860 * optimization. We can do it once here, safely. This also has the
1861 * opportunity to split interpolated values, or maybe even uniforms,
1862 * which we don't have at the IR level.
1863 *
1864 * We want to split, because virtual GRFs are what we register
1865 * allocate and spill (due to contiguousness requirements for some
1866 * instructions), and they're what we naturally generate in the
1867 * codegen process, but most virtual GRFs don't actually need to be
1868 * contiguous sets of GRFs. If we split, we'll end up with reduced
1869 * live intervals and better dead code elimination and coalescing.
1870 */
1871 void
1872 fs_visitor::split_virtual_grfs()
1873 {
1874 int num_vars = this->alloc.count;
1875
1876 /* Count the total number of registers */
1877 int reg_count = 0;
1878 int vgrf_to_reg[num_vars];
1879 for (int i = 0; i < num_vars; i++) {
1880 vgrf_to_reg[i] = reg_count;
1881 reg_count += alloc.sizes[i];
1882 }
1883
1884 /* An array of "split points". For each register slot, this indicates
1885 * if this slot can be separated from the previous slot. Every time an
1886 * instruction uses multiple elements of a register (as a source or
1887 * destination), we mark the used slots as inseparable. Then we go
1888 * through and split the registers into the smallest pieces we can.
1889 */
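   /* Illustrative example: consider a 4-register VGRF whose registers 0-1
    * are always accessed together by one instruction and registers 2-3 by
    * another.  The boundaries inside each access are marked inseparable,
    * but the boundary between registers 1 and 2 remains a split point, so
    * the VGRF is split into two 2-register VGRFs.
    */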
1890 bool split_points[reg_count];
1891 memset(split_points, 0, sizeof(split_points));
1892
1893 /* Mark all used registers as fully splittable */
1894 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1895 if (inst->dst.file == GRF) {
1896 int reg = vgrf_to_reg[inst->dst.reg];
1897 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1898 split_points[reg + j] = true;
1899 }
1900
1901 for (int i = 0; i < inst->sources; i++) {
1902 if (inst->src[i].file == GRF) {
1903 int reg = vgrf_to_reg[inst->src[i].reg];
1904 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1905 split_points[reg + j] = true;
1906 }
1907 }
1908 }
1909
1910 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1911 if (inst->dst.file == GRF) {
1912 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1913 for (int j = 1; j < inst->regs_written; j++)
1914 split_points[reg + j] = false;
1915 }
1916 for (int i = 0; i < inst->sources; i++) {
1917 if (inst->src[i].file == GRF) {
1918 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1919 for (int j = 1; j < inst->regs_read(i); j++)
1920 split_points[reg + j] = false;
1921 }
1922 }
1923 }
1924
1925 int new_virtual_grf[reg_count];
1926 int new_reg_offset[reg_count];
1927
1928 int reg = 0;
1929 for (int i = 0; i < num_vars; i++) {
1930 /* The first one should always be 0 as a quick sanity check. */
1931 assert(split_points[reg] == false);
1932
1933 /* j = 0 case */
1934 new_reg_offset[reg] = 0;
1935 reg++;
1936 int offset = 1;
1937
1938 /* j > 0 case */
1939 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1940 /* If this is a split point, reset the offset to 0 and allocate a
1941           * new virtual GRF covering the previous 'offset' registers.
1942 */
1943 if (split_points[reg]) {
1944 assert(offset <= MAX_VGRF_SIZE);
1945 int grf = alloc.allocate(offset);
1946 for (int k = reg - offset; k < reg; k++)
1947 new_virtual_grf[k] = grf;
1948 offset = 0;
1949 }
1950 new_reg_offset[reg] = offset;
1951 offset++;
1952 reg++;
1953 }
1954
1955 /* The last one gets the original register number */
1956 assert(offset <= MAX_VGRF_SIZE);
1957 alloc.sizes[i] = offset;
1958 for (int k = reg - offset; k < reg; k++)
1959 new_virtual_grf[k] = i;
1960 }
1961 assert(reg == reg_count);
1962
1963 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1964 if (inst->dst.file == GRF) {
1965 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1966 inst->dst.reg = new_virtual_grf[reg];
1967 inst->dst.reg_offset = new_reg_offset[reg];
1968 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1969 }
1970 for (int i = 0; i < inst->sources; i++) {
1971 if (inst->src[i].file == GRF) {
1972 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1973 inst->src[i].reg = new_virtual_grf[reg];
1974 inst->src[i].reg_offset = new_reg_offset[reg];
1975 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1976 }
1977 }
1978 }
1979 invalidate_live_intervals();
1980 }
1981
1982 /**
1983 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1984 *
1985 * During code generation, we create tons of temporary variables, many of
1986 * which get immediately killed and are never used again. Yet, in later
1987 * optimization and analysis passes, such as compute_live_intervals, we need
1988 * to loop over all the virtual GRFs. Compacting them can save a lot of
1989 * overhead.
1990 */
1991 bool
1992 fs_visitor::compact_virtual_grfs()
1993 {
1994 bool progress = false;
1995 int remap_table[this->alloc.count];
1996 memset(remap_table, -1, sizeof(remap_table));
1997
1998 /* Mark which virtual GRFs are used. */
1999 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2000 if (inst->dst.file == GRF)
2001 remap_table[inst->dst.reg] = 0;
2002
2003 for (int i = 0; i < inst->sources; i++) {
2004 if (inst->src[i].file == GRF)
2005 remap_table[inst->src[i].reg] = 0;
2006 }
2007 }
2008
2009 /* Compact the GRF arrays. */
2010 int new_index = 0;
2011 for (unsigned i = 0; i < this->alloc.count; i++) {
2012 if (remap_table[i] == -1) {
2013 /* We just found an unused register. This means that we are
2014 * actually going to compact something.
2015 */
2016 progress = true;
2017 } else {
2018 remap_table[i] = new_index;
2019 alloc.sizes[new_index] = alloc.sizes[i];
2020 invalidate_live_intervals();
2021 ++new_index;
2022 }
2023 }
2024
2025 this->alloc.count = new_index;
2026
2027 /* Patch all the instructions to use the newly renumbered registers */
2028 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2029 if (inst->dst.file == GRF)
2030 inst->dst.reg = remap_table[inst->dst.reg];
2031
2032 for (int i = 0; i < inst->sources; i++) {
2033 if (inst->src[i].file == GRF)
2034 inst->src[i].reg = remap_table[inst->src[i].reg];
2035 }
2036 }
2037
2038 /* Patch all the references to delta_xy, since they're used in register
2039 * allocation. If they're unused, switch them to BAD_FILE so we don't
2040 * think some random VGRF is delta_xy.
2041 */
2042 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2043 if (delta_xy[i].file == GRF) {
2044 if (remap_table[delta_xy[i].reg] != -1) {
2045 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2046 } else {
2047 delta_xy[i].file = BAD_FILE;
2048 }
2049 }
2050 }
2051
2052 return progress;
2053 }
2054
2055 /*
2056 * Implements array access of uniforms by inserting a
2057 * PULL_CONSTANT_LOAD instruction.
2058 *
2059  * Unlike temporary GRF array access (which we don't support, due to
2060 * the difficulty of doing relative addressing on instruction
2061 * destinations), we could potentially do array access of uniforms
2062 * that were loaded in GRF space as push constants. In real-world
2063 * usage we've seen, though, the arrays being used are always larger
2064 * than we could load as push constants, so just always move all
2065 * uniform array access out to a pull constant buffer.
2066 */
2067 void
2068 fs_visitor::move_uniform_array_access_to_pull_constants()
2069 {
2070 if (dispatch_width != 8)
2071 return;
2072
2073 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2074 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2075
2076 /* Walk through and find array access of uniforms. Put a copy of that
2077 * uniform in the pull constant buffer.
2078 *
2079 * Note that we don't move constant-indexed accesses to arrays. No
2080 * testing has been done of the performance impact of this choice.
2081 */
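   /* For example (hypothetical GLSL): with "uniform vec4 kernel[32]", an
    * access like kernel[i] with non-constant i reaches us as a UNIFORM
    * source carrying a reladdr and is copied to the pull constant buffer,
    * while kernel[3] has no reladdr and stays eligible for push constants.
    */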
2082 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2083 for (int i = 0 ; i < inst->sources; i++) {
2084 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2085 continue;
2086
2087 int uniform = inst->src[i].reg;
2088
2089 /* If this array isn't already present in the pull constant buffer,
2090 * add it.
2091 */
2092 if (pull_constant_loc[uniform] == -1) {
2093 const gl_constant_value **values = &stage_prog_data->param[uniform];
2094
2095 assert(param_size[uniform]);
2096
2097 for (int j = 0; j < param_size[uniform]; j++) {
2098 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2099
2100 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2101 values[j];
2102 }
2103 }
2104 }
2105 }
2106 }
2107
2108 /**
2109 * Assign UNIFORM file registers to either push constants or pull constants.
2110 *
2111  * We allow a fragment shader to have more than the spec's required minimum
2112  * maximum number of fragment shader uniform components (64). If there are
2113  * too many of these, they'd fill up all of the register space.
2114 * So, this will push some of them out to the pull constant buffer and
2115 * update the program to load them.
2116 */
2117 void
2118 fs_visitor::assign_constant_locations()
2119 {
2120 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2121 if (dispatch_width != 8)
2122 return;
2123
2124 /* Find which UNIFORM registers are still in use. */
2125 bool is_live[uniforms];
2126 for (unsigned int i = 0; i < uniforms; i++) {
2127 is_live[i] = false;
2128 }
2129
2130 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2131 for (int i = 0; i < inst->sources; i++) {
2132 if (inst->src[i].file != UNIFORM)
2133 continue;
2134
2135 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2136 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2137 is_live[constant_nr] = true;
2138 }
2139 }
2140
2141 /* Only allow 16 registers (128 uniform components) as push constants.
2142 *
2143 * Just demote the end of the list. We could probably do better
2144 * here, demoting things that are rarely used in the program first.
2145 *
2146 * If changing this value, note the limitation about total_regs in
2147 * brw_curbe.c.
2148 */
2149 unsigned int max_push_components = 16 * 8;
2150 unsigned int num_push_constants = 0;
2151
2152 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2153
2154 for (unsigned int i = 0; i < uniforms; i++) {
2155 if (!is_live[i] || pull_constant_loc[i] != -1) {
2156 /* This UNIFORM register is either dead, or has already been demoted
2157 * to a pull const. Mark it as no longer living in the param[] array.
2158 */
2159 push_constant_loc[i] = -1;
2160 continue;
2161 }
2162
2163 if (num_push_constants < max_push_components) {
2164          /* Retain as a push constant. Record the location in the param[]
2165 * array.
2166 */
2167 push_constant_loc[i] = num_push_constants++;
2168 } else {
2169 /* Demote to a pull constant. */
2170 push_constant_loc[i] = -1;
2171
2172 int pull_index = stage_prog_data->nr_pull_params++;
2173 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2174 pull_constant_loc[i] = pull_index;
2175 }
2176 }
2177
2178 stage_prog_data->nr_params = num_push_constants;
2179
2180 /* Up until now, the param[] array has been indexed by reg + reg_offset
2181 * of UNIFORM registers. Condense it to only contain the uniforms we
2182 * chose to upload as push constants.
2183 */
2184 for (unsigned int i = 0; i < uniforms; i++) {
2185 int remapped = push_constant_loc[i];
2186
2187 if (remapped == -1)
2188 continue;
2189
2190 assert(remapped <= (int)i);
2191 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2192 }
2193 }
2194
2195 /**
2196 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2197 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2198 */
2199 void
2200 fs_visitor::demote_pull_constants()
2201 {
2202 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2203 for (int i = 0; i < inst->sources; i++) {
2204 if (inst->src[i].file != UNIFORM)
2205 continue;
2206
2207 int pull_index;
2208 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2209 if (location >= uniforms) /* Out of bounds access */
2210 pull_index = -1;
2211 else
2212 pull_index = pull_constant_loc[location];
2213
2214 if (pull_index == -1)
2215 continue;
2216
2217          /* Set up the annotation tracking for newly generated instructions. */
2218 base_ir = inst->ir;
2219 current_annotation = inst->annotation;
2220
2221 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2222 fs_reg dst = vgrf(glsl_type::float_type);
2223
2224 /* Generate a pull load into dst. */
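         /* Note on the constant-index case below: the byte offset is rounded
          * down to a 16-byte (vec4) boundary, and set_smear() then selects
          * the matching component (pull_index & 3) out of the four dwords
          * returned by the load.
          */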
2225 if (inst->src[i].reladdr) {
2226 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2227 surf_index,
2228 *inst->src[i].reladdr,
2229 pull_index);
2230 inst->insert_before(block, &list);
2231 inst->src[i].reladdr = NULL;
2232 } else {
2233 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2234 fs_inst *pull =
2235 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2236 dst, surf_index, offset);
2237 inst->insert_before(block, pull);
2238 inst->src[i].set_smear(pull_index & 3);
2239 }
2240
2241 /* Rewrite the instruction to use the temporary VGRF. */
2242 inst->src[i].file = GRF;
2243 inst->src[i].reg = dst.reg;
2244 inst->src[i].reg_offset = 0;
2245 inst->src[i].width = dispatch_width;
2246 }
2247 }
2248 invalidate_live_intervals();
2249 }
2250
2251 bool
2252 fs_visitor::opt_algebraic()
2253 {
2254 bool progress = false;
2255
2256 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2257 switch (inst->opcode) {
2258 case BRW_OPCODE_MOV:
2259 if (inst->src[0].file != IMM)
2260 break;
2261
2262 if (inst->saturate) {
2263 if (inst->dst.type != inst->src[0].type)
2264 assert(!"unimplemented: saturate mixed types");
2265
2266 if (brw_saturate_immediate(inst->dst.type,
2267 &inst->src[0].fixed_hw_reg)) {
2268 inst->saturate = false;
2269 progress = true;
2270 }
2271 }
2272 break;
2273
2274 case BRW_OPCODE_MUL:
2275 if (inst->src[1].file != IMM)
2276 continue;
2277
2278 /* a * 1.0 = a */
2279 if (inst->src[1].is_one()) {
2280 inst->opcode = BRW_OPCODE_MOV;
2281 inst->src[1] = reg_undef;
2282 progress = true;
2283 break;
2284 }
2285
2286 /* a * -1.0 = -a */
2287 if (inst->src[1].is_negative_one()) {
2288 inst->opcode = BRW_OPCODE_MOV;
2289 inst->src[0].negate = !inst->src[0].negate;
2290 inst->src[1] = reg_undef;
2291 progress = true;
2292 break;
2293 }
2294
2295 /* a * 0.0 = 0.0 */
2296 if (inst->src[1].is_zero()) {
2297 inst->opcode = BRW_OPCODE_MOV;
2298 inst->src[0] = inst->src[1];
2299 inst->src[1] = reg_undef;
2300 progress = true;
2301 break;
2302 }
2303
2304 if (inst->src[0].file == IMM) {
2305 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2306 inst->opcode = BRW_OPCODE_MOV;
2307 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2308 inst->src[1] = reg_undef;
2309 progress = true;
2310 break;
2311 }
2312 break;
2313 case BRW_OPCODE_ADD:
2314 if (inst->src[1].file != IMM)
2315 continue;
2316
2317 /* a + 0.0 = a */
2318 if (inst->src[1].is_zero()) {
2319 inst->opcode = BRW_OPCODE_MOV;
2320 inst->src[1] = reg_undef;
2321 progress = true;
2322 break;
2323 }
2324
2325 if (inst->src[0].file == IMM) {
2326 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2327 inst->opcode = BRW_OPCODE_MOV;
2328 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2329 inst->src[1] = reg_undef;
2330 progress = true;
2331 break;
2332 }
2333 break;
2334 case BRW_OPCODE_OR:
2335 if (inst->src[0].equals(inst->src[1])) {
2336 inst->opcode = BRW_OPCODE_MOV;
2337 inst->src[1] = reg_undef;
2338 progress = true;
2339 break;
2340 }
2341 break;
2342 case BRW_OPCODE_LRP:
2343 if (inst->src[1].equals(inst->src[2])) {
2344 inst->opcode = BRW_OPCODE_MOV;
2345 inst->src[0] = inst->src[1];
2346 inst->src[1] = reg_undef;
2347 inst->src[2] = reg_undef;
2348 progress = true;
2349 break;
2350 }
2351 break;
2352 case BRW_OPCODE_CMP:
2353 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2354 inst->src[0].abs &&
2355 inst->src[0].negate &&
2356 inst->src[1].is_zero()) {
2357 inst->src[0].abs = false;
2358 inst->src[0].negate = false;
2359 inst->conditional_mod = BRW_CONDITIONAL_Z;
2360 progress = true;
2361 break;
2362 }
2363 break;
2364 case BRW_OPCODE_SEL:
2365 if (inst->src[0].equals(inst->src[1])) {
2366 inst->opcode = BRW_OPCODE_MOV;
2367 inst->src[1] = reg_undef;
2368 inst->predicate = BRW_PREDICATE_NONE;
2369 inst->predicate_inverse = false;
2370 progress = true;
2371 } else if (inst->saturate && inst->src[1].file == IMM) {
2372 switch (inst->conditional_mod) {
2373 case BRW_CONDITIONAL_LE:
2374 case BRW_CONDITIONAL_L:
2375 switch (inst->src[1].type) {
2376 case BRW_REGISTER_TYPE_F:
2377 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2378 inst->opcode = BRW_OPCODE_MOV;
2379 inst->src[1] = reg_undef;
2380 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2381 progress = true;
2382 }
2383 break;
2384 default:
2385 break;
2386 }
2387 break;
2388 case BRW_CONDITIONAL_GE:
2389 case BRW_CONDITIONAL_G:
2390 switch (inst->src[1].type) {
2391 case BRW_REGISTER_TYPE_F:
2392 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2393 inst->opcode = BRW_OPCODE_MOV;
2394 inst->src[1] = reg_undef;
2395 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2396 progress = true;
2397 }
2398 break;
2399 default:
2400 break;
2401 }
2402 default:
2403 break;
2404 }
2405 }
2406 break;
2407 case BRW_OPCODE_MAD:
2408 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2409 inst->opcode = BRW_OPCODE_MOV;
2410 inst->src[1] = reg_undef;
2411 inst->src[2] = reg_undef;
2412 progress = true;
2413 } else if (inst->src[0].is_zero()) {
2414 inst->opcode = BRW_OPCODE_MUL;
2415 inst->src[0] = inst->src[2];
2416 inst->src[2] = reg_undef;
2417 progress = true;
2418 } else if (inst->src[1].is_one()) {
2419 inst->opcode = BRW_OPCODE_ADD;
2420 inst->src[1] = inst->src[2];
2421 inst->src[2] = reg_undef;
2422 progress = true;
2423 } else if (inst->src[2].is_one()) {
2424 inst->opcode = BRW_OPCODE_ADD;
2425 inst->src[2] = reg_undef;
2426 progress = true;
2427 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2428 inst->opcode = BRW_OPCODE_ADD;
2429 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2430 inst->src[2] = reg_undef;
2431 progress = true;
2432 }
2433 break;
2434 case SHADER_OPCODE_RCP: {
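      /* 1/sqrt(x) == rsq(x): if the operand of this RCP is produced by the
       * immediately preceding SQRT, fold the pair into a single RSQ.  The
       * SQRT itself is left for dead code elimination to remove if it has
       * no other users.
       */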
2435 fs_inst *prev = (fs_inst *)inst->prev;
2436 if (prev->opcode == SHADER_OPCODE_SQRT) {
2437 if (inst->src[0].equals(prev->dst)) {
2438 inst->opcode = SHADER_OPCODE_RSQ;
2439 inst->src[0] = prev->src[0];
2440 progress = true;
2441 }
2442 }
2443 break;
2444 }
2445 case SHADER_OPCODE_BROADCAST:
2446 if (is_uniform(inst->src[0])) {
2447 inst->opcode = BRW_OPCODE_MOV;
2448 inst->sources = 1;
2449 inst->force_writemask_all = true;
2450 progress = true;
2451 } else if (inst->src[1].file == IMM) {
2452 inst->opcode = BRW_OPCODE_MOV;
2453 inst->src[0] = component(inst->src[0],
2454 inst->src[1].fixed_hw_reg.dw1.ud);
2455 inst->sources = 1;
2456 inst->force_writemask_all = true;
2457 progress = true;
2458 }
2459 break;
2460
2461 default:
2462 break;
2463 }
2464
2465 /* Swap if src[0] is immediate. */
2466 if (progress && inst->is_commutative()) {
2467 if (inst->src[0].file == IMM) {
2468 fs_reg tmp = inst->src[1];
2469 inst->src[1] = inst->src[0];
2470 inst->src[0] = tmp;
2471 }
2472 }
2473 }
2474 return progress;
2475 }
2476
2477 /**
2478 * Optimize sample messages that have constant zero values for the trailing
2479 * texture coordinates. We can just reduce the message length for these
2480 * instructions instead of reserving a register for it. Trailing parameters
2481 * that aren't sent default to zero anyway. This will cause the dead code
2482 * eliminator to remove the MOV instruction that would otherwise be emitted to
2483 * set up the zero value.
2484 */
2485 bool
2486 fs_visitor::opt_zero_samples()
2487 {
2488 /* Gen4 infers the texturing opcode based on the message length so we can't
2489 * change it.
2490 */
2491 if (devinfo->gen < 5)
2492 return false;
2493
2494 bool progress = false;
2495
2496 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2497 if (!inst->is_tex())
2498 continue;
2499
2500 fs_inst *load_payload = (fs_inst *) inst->prev;
2501
2502 if (load_payload->is_head_sentinel() ||
2503 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2504 continue;
2505
2506 /* We don't want to remove the message header or the first parameter.
2507       * Removing the first parameter is not allowed; see the Haswell PRM
2508 * volume 7, page 149:
2509 *
2510 * "Parameter 0 is required except for the sampleinfo message, which
2511 * has no parameter 0"
2512 */
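      /* Worked example (hypothetical message): a SIMD8 sample with mlen 5
       * and a one-register header has its parameters in src[1]..src[4] of
       * the LOAD_PAYLOAD.  (mlen - header_size) / (dispatch_width / 8) +
       * header_size - 1 == 4, so src[4] is tested first and mlen shrinks by
       * one register per trailing zero parameter, stopping before
       * parameter 0.
       */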
2513 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2514 load_payload->src[(inst->mlen - inst->header_size) /
2515 (dispatch_width / 8) +
2516 inst->header_size - 1].is_zero()) {
2517 inst->mlen -= dispatch_width / 8;
2518 progress = true;
2519 }
2520 }
2521
2522 if (progress)
2523 invalidate_live_intervals();
2524
2525 return progress;
2526 }
2527
2528 /**
2529 * Optimize sample messages which are followed by the final RT write.
2530 *
2531  * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2532 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2533 * final texturing results copied to the framebuffer write payload and modify
2534 * them to write to the framebuffer directly.
2535 */
2536 bool
2537 fs_visitor::opt_sampler_eot()
2538 {
2539 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2540
2541 if (stage != MESA_SHADER_FRAGMENT)
2542 return false;
2543
2544 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2545 return false;
2546
2547 /* FINISHME: It should be possible to implement this optimization when there
2548 * are multiple drawbuffers.
2549 */
2550 if (key->nr_color_regions != 1)
2551 return false;
2552
2553 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2554 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2555 assert(fb_write->eot);
2556 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2557
2558 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2559
2560 /* There wasn't one; nothing to do. */
2561 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2562 return false;
2563
2564    /* This optimization doesn't seem to work for textureGather for some
2565 * reason. I can't find any documentation or known workarounds to indicate
2566 * that this is expected, but considering that it is probably pretty
2567 * unlikely that a shader would directly write out the results from
2568 * textureGather we might as well just disable it.
2569 */
2570 if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2571 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2572 return false;
2573
2574 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2575 * It's very likely to be the previous instruction.
2576 */
2577 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2578 if (load_payload->is_head_sentinel() ||
2579 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2580 return false;
2581
2582 assert(!tex_inst->eot); /* We can't get here twice */
2583 assert((tex_inst->offset & (0xff << 24)) == 0);
2584
2585 tex_inst->offset |= fb_write->target << 24;
2586 tex_inst->eot = true;
2587 tex_inst->dst = bld.null_reg_ud();
2588 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2589
2590 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2591 * to create a new LOAD_PAYLOAD command with the same sources and a space
2592    * saved for the header. Using a new destination register not only makes sure
2593    * we have enough space, but it also lets the dead code eliminator kill
2594 * the instruction that this will replace.
2595 */
2596 if (tex_inst->header_size != 0)
2597 return true;
2598
2599 fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
2600 load_payload->sources + 1);
2601 fs_reg *new_sources =
2602 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2603
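   /* Leave the first source as BAD_FILE: lower_load_payload() skips such
    * sources without emitting a MOV, which reserves one register of the new
    * payload for the message header.
    */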
2604 new_sources[0] = fs_reg();
2605 for (int i = 0; i < load_payload->sources; i++)
2606 new_sources[i+1] = load_payload->src[i];
2607
2608 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2609 * requires a lot of information about the sources to appropriately figure
2610 * out the number of registers needed to be used. Given this stage in our
2611 * optimization, we may not have the appropriate GRFs required by
2612 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2613 * manually emit the instruction.
2614 */
2615 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2616 load_payload->exec_size,
2617 send_header,
2618 new_sources,
2619 load_payload->sources + 1);
2620
2621 new_load_payload->regs_written = load_payload->regs_written + 1;
2622 new_load_payload->header_size = 1;
2623 tex_inst->mlen++;
2624 tex_inst->header_size = 1;
2625 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2626 tex_inst->src[0] = send_header;
2627
2628 return true;
2629 }
2630
2631 bool
2632 fs_visitor::opt_register_renaming()
2633 {
2634 bool progress = false;
2635 int depth = 0;
2636
2637 int remap[alloc.count];
2638 memset(remap, -1, sizeof(int) * alloc.count);
2639
2640 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2641 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2642 depth++;
2643 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2644 inst->opcode == BRW_OPCODE_WHILE) {
2645 depth--;
2646 }
2647
2648 /* Rewrite instruction sources. */
2649 for (int i = 0; i < inst->sources; i++) {
2650 if (inst->src[i].file == GRF &&
2651 remap[inst->src[i].reg] != -1 &&
2652 remap[inst->src[i].reg] != inst->src[i].reg) {
2653 inst->src[i].reg = remap[inst->src[i].reg];
2654 progress = true;
2655 }
2656 }
2657
2658 const int dst = inst->dst.reg;
2659
2660 if (depth == 0 &&
2661 inst->dst.file == GRF &&
2662 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2663 !inst->is_partial_write()) {
2664 if (remap[dst] == -1) {
2665 remap[dst] = dst;
2666 } else {
2667 remap[dst] = alloc.allocate(inst->dst.width / 8);
2668 inst->dst.reg = remap[dst];
2669 progress = true;
2670 }
2671 } else if (inst->dst.file == GRF &&
2672 remap[dst] != -1 &&
2673 remap[dst] != dst) {
2674 inst->dst.reg = remap[dst];
2675 progress = true;
2676 }
2677 }
2678
2679 if (progress) {
2680 invalidate_live_intervals();
2681
2682 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2683 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2684 delta_xy[i].reg = remap[delta_xy[i].reg];
2685 }
2686 }
2687 }
2688
2689 return progress;
2690 }
2691
2692 /**
2693 * Remove redundant or useless discard jumps.
2694 *
2695 * For example, we can eliminate jumps in the following sequence:
2696 *
2697 * discard-jump (redundant with the next jump)
2698 * discard-jump (useless; jumps to the next instruction)
2699 * placeholder-halt
2700 */
2701 bool
2702 fs_visitor::opt_redundant_discard_jumps()
2703 {
2704 bool progress = false;
2705
2706 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2707
2708 fs_inst *placeholder_halt = NULL;
2709 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2710 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2711 placeholder_halt = inst;
2712 break;
2713 }
2714 }
2715
2716 if (!placeholder_halt)
2717 return false;
2718
2719 /* Delete any HALTs immediately before the placeholder halt. */
2720 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2721 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2722 prev = (fs_inst *) placeholder_halt->prev) {
2723 prev->remove(last_bblock);
2724 progress = true;
2725 }
2726
2727 if (progress)
2728 invalidate_live_intervals();
2729
2730 return progress;
2731 }
2732
2733 bool
2734 fs_visitor::compute_to_mrf()
2735 {
2736 bool progress = false;
2737 int next_ip = 0;
2738
2739 /* No MRFs on Gen >= 7. */
2740 if (devinfo->gen >= 7)
2741 return false;
2742
2743 calculate_live_intervals();
2744
2745 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2746 int ip = next_ip;
2747 next_ip++;
2748
2749 if (inst->opcode != BRW_OPCODE_MOV ||
2750 inst->is_partial_write() ||
2751 inst->dst.file != MRF || inst->src[0].file != GRF ||
2752 inst->dst.type != inst->src[0].type ||
2753 inst->src[0].abs || inst->src[0].negate ||
2754 !inst->src[0].is_contiguous() ||
2755 inst->src[0].subreg_offset)
2756 continue;
2757
2758 /* Work out which hardware MRF registers are written by this
2759 * instruction.
2760 */
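      /* For example, a SIMD16 write to m2 also fills m3 (mrf_high = mrf_low + 1),
       * while a COMPR4 write to m2 lands in m2 and m6 (mrf_high = mrf_low + 4).
       */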
2761 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2762 int mrf_high;
2763 if (inst->dst.reg & BRW_MRF_COMPR4) {
2764 mrf_high = mrf_low + 4;
2765 } else if (inst->exec_size == 16) {
2766 mrf_high = mrf_low + 1;
2767 } else {
2768 mrf_high = mrf_low;
2769 }
2770
2771 /* Can't compute-to-MRF this GRF if someone else was going to
2772 * read it later.
2773 */
2774 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2775 continue;
2776
2777       /* Found a move of a GRF to an MRF.  Let's see if we can rewrite
2778        * the instruction that produced this GRF to write into the MRF instead.
2779 */
2780 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2781 if (scan_inst->dst.file == GRF &&
2782 scan_inst->dst.reg == inst->src[0].reg) {
2783             /* Found the last instruction to write the register we want to
2784              * turn into a compute-to-MRF.
2785 */
2786
2787 /* If this one instruction didn't populate all the
2788 * channels, bail. We might be able to rewrite everything
2789 * that writes that reg, but it would require smarter
2790 * tracking to delay the rewriting until complete success.
2791 */
2792 if (scan_inst->is_partial_write())
2793 break;
2794
2795             /* Instructions that write more than one register would need us to
2796 * understand coalescing out more than one MOV at a time.
2797 */
2798 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2799 break;
2800
2801 /* SEND instructions can't have MRF as a destination. */
2802 if (scan_inst->mlen)
2803 break;
2804
2805 if (devinfo->gen == 6) {
2806 /* gen6 math instructions must have the destination be
2807 * GRF, so no compute-to-MRF for them.
2808 */
2809 if (scan_inst->is_math()) {
2810 break;
2811 }
2812 }
2813
2814 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2815 /* Found the creator of our MRF's source value. */
2816 scan_inst->dst.file = MRF;
2817 scan_inst->dst.reg = inst->dst.reg;
2818 scan_inst->saturate |= inst->saturate;
2819 inst->remove(block);
2820 progress = true;
2821 }
2822 break;
2823 }
2824
2825 /* We don't handle control flow here. Most computation of
2826           * values that end up in MRFs happens shortly before the MRF
2827 * write anyway.
2828 */
2829 if (block->start() == scan_inst)
2830 break;
2831
2832 /* You can't read from an MRF, so if someone else reads our
2833 * MRF's source GRF that we wanted to rewrite, that stops us.
2834 */
2835 bool interfered = false;
2836 for (int i = 0; i < scan_inst->sources; i++) {
2837 if (scan_inst->src[i].file == GRF &&
2838 scan_inst->src[i].reg == inst->src[0].reg &&
2839 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2840 interfered = true;
2841 }
2842 }
2843 if (interfered)
2844 break;
2845
2846 if (scan_inst->dst.file == MRF) {
2847 /* If somebody else writes our MRF here, we can't
2848 * compute-to-MRF before that.
2849 */
2850 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2851 int scan_mrf_high;
2852
2853 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2854 scan_mrf_high = scan_mrf_low + 4;
2855 } else if (scan_inst->exec_size == 16) {
2856 scan_mrf_high = scan_mrf_low + 1;
2857 } else {
2858 scan_mrf_high = scan_mrf_low;
2859 }
2860
2861 if (mrf_low == scan_mrf_low ||
2862 mrf_low == scan_mrf_high ||
2863 mrf_high == scan_mrf_low ||
2864 mrf_high == scan_mrf_high) {
2865 break;
2866 }
2867 }
2868
2869 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2870 /* Found a SEND instruction, which means that there are
2871 * live values in MRFs from base_mrf to base_mrf +
2872 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2873 * above it.
2874 */
2875 if (mrf_low >= scan_inst->base_mrf &&
2876 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2877 break;
2878 }
2879 if (mrf_high >= scan_inst->base_mrf &&
2880 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2881 break;
2882 }
2883 }
2884 }
2885 }
2886
2887 if (progress)
2888 invalidate_live_intervals();
2889
2890 return progress;
2891 }
2892
2893 /**
2894 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2895 * flow. We could probably do better here with some form of divergence
2896 * analysis.
2897 */
2898 bool
2899 fs_visitor::eliminate_find_live_channel()
2900 {
2901 bool progress = false;
2902 unsigned depth = 0;
2903
2904 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2905 switch (inst->opcode) {
2906 case BRW_OPCODE_IF:
2907 case BRW_OPCODE_DO:
2908 depth++;
2909 break;
2910
2911 case BRW_OPCODE_ENDIF:
2912 case BRW_OPCODE_WHILE:
2913 depth--;
2914 break;
2915
2916 case FS_OPCODE_DISCARD_JUMP:
2917 /* This can potentially make control flow non-uniform until the end
2918 * of the program.
2919 */
2920 return progress;
2921
2922 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
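         /* At depth 0 no divergent control flow (and no DISCARD_JUMP) has
          * been encountered yet, so the pass assumes channel 0 is a live
          * channel and replaces the scan with an immediate 0; see the note
          * above about divergence analysis.
          */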
2923 if (depth == 0) {
2924 inst->opcode = BRW_OPCODE_MOV;
2925 inst->src[0] = fs_reg(0);
2926 inst->sources = 1;
2927 inst->force_writemask_all = true;
2928 progress = true;
2929 }
2930 break;
2931
2932 default:
2933 break;
2934 }
2935 }
2936
2937 return progress;
2938 }
2939
2940 /**
2941 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2942 * instructions to FS_OPCODE_REP_FB_WRITE.
2943 */
2944 void
2945 fs_visitor::emit_repclear_shader()
2946 {
2947 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2948 int base_mrf = 1;
2949 int color_mrf = base_mrf + 2;
2950
2951 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2952 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2953 mov->force_writemask_all = true;
2954
2955 fs_inst *write;
2956 if (key->nr_color_regions == 1) {
2957 write = emit(FS_OPCODE_REP_FB_WRITE);
2958 write->saturate = key->clamp_fragment_color;
2959 write->base_mrf = color_mrf;
2960 write->target = 0;
2961 write->header_size = 0;
2962 write->mlen = 1;
2963 } else {
2964 assume(key->nr_color_regions > 0);
2965 for (int i = 0; i < key->nr_color_regions; ++i) {
2966 write = emit(FS_OPCODE_REP_FB_WRITE);
2967 write->saturate = key->clamp_fragment_color;
2968 write->base_mrf = base_mrf;
2969 write->target = i;
2970 write->header_size = 2;
2971 write->mlen = 3;
2972 }
2973 }
2974 write->eot = true;
2975
2976 calculate_cfg();
2977
2978 assign_constant_locations();
2979 assign_curb_setup();
2980
2981 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2982 assert(mov->src[0].file == HW_REG);
2983 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2984 }
2985
2986 /**
2987 * Walks through basic blocks, looking for repeated MRF writes and
2988 * removing the later ones.
2989 */
2990 bool
2991 fs_visitor::remove_duplicate_mrf_writes()
2992 {
2993 fs_inst *last_mrf_move[16];
2994 bool progress = false;
2995
2996 /* Need to update the MRF tracking for compressed instructions. */
2997 if (dispatch_width == 16)
2998 return false;
2999
3000 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3001
3002 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3003 if (inst->is_control_flow()) {
3004 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3005 }
3006
3007 if (inst->opcode == BRW_OPCODE_MOV &&
3008 inst->dst.file == MRF) {
3009 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3010 if (prev_inst && inst->equals(prev_inst)) {
3011 inst->remove(block);
3012 progress = true;
3013 continue;
3014 }
3015 }
3016
3017 /* Clear out the last-write records for MRFs that were overwritten. */
3018 if (inst->dst.file == MRF) {
3019 last_mrf_move[inst->dst.reg] = NULL;
3020 }
3021
3022 if (inst->mlen > 0 && inst->base_mrf != -1) {
3023 /* Found a SEND instruction, which will include two or fewer
3024 * implied MRF writes. We could do better here.
3025 */
3026 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3027 last_mrf_move[inst->base_mrf + i] = NULL;
3028 }
3029 }
3030
3031 /* Clear out any MRF move records whose sources got overwritten. */
3032 if (inst->dst.file == GRF) {
3033 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3034 if (last_mrf_move[i] &&
3035 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3036 last_mrf_move[i] = NULL;
3037 }
3038 }
3039 }
3040
3041 if (inst->opcode == BRW_OPCODE_MOV &&
3042 inst->dst.file == MRF &&
3043 inst->src[0].file == GRF &&
3044 !inst->is_partial_write()) {
3045 last_mrf_move[inst->dst.reg] = inst;
3046 }
3047 }
3048
3049 if (progress)
3050 invalidate_live_intervals();
3051
3052 return progress;
3053 }
3054
3055 static void
3056 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3057 {
3058 /* Clear the flag for registers that actually got read (as expected). */
3059 for (int i = 0; i < inst->sources; i++) {
3060 int grf;
3061 if (inst->src[i].file == GRF) {
3062 grf = inst->src[i].reg;
3063 } else if (inst->src[i].file == HW_REG &&
3064 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3065 grf = inst->src[i].fixed_hw_reg.nr;
3066 } else {
3067 continue;
3068 }
3069
3070 if (grf >= first_grf &&
3071 grf < first_grf + grf_len) {
3072 deps[grf - first_grf] = false;
3073 if (inst->exec_size == 16)
3074 deps[grf - first_grf + 1] = false;
3075 }
3076 }
3077 }
3078
3079 /**
3080 * Implements this workaround for the original 965:
3081 *
3082 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3083 * check for post destination dependencies on this instruction, software
3084 * must ensure that there is no destination hazard for the case of ‘write
3085 * followed by a posted write’ shown in the following example.
3086 *
3087 * 1. mov r3 0
3088 * 2. send r3.xy <rest of send instruction>
3089 * 3. mov r2 r3
3090 *
3091 * Due to no post-destination dependency check on the ‘send’, the above
3092 * code sequence could have two instructions (1 and 2) in flight at the
3093 * same time that both consider ‘r3’ as the target of their final writes.
3094 */
3095 void
3096 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3097 fs_inst *inst)
3098 {
3099 int write_len = inst->regs_written;
3100 int first_write_grf = inst->dst.reg;
3101 bool needs_dep[BRW_MAX_MRF];
3102 assert(write_len < (int)sizeof(needs_dep) - 1);
3103
3104 memset(needs_dep, false, sizeof(needs_dep));
3105 memset(needs_dep, true, write_len);
3106
3107 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3108
3109 /* Walk backwards looking for writes to registers we're writing which
3110 * aren't read since being written. If we hit the start of the program,
3111 * we assume that there are no outstanding dependencies on entry to the
3112 * program.
3113 */
3114 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3115 /* If we hit control flow, assume that there *are* outstanding
3116 * dependencies, and force their cleanup before our instruction.
3117 */
3118 if (block->start() == scan_inst) {
3119 for (int i = 0; i < write_len; i++) {
3120 if (needs_dep[i]) {
3121 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3122 }
3123 }
3124 return;
3125 }
3126
3127 /* We insert our reads as late as possible on the assumption that any
3128 * instruction but a MOV that might have left us an outstanding
3129 * dependency has more latency than a MOV.
3130 */
3131 if (scan_inst->dst.file == GRF) {
3132 for (int i = 0; i < scan_inst->regs_written; i++) {
3133 int reg = scan_inst->dst.reg + i;
3134
3135 if (reg >= first_write_grf &&
3136 reg < first_write_grf + write_len &&
3137 needs_dep[reg - first_write_grf]) {
3138 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3139 needs_dep[reg - first_write_grf] = false;
3140 if (scan_inst->exec_size == 16)
3141 needs_dep[reg - first_write_grf + 1] = false;
3142 }
3143 }
3144 }
3145
3146 /* Clear the flag for registers that actually got read (as expected). */
3147 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3148
3149 /* Continue the loop only if we haven't resolved all the dependencies */
3150 int i;
3151 for (i = 0; i < write_len; i++) {
3152 if (needs_dep[i])
3153 break;
3154 }
3155 if (i == write_len)
3156 return;
3157 }
3158 }
3159
3160 /**
3161 * Implements this workaround for the original 965:
3162 *
3163 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3164 * used as a destination register until after it has been sourced by an
3165 * instruction with a different destination register.
3166 */
3167 void
3168 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3169 {
3170 int write_len = inst->regs_written;
3171 int first_write_grf = inst->dst.reg;
3172 bool needs_dep[BRW_MAX_MRF];
3173 assert(write_len < (int)sizeof(needs_dep) - 1);
3174
3175 memset(needs_dep, false, sizeof(needs_dep));
3176 memset(needs_dep, true, write_len);
3177 /* Walk forwards looking for writes to registers we're writing which aren't
3178 * read before being written.
3179 */
3180 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3181 /* If we hit control flow, force resolve all remaining dependencies. */
3182 if (block->end() == scan_inst) {
3183 for (int i = 0; i < write_len; i++) {
3184 if (needs_dep[i])
3185 scan_inst->insert_before(block,
3186 DEP_RESOLVE_MOV(first_write_grf + i));
3187 }
3188 return;
3189 }
3190
3191 /* Clear the flag for registers that actually got read (as expected). */
3192 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3193
3194 /* We insert our reads as late as possible since they're reading the
3195 * result of a SEND, which has massive latency.
3196 */
3197 if (scan_inst->dst.file == GRF &&
3198 scan_inst->dst.reg >= first_write_grf &&
3199 scan_inst->dst.reg < first_write_grf + write_len &&
3200 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3201 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3202 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3203 }
3204
3205 /* Continue the loop only if we haven't resolved all the dependencies */
3206 int i;
3207 for (i = 0; i < write_len; i++) {
3208 if (needs_dep[i])
3209 break;
3210 }
3211 if (i == write_len)
3212 return;
3213 }
3214 }
3215
3216 void
3217 fs_visitor::insert_gen4_send_dependency_workarounds()
3218 {
3219 if (devinfo->gen != 4 || devinfo->is_g4x)
3220 return;
3221
3222 bool progress = false;
3223
3224 /* Note that we're done with register allocation, so GRF fs_regs always
3225 * have a .reg_offset of 0.
3226 */
3227
3228 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3229 if (inst->mlen != 0 && inst->dst.file == GRF) {
3230 insert_gen4_pre_send_dependency_workarounds(block, inst);
3231 insert_gen4_post_send_dependency_workarounds(block, inst);
3232 progress = true;
3233 }
3234 }
3235
3236 if (progress)
3237 invalidate_live_intervals();
3238 }
3239
3240 /**
3241 * Turns the generic expression-style uniform pull constant load instruction
3242 * into a hardware-specific series of instructions for loading a pull
3243 * constant.
3244 *
3245 * The expression style allows the CSE pass before this to optimize out
3246 * repeated loads from the same offset, and gives the pre-register-allocation
3247 * scheduling full flexibility, while the conversion to native instructions
3248 * allows the post-register-allocation scheduler the best information
3249 * possible.
3250 *
3251 * Note that execution masking for setting up pull constant loads is special:
3252 * the channels that need to be written are unrelated to the current execution
3253 * mask, since a later instruction will use one of the result channels as a
3254 * source operand for all 8 or 16 of its channels.
3255 */
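/* Roughly, on Gen7+ (illustrative example; the register numbers, surface
 * index and byte offset are made up):
 *
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD       vgrf4, surf_index 5, offset 48
 *
 * becomes:
 *
 *    FS_OPCODE_SET_SIMD4X2_OFFSET               vgrf9, 12
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7  vgrf4, surf_index 5, vgrf9
 *
 * where 12 is the 48-byte offset converted to dwords and vgrf9 is a newly
 * allocated payload register.
 */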
3256 void
3257 fs_visitor::lower_uniform_pull_constant_loads()
3258 {
3259 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3260 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3261 continue;
3262
3263 if (devinfo->gen >= 7) {
3264 /* The offset arg before was a vec4-aligned byte offset. We need to
3265 * turn it into a dword offset.
3266 */
3267 fs_reg const_offset_reg = inst->src[1];
3268 assert(const_offset_reg.file == IMM &&
3269 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3270 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3271 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3272
3273 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3274 * Reserve space for the register.
3275 */
3276 if (devinfo->gen >= 9) {
3277 payload.reg_offset++;
3278 alloc.sizes[payload.reg] = 2;
3279 }
3280
3281 /* This is actually going to be a MOV, but since only the first dword
3282 * is accessed, we have a special opcode to do just that one. Note
3283 * that this needs to be an operation that will be considered a def
3284 * by live variable analysis, or register allocation will explode.
3285 */
3286 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3287 8, payload, const_offset_reg);
3288 setup->force_writemask_all = true;
3289
3290 setup->ir = inst->ir;
3291 setup->annotation = inst->annotation;
3292 inst->insert_before(block, setup);
3293
3294 /* Similarly, this will only populate the first 4 channels of the
3295 * result register (since we only use smear values from 0-3), but we
3296 * don't tell the optimizer.
3297 */
3298 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3299 inst->src[1] = payload;
3300
3301 invalidate_live_intervals();
3302 } else {
3303 /* Before register allocation, we didn't tell the scheduler about the
3304 * MRF we use. We know it's safe to use this MRF because nothing
3305 * else does except for register spill/unspill, which generates and
3306 * uses its MRF within a single IR instruction.
3307 */
3308 inst->base_mrf = 14;
3309 inst->mlen = 1;
3310 }
3311 }
3312 }
3313
3314 bool
3315 fs_visitor::lower_load_payload()
3316 {
3317 bool progress = false;
3318
3319 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3320 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3321 continue;
3322
3323 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3324 assert(inst->saturate == false);
3325
3326 const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
3327 .exec_all(inst->force_writemask_all)
3328 .at(block, inst);
3329 fs_reg dst = inst->dst;
3330
3331 /* Get rid of COMPR4. We'll add it back in if we need it */
3332 if (dst.file == MRF)
3333 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3334
3335 dst.width = 8;
3336 for (uint8_t i = 0; i < inst->header_size; i++) {
3337 if (inst->src[i].file != BAD_FILE) {
3338 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3339 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3340 mov_src.width = 8;
3341 ibld.exec_all().MOV(mov_dst, mov_src);
3342 }
3343 dst = offset(dst, 1);
3344 }
3345
3346 dst.width = inst->exec_size;
3347 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3348 inst->exec_size > 8) {
3349 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3350 * a straightforward copy. Instead, the result of the
3351 * LOAD_PAYLOAD is treated as interleaved and the first four
3352 * non-header sources are unpacked as:
3353 *
3354 * m + 0: r0
3355 * m + 1: g0
3356 * m + 2: b0
3357 * m + 3: a0
3358 * m + 4: r1
3359 * m + 5: g1
3360 * m + 6: b1
3361 * m + 7: a1
3362 *
3363 * This is used for gen <= 5 fb writes.
3364 */
3365 assert(inst->exec_size == 16);
3366 assert(inst->header_size + 4 <= inst->sources);
3367 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3368 if (inst->src[i].file != BAD_FILE) {
3369 if (devinfo->has_compr4) {
3370 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3371 compr4_dst.reg |= BRW_MRF_COMPR4;
3372 ibld.MOV(compr4_dst, inst->src[i]);
3373 } else {
3374 /* Platform doesn't have COMPR4. We have to fake it */
3375 fs_reg mov_dst = retype(dst, inst->src[i].type);
3376 mov_dst.width = 8;
3377 ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
3378 ibld.half(1).MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3379 }
3380 }
3381
3382 dst.reg++;
3383 }
3384
3385 /* The loop above only ever incremented us through the first set
3386 * of 4 registers. However, thanks to the magic of COMPR4, we
3387 * actually wrote to the first 8 registers, so we need to take
3388 * that into account now.
3389 */
3390 dst.reg += 4;
3391
3392 /* The COMPR4 code took care of the first 4 sources. We'll let
3393 * the regular path handle any remaining sources. Yes, we are
3394 * modifying the instruction but we're about to delete it so
3395 * this really doesn't hurt anything.
3396 */
3397 inst->header_size += 4;
3398 }
3399
3400 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3401 if (inst->src[i].file != BAD_FILE)
3402 ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
3403 dst = offset(dst, 1);
3404 }
3405
3406 inst->remove(block);
3407 progress = true;
3408 }
3409
3410 if (progress)
3411 invalidate_live_intervals();
3412
3413 return progress;
3414 }
3415
3416 bool
3417 fs_visitor::lower_integer_multiplication()
3418 {
3419 bool progress = false;
3420
3421 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3422 * directly, but Cherryview cannot.
3423 */
3424 if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3425 return false;
3426
3427 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3428 if (inst->opcode != BRW_OPCODE_MUL ||
3429 inst->dst.is_accumulator() ||
3430 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3431 inst->dst.type != BRW_REGISTER_TYPE_UD))
3432 continue;
3433
3434 #define insert(instr) inst->insert_before(block, instr)
3435
3436 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3437 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3438 * src1 are used.
3439 *
3440 * If multiplying by an immediate value that fits in 16-bits, do a
3441 * single MUL instruction with that value in the proper location.
3442 */
3443 if (inst->src[1].file == IMM &&
3444 inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3445 if (devinfo->gen < 7) {
3446 fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3447 inst->dst.type, dispatch_width);
3448 insert(MOV(imm, inst->src[1]));
3449 insert(MUL(inst->dst, imm, inst->src[0]));
3450 } else {
3451 insert(MUL(inst->dst, inst->src[0], inst->src[1]));
3452 }
3453 } else {
3454 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3455 * do 32-bit integer multiplication in one instruction, but instead
3456 * must do a sequence (which actually calculates a 64-bit result):
3457 *
3458 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3459 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3460 * mov(8) g2<1>D acc0<8,8,1>D
3461 *
3462        * But on Gen > 6, the ability to use the second accumulator register
3463 * (acc1) for non-float data types was removed, preventing a simple
3464 * implementation in SIMD16. A 16-channel result can be calculated by
3465 * executing the three instructions twice in SIMD8, once with quarter
3466 * control of 1Q for the first eight channels and again with 2Q for
3467 * the second eight channels.
3468 *
3469 * Which accumulator register is implicitly accessed (by AccWrEnable
3470 * for instance) is determined by the quarter control. Unfortunately
3471 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3472 * implicit accumulator access by an instruction with 2Q will access
3473 * acc1 regardless of whether the data type is usable in acc1.
3474 *
3475 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3476 * integer data types.
3477 *
3478 * Since we only want the low 32-bits of the result, we can do two
3479 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3480 * adjust the high result and add them (like the mach is doing):
3481 *
3482 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3483 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3484 * shl(8) g9<1>D g8<8,8,1>D 16D
3485 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3486 *
3487 * We avoid the shl instruction by realizing that we only want to add
3488 * the low 16-bits of the "high" result to the high 16-bits of the
3489 * "low" result and using proper regioning on the add:
3490 *
3491 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3492 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3493 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3494 *
3495 * Since it does not use the (single) accumulator register, we can
3496 * schedule multi-component multiplications much better.
3497 */
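         /* Put differently: writing src1 as (hi16 << 16) + lo16, the low
          * 32 bits of src0 * src1 equal src0 * lo16 + ((src0 * hi16) << 16)
          * modulo 2^32, and the strided UW ADD emitted below performs that
          * final shift-and-add in a single instruction.
          */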
3498
3499 if (inst->conditional_mod && inst->dst.is_null()) {
3500 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3501 inst->dst.type, dispatch_width);
3502 }
3503 fs_reg low = inst->dst;
3504 fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3505 inst->dst.type, dispatch_width);
3506
3507          if (devinfo->gen >= 7) {
3508 fs_reg src1_0_w = inst->src[1];
3509 fs_reg src1_1_w = inst->src[1];
3510
3511 if (inst->src[1].file == IMM) {
3512 src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3513 src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3514 } else {
3515 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3516 src1_0_w.stride = 2;
3517
3518 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3519 src1_1_w.stride = 2;
3520 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3521 }
3522 insert(MUL(low, inst->src[0], src1_0_w));
3523 insert(MUL(high, inst->src[0], src1_1_w));
3524 } else {
3525 fs_reg src0_0_w = inst->src[0];
3526 fs_reg src0_1_w = inst->src[0];
3527
3528 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3529 src0_0_w.stride = 2;
3530
3531 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3532 src0_1_w.stride = 2;
3533 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3534
3535 insert(MUL(low, src0_0_w, inst->src[1]));
3536 insert(MUL(high, src0_1_w, inst->src[1]));
3537 }
3538
3539 fs_reg dst = inst->dst;
3540 dst.type = BRW_REGISTER_TYPE_UW;
3541 dst.subreg_offset = 2;
3542 dst.stride = 2;
3543
3544 high.type = BRW_REGISTER_TYPE_UW;
3545 high.stride = 2;
3546
3547 low.type = BRW_REGISTER_TYPE_UW;
3548 low.subreg_offset = 2;
3549 low.stride = 2;
3550
3551 insert(ADD(dst, low, high));
3552
3553 if (inst->conditional_mod) {
3554 fs_reg null(retype(brw_null_reg(), inst->dst.type));
3555 fs_inst *mov = MOV(null, inst->dst);
3556 mov->conditional_mod = inst->conditional_mod;
3557 insert(mov);
3558 }
3559 }
3560 #undef insert
3561
3562 inst->remove(block);
3563 progress = true;
3564 }
3565
3566 if (progress)
3567 invalidate_live_intervals();
3568
3569 return progress;
3570 }
3571
3572 void
3573 fs_visitor::dump_instructions()
3574 {
3575 dump_instructions(NULL);
3576 }
3577
3578 void
3579 fs_visitor::dump_instructions(const char *name)
3580 {
3581 FILE *file = stderr;
3582 if (name && geteuid() != 0) {
3583 file = fopen(name, "w");
3584 if (!file)
3585 file = stderr;
3586 }
3587
3588 if (cfg) {
3589 calculate_register_pressure();
3590 int ip = 0, max_pressure = 0;
3591 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3592 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3593 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3594 dump_instruction(inst, file);
3595 ip++;
3596 }
3597 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3598 } else {
3599 int ip = 0;
3600 foreach_in_list(backend_instruction, inst, &instructions) {
3601 fprintf(file, "%4d: ", ip++);
3602 dump_instruction(inst, file);
3603 }
3604 }
3605
3606 if (file != stderr) {
3607 fclose(file);
3608 }
3609 }
3610
3611 void
3612 fs_visitor::dump_instruction(backend_instruction *be_inst)
3613 {
3614 dump_instruction(be_inst, stderr);
3615 }
3616
3617 void
3618 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3619 {
3620 fs_inst *inst = (fs_inst *)be_inst;
3621
3622 if (inst->predicate) {
3623 fprintf(file, "(%cf0.%d) ",
3624 inst->predicate_inverse ? '-' : '+',
3625 inst->flag_subreg);
3626 }
3627
3628 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3629 if (inst->saturate)
3630 fprintf(file, ".sat");
3631 if (inst->conditional_mod) {
3632 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3633 if (!inst->predicate &&
3634 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3635 inst->opcode != BRW_OPCODE_IF &&
3636 inst->opcode != BRW_OPCODE_WHILE))) {
3637 fprintf(file, ".f0.%d", inst->flag_subreg);
3638 }
3639 }
3640 fprintf(file, "(%d) ", inst->exec_size);
3641
3642 if (inst->mlen) {
3643 fprintf(file, "(mlen: %d) ", inst->mlen);
3644 }
3645
3646 switch (inst->dst.file) {
3647 case GRF:
3648 fprintf(file, "vgrf%d", inst->dst.reg);
3649 if (inst->dst.width != dispatch_width)
3650 fprintf(file, "@%d", inst->dst.width);
3651 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3652 inst->dst.subreg_offset)
3653 fprintf(file, "+%d.%d",
3654 inst->dst.reg_offset, inst->dst.subreg_offset);
3655 break;
3656 case MRF:
3657 fprintf(file, "m%d", inst->dst.reg);
3658 break;
3659 case BAD_FILE:
3660 fprintf(file, "(null)");
3661 break;
3662 case UNIFORM:
3663 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3664 break;
3665 case ATTR:
3666 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3667 break;
3668 case HW_REG:
3669 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3670 switch (inst->dst.fixed_hw_reg.nr) {
3671 case BRW_ARF_NULL:
3672 fprintf(file, "null");
3673 break;
3674 case BRW_ARF_ADDRESS:
3675 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3676 break;
3677 case BRW_ARF_ACCUMULATOR:
3678 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3679 break;
3680 case BRW_ARF_FLAG:
3681 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3682 inst->dst.fixed_hw_reg.subnr);
3683 break;
3684 default:
3685 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3686 inst->dst.fixed_hw_reg.subnr);
3687 break;
3688 }
3689 } else {
3690 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3691 }
3692 if (inst->dst.fixed_hw_reg.subnr)
3693 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3694 break;
3695 default:
3696 fprintf(file, "???");
3697 break;
3698 }
3699 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3700
3701 for (int i = 0; i < inst->sources; i++) {
3702 if (inst->src[i].negate)
3703 fprintf(file, "-");
3704 if (inst->src[i].abs)
3705 fprintf(file, "|");
3706 switch (inst->src[i].file) {
3707 case GRF:
3708 fprintf(file, "vgrf%d", inst->src[i].reg);
3709 if (inst->src[i].width != dispatch_width)
3710 fprintf(file, "@%d", inst->src[i].width);
3711 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3712 inst->src[i].subreg_offset)
3713 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3714 inst->src[i].subreg_offset);
3715 break;
3716 case MRF:
3717 fprintf(file, "***m%d***", inst->src[i].reg);
3718 break;
3719 case ATTR:
3720 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3721 break;
3722 case UNIFORM:
3723 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3724 if (inst->src[i].reladdr) {
3725 fprintf(file, "+reladdr");
3726 } else if (inst->src[i].subreg_offset) {
3727 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3728 inst->src[i].subreg_offset);
3729 }
3730 break;
3731 case BAD_FILE:
3732 fprintf(file, "(null)");
3733 break;
3734 case IMM:
3735 switch (inst->src[i].type) {
3736 case BRW_REGISTER_TYPE_F:
3737 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3738 break;
3739 case BRW_REGISTER_TYPE_W:
3740 case BRW_REGISTER_TYPE_D:
3741 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3742 break;
3743 case BRW_REGISTER_TYPE_UW:
3744 case BRW_REGISTER_TYPE_UD:
3745 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3746 break;
3747 case BRW_REGISTER_TYPE_VF:
3748 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3749 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3750 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3751 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3752 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3753 break;
3754 default:
3755 fprintf(file, "???");
3756 break;
3757 }
3758 break;
3759 case HW_REG:
3760 if (inst->src[i].fixed_hw_reg.negate)
3761 fprintf(file, "-");
3762 if (inst->src[i].fixed_hw_reg.abs)
3763 fprintf(file, "|");
3764 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3765 switch (inst->src[i].fixed_hw_reg.nr) {
3766 case BRW_ARF_NULL:
3767 fprintf(file, "null");
3768 break;
3769 case BRW_ARF_ADDRESS:
3770 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3771 break;
3772 case BRW_ARF_ACCUMULATOR:
3773 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3774 break;
3775 case BRW_ARF_FLAG:
3776 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3777 inst->src[i].fixed_hw_reg.subnr);
3778 break;
3779 default:
3780 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3781 inst->src[i].fixed_hw_reg.subnr);
3782 break;
3783 }
3784 } else {
3785 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3786 }
3787 if (inst->src[i].fixed_hw_reg.subnr)
3788 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3789 if (inst->src[i].fixed_hw_reg.abs)
3790 fprintf(file, "|");
3791 break;
3792 default:
3793 fprintf(file, "???");
3794 break;
3795 }
3796 if (inst->src[i].abs)
3797 fprintf(file, "|");
3798
3799 if (inst->src[i].file != IMM) {
3800 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3801 }
3802
3803 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3804 fprintf(file, ", ");
3805 }
3806
3807 fprintf(file, " ");
3808
3809 if (dispatch_width == 16 && inst->exec_size == 8) {
3810 if (inst->force_sechalf)
3811 fprintf(file, "2ndhalf ");
3812 else
3813 fprintf(file, "1sthalf ");
3814 }
3815
3816 fprintf(file, "\n");
3817 }
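/* As a rough illustration of the format produced above (register numbers
 * made up), a dumped SIMD8 instruction looks something like:
 *
 *    (+f0.0) add(8) vgrf7:F, vgrf3:F, -vgrf4:F
 *
 * i.e. optional predicate, opcode with saturate/cmod and execution size,
 * destination, comma-separated sources, and, for SIMD16 programs split into
 * 8-wide halves, a trailing "1sthalf"/"2ndhalf" marker.
 */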
3818
3819 /**
3820 * Possibly returns an instruction that set up @param reg.
3821 *
3822 * Sometimes we want to take the result of some expression/variable
3823 * dereference tree and rewrite the instruction generating the result
3824 * of the tree. When processing the tree, we know that the
3825 * instructions generated are all writing temporaries that are dead
3826 * outside of this tree. So, if we have some instructions that write
3827 * a temporary, we're free to point that temp write somewhere else.
3828 *
3829 * Note that this doesn't guarantee that the returned instruction wrote
3830 * only reg -- it might be the size=4 destination of a texture instruction.
3831 */
3832 fs_inst *
3833 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3834 fs_inst *end,
3835 const fs_reg &reg)
3836 {
3837 if (end == start ||
3838 end->is_partial_write() ||
3839 reg.reladdr ||
3840 !reg.equals(end->dst)) {
3841 return NULL;
3842 } else {
3843 return end;
3844 }
3845 }
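/* A minimal, hypothetical usage sketch (first, last, tmp and real_dst are
 * caller-owned names, not part of this file): when the value in a temporary
 * vgrf is only needed once, the caller can retarget its producer instead of
 * emitting an extra MOV:
 *
 *    fs_inst *producer = get_instruction_generating_reg(first, last, tmp);
 *    if (producer)
 *       producer->dst = real_dst;   // rewrite in place, no copy needed
 */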
3846
3847 void
3848 fs_visitor::setup_payload_gen6()
3849 {
3850 bool uses_depth =
3851 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3852 unsigned barycentric_interp_modes =
3853 (stage == MESA_SHADER_FRAGMENT) ?
3854 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3855
3856 assert(devinfo->gen >= 6);
3857
3858 /* R0-1: masks, pixel X/Y coordinates. */
3859 payload.num_regs = 2;
3860 /* R2: only for 32-pixel dispatch. */
3861
3862 /* R3-26: barycentric interpolation coordinates. These appear in the
3863 * same order that they appear in the brw_wm_barycentric_interp_mode
3864 * enum. Each set of coordinates occupies 2 registers if dispatch width
3865 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3866 * appear if they were enabled using the "Barycentric Interpolation
3867 * Mode" bits in WM_STATE.
3868 */
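/* Worked example of the loop below: a SIMD16 shader that only uses
 * BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC ends up with
 * barycentric_coord_reg[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] == 2 and
 * payload.num_regs == 6 afterwards (2 header regs plus 4 coordinate regs).
 */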
3869 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3870 if (barycentric_interp_modes & (1 << i)) {
3871 payload.barycentric_coord_reg[i] = payload.num_regs;
3872 payload.num_regs += 2;
3873 if (dispatch_width == 16) {
3874 payload.num_regs += 2;
3875 }
3876 }
3877 }
3878
3879 /* R27: interpolated depth if uses source depth */
3880 if (uses_depth) {
3881 payload.source_depth_reg = payload.num_regs;
3882 payload.num_regs++;
3883 if (dispatch_width == 16) {
3884 /* R28: interpolated depth if not SIMD8. */
3885 payload.num_regs++;
3886 }
3887 }
3888 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3889 if (uses_depth) {
3890 payload.source_w_reg = payload.num_regs;
3891 payload.num_regs++;
3892 if (dispatch_width == 16) {
3893 /* R30: interpolated W if not SIMD8. */
3894 payload.num_regs++;
3895 }
3896 }
3897
3898 if (stage == MESA_SHADER_FRAGMENT) {
3899 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3900 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3901 prog_data->uses_pos_offset = key->compute_pos_offset;
3902 /* R31: MSAA position offsets. */
3903 if (prog_data->uses_pos_offset) {
3904 payload.sample_pos_reg = payload.num_regs;
3905 payload.num_regs++;
3906 }
3907 }
3908
3909 /* R32: MSAA input coverage mask */
3910 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3911 assert(devinfo->gen >= 7);
3912 payload.sample_mask_in_reg = payload.num_regs;
3913 payload.num_regs++;
3914 if (dispatch_width == 16) {
3915 /* R33: input coverage mask if not SIMD8. */
3916 payload.num_regs++;
3917 }
3918 }
3919
3920 /* R34-: bary for 32-pixel. */
3921 /* R58-59: interp W for 32-pixel. */
3922
3923 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3924 source_depth_to_render_target = true;
3925 }
3926 }
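/* Purely illustrative summary: a SIMD8 fragment shader with a single
 * barycentric mode enabled and no source depth/W, position offsets or
 * coverage mask leaves payload.num_regs == 4 -- two header registers plus
 * one pair of barycentric coordinate registers.
 */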
3927
3928 void
3929 fs_visitor::setup_vs_payload()
3930 {
3931 /* R0: thread header, R1: urb handles */
3932 payload.num_regs = 2;
3933 }
3934
3935 void
3936 fs_visitor::setup_cs_payload()
3937 {
3938 assert(brw->gen >= 7);
3939
3940 payload.num_regs = 1;
3941 }
3942
3943 void
3944 fs_visitor::assign_binding_table_offsets()
3945 {
3946 assert(stage == MESA_SHADER_FRAGMENT);
3947 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3948 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3949 uint32_t next_binding_table_offset = 0;
3950
3951 /* If there are no color regions, we still perform an FB write to a null
3952 * renderbuffer, which we place at surface index 0.
3953 */
3954 prog_data->binding_table.render_target_start = next_binding_table_offset;
3955 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3956
3957 assign_common_binding_table_offsets(next_binding_table_offset);
3958 }
3959
3960 void
3961 fs_visitor::calculate_register_pressure()
3962 {
3963 invalidate_live_intervals();
3964 calculate_live_intervals();
3965
3966 unsigned num_instructions = 0;
3967 foreach_block(block, cfg)
3968 num_instructions += block->instructions.length();
3969
3970 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3971
3972 for (unsigned reg = 0; reg < alloc.count; reg++) {
3973 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3974 regs_live_at_ip[ip] += alloc.sizes[reg];
3975 }
3976 }
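/* After this runs, regs_live_at_ip[ip] holds the number of physical GRFs
 * (alloc.sizes units, not virtual GRF counts) whose live ranges cover
 * instruction ip.  For example, a size-2 vgrf live from ip 3 through ip 7
 * contributes 2 to each of those five entries.
 */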
3977
3978 void
3979 fs_visitor::optimize()
3980 {
3981 /* bld is the common builder object pointing at the end of the program we
3982 * used to translate it into i965 IR. For the optimization and lowering
3983 * passes coming next, any code added after the end of the program without
3984 * having explicitly called fs_builder::at() clearly points at a mistake.
3985 * Ideally optimization passes wouldn't be part of the visitor so they
3986 * wouldn't have access to bld at all, but they do, so just in case some
3987 * pass forgets to ask for a location explicitly, set it to NULL here to
3988 * make it trip.
3989 */
3990 bld = bld.at(NULL, NULL);
3991
3992 split_virtual_grfs();
3993
3994 move_uniform_array_access_to_pull_constants();
3995 assign_constant_locations();
3996 demote_pull_constants();
3997
3998 #define OPT(pass, args...) ({ \
3999 pass_num++; \
4000 bool this_progress = pass(args); \
4001 \
4002 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
4003 char filename[64]; \
4004 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
4005 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
4006 \
4007 backend_shader::dump_instructions(filename); \
4008 } \
4009 \
4010 progress = progress || this_progress; \
4011 this_progress; \
4012 })
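/* When DEBUG_OPTIMIZER is set, every pass that makes progress dumps the IR
 * to a file named from the format string above.  For instance (hypothetical
 * GL program name 3), the opt_algebraic run in the first iteration of a
 * SIMD8 fragment shader would land in "FS8-0003-01-02-opt_algebraic".
 */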
4013
4014 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
4015 char filename[64];
4016 snprintf(filename, 64, "%s%d-%04d-00-start",
4017 stage_abbrev, dispatch_width,
4018 shader_prog ? shader_prog->Name : 0);
4019
4020 backend_shader::dump_instructions(filename);
4021 }
4022
4023 bool progress;
4024 int iteration = 0;
4025 int pass_num = 0;
4026 do {
4027 progress = false;
4028 pass_num = 0;
4029 iteration++;
4030
4031 OPT(remove_duplicate_mrf_writes);
4032
4033 OPT(opt_algebraic);
4034 OPT(opt_cse);
4035 OPT(opt_copy_propagate);
4036 OPT(opt_peephole_predicated_break);
4037 OPT(opt_cmod_propagation);
4038 OPT(dead_code_eliminate);
4039 OPT(opt_peephole_sel);
4040 OPT(dead_control_flow_eliminate, this);
4041 OPT(opt_register_renaming);
4042 OPT(opt_redundant_discard_jumps);
4043 OPT(opt_saturate_propagation);
4044 OPT(opt_zero_samples);
4045 OPT(register_coalesce);
4046 OPT(compute_to_mrf);
4047 OPT(eliminate_find_live_channel);
4048
4049 OPT(compact_virtual_grfs);
4050 } while (progress);
4051
4052 pass_num = 0;
4053
4054 OPT(opt_sampler_eot);
4055
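/* Lowering LOAD_PAYLOAD turns each payload construction into a series of
 * plain MOVs, which tends to expose fresh opportunities for the cleanup
 * passes below, so they get one more run here.
 */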
4056 if (OPT(lower_load_payload)) {
4057 split_virtual_grfs();
4058 OPT(register_coalesce);
4059 OPT(compute_to_mrf);
4060 OPT(dead_code_eliminate);
4061 }
4062
4063 OPT(opt_combine_constants);
4064 OPT(lower_integer_multiplication);
4065
4066 lower_uniform_pull_constant_loads();
4067 }
4068
4069 /**
4070 * A three-source instruction must have a GRF/MRF destination register.
4071 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
4072 */
4073 void
4074 fs_visitor::fixup_3src_null_dest()
4075 {
4076 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4077 if (inst->is_3src() && inst->dst.is_null()) {
4078 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4079 inst->dst.type);
4080 }
4081 }
4082 }
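/* Illustrative before/after of the fixup above (registers hypothetical):
 *
 *    mad(8) null:F, vgrf1:F, vgrf2:F, vgrf3:F
 *
 * becomes
 *
 *    mad(8) vgrf9:F, vgrf1:F, vgrf2:F, vgrf3:F
 *
 * where vgrf9 is a freshly allocated, otherwise unused temporary.
 */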
4083
4084 void
4085 fs_visitor::allocate_registers()
4086 {
4087 bool allocated_without_spills;
4088
4089 static const enum instruction_scheduler_mode pre_modes[] = {
4090 SCHEDULE_PRE,
4091 SCHEDULE_PRE_NON_LIFO,
4092 SCHEDULE_PRE_LIFO,
4093 };
4094
4095 /* Try each scheduling heuristic to see if it can successfully register
4096 * allocate without spilling. They should be ordered by decreasing
4097 * performance but increasing likelihood of allocating.
4098 */
4099 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4100 schedule_instructions(pre_modes[i]);
4101
4102 if (0) {
4103 assign_regs_trivial();
4104 allocated_without_spills = true;
4105 } else {
4106 allocated_without_spills = assign_regs(false);
4107 }
4108 if (allocated_without_spills)
4109 break;
4110 }
4111
4112 if (!allocated_without_spills) {
4113 /* We assume that any spilling is worse than just dropping back to
4114 * SIMD8. There's probably some intermediate point where SIMD16 with a
4115 * couple of spills is still better.
4116 */
4117 if (dispatch_width == 16) {
4118 fail("Failure to register allocate. Reduce number of "
4119 "live scalar values to avoid this.");
4120 } else {
4121 perf_debug("%s shader triggered register spilling. "
4122 "Try reducing the number of live scalar values to "
4123 "improve performance.\n", stage_name);
4124 }
4125
4126 /* Since we're out of heuristics, just go spill registers until we
4127 * get an allocation.
4128 */
4129 while (!assign_regs(true)) {
4130 if (failed)
4131 break;
4132 }
4133 }
4134
4135 /* This must come after all optimization and register allocation, since
4136 * it inserts dead code that happens to have side effects, and it does
4137 * so based on the actual physical registers in use.
4138 */
4139 insert_gen4_send_dependency_workarounds();
4140
4141 if (failed)
4142 return;
4143
4144 if (!allocated_without_spills)
4145 schedule_instructions(SCHEDULE_POST);
4146
4147 if (last_scratch > 0)
4148 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4149 }
4150
4151 bool
4152 fs_visitor::run_vs()
4153 {
4154 assert(stage == MESA_SHADER_VERTEX);
4155
4156 assign_common_binding_table_offsets(0);
4157 setup_vs_payload();
4158
4159 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4160 emit_shader_time_begin();
4161
4162 emit_nir_code();
4163
4164 if (failed)
4165 return false;
4166
4167 emit_urb_writes();
4168
4169 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4170 emit_shader_time_end();
4171
4172 calculate_cfg();
4173
4174 optimize();
4175
4176 assign_curb_setup();
4177 assign_vs_urb_setup();
4178
4179 fixup_3src_null_dest();
4180 allocate_registers();
4181
4182 return !failed;
4183 }
4184
4185 bool
4186 fs_visitor::run_fs()
4187 {
4188 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4189 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4190
4191 assert(stage == MESA_SHADER_FRAGMENT);
4192
4193 sanity_param_count = prog->Parameters->NumParameters;
4194
4195 assign_binding_table_offsets();
4196
4197 if (devinfo->gen >= 6)
4198 setup_payload_gen6();
4199 else
4200 setup_payload_gen4();
4201
4202 if (0) {
4203 emit_dummy_fs();
4204 } else if (brw->use_rep_send && dispatch_width == 16) {
4205 emit_repclear_shader();
4206 } else {
4207 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4208 emit_shader_time_begin();
4209
4210 calculate_urb_setup();
4211 if (prog->InputsRead > 0) {
4212 if (devinfo->gen < 6)
4213 emit_interpolation_setup_gen4();
4214 else
4215 emit_interpolation_setup_gen6();
4216 }
4217
4218 /* We handle discards by keeping track of the still-live pixels in f0.1.
4219 * Initialize it with the dispatched pixels.
4220 */
4221 if (wm_prog_data->uses_kill) {
4222 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4223 discard_init->flag_subreg = 1;
4224 }
4225
4226 /* Generate FS IR for main(). (The visitor only descends into
4227 * functions called "main".)
4228 */
4229 emit_nir_code();
4230
4231 if (failed)
4232 return false;
4233
4234 if (wm_prog_data->uses_kill)
4235 emit(FS_OPCODE_PLACEHOLDER_HALT);
4236
4237 if (wm_key->alpha_test_func)
4238 emit_alpha_test();
4239
4240 emit_fb_writes();
4241
4242 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4243 emit_shader_time_end();
4244
4245 calculate_cfg();
4246
4247 optimize();
4248
4249 assign_curb_setup();
4250 assign_urb_setup();
4251
4252 fixup_3src_null_dest();
4253 allocate_registers();
4254
4255 if (failed)
4256 return false;
4257 }
4258
4259 if (dispatch_width == 8)
4260 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4261 else
4262 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4263
4264 /* If any state parameters were appended, then ParameterValues could have
4265 * been realloced, in which case the driver uniform storage set up by
4266 * _mesa_associate_uniform_storage() would point to freed memory. Make
4267 * sure that didn't happen.
4268 */
4269 assert(sanity_param_count == prog->Parameters->NumParameters);
4270
4271 return !failed;
4272 }
4273
4274 bool
4275 fs_visitor::run_cs()
4276 {
4277 assert(stage == MESA_SHADER_COMPUTE);
4278 assert(shader);
4279
4280 sanity_param_count = prog->Parameters->NumParameters;
4281
4282 assign_common_binding_table_offsets(0);
4283
4284 setup_cs_payload();
4285
4286 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4287 emit_shader_time_begin();
4288
4289 emit_nir_code();
4290
4291 if (failed)
4292 return false;
4293
4294 emit_cs_terminate();
4295
4296 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4297 emit_shader_time_end();
4298
4299 calculate_cfg();
4300
4301 optimize();
4302
4303 assign_curb_setup();
4304
4305 fixup_3src_null_dest();
4306 allocate_registers();
4307
4308 if (failed)
4309 return false;
4310
4311 /* If any state parameters were appended, then ParameterValues could have
4312 * been realloced, in which case the driver uniform storage set up by
4313 * _mesa_associate_uniform_storage() would point to freed memory. Make
4314 * sure that didn't happen.
4315 */
4316 assert(sanity_param_count == prog->Parameters->NumParameters);
4317
4318 return !failed;
4319 }
4320
4321 const unsigned *
4322 brw_wm_fs_emit(struct brw_context *brw,
4323 void *mem_ctx,
4324 const struct brw_wm_prog_key *key,
4325 struct brw_wm_prog_data *prog_data,
4326 struct gl_fragment_program *fp,
4327 struct gl_shader_program *prog,
4328 unsigned *final_assembly_size)
4329 {
4330 bool start_busy = false;
4331 double start_time = 0;
4332
4333 if (unlikely(brw->perf_debug)) {
4334 start_busy = (brw->batch.last_bo &&
4335 drm_intel_bo_busy(brw->batch.last_bo));
4336 start_time = get_time();
4337 }
4338
4339 struct brw_shader *shader = NULL;
4340 if (prog)
4341 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4342
4343 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4344 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4345
4346 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4347 */
4348 fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4349 prog, &fp->Base, 8);
4350 if (!v.run_fs()) {
4351 if (prog) {
4352 prog->LinkStatus = false;
4353 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4354 }
4355
4356 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4357 v.fail_msg);
4358
4359 return NULL;
4360 }
4361
4362 cfg_t *simd16_cfg = NULL;
4363 fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4364 prog, &fp->Base, 16);
4365 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4366 if (!v.simd16_unsupported) {
4367 /* Try a SIMD16 compile */
4368 v2.import_uniforms(&v);
4369 if (!v2.run_fs()) {
4370 perf_debug("SIMD16 shader failed to compile, falling back to "
4371 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4372 } else {
4373 simd16_cfg = v2.cfg;
4374 }
4375 } else {
4376 perf_debug("SIMD16 shader unsupported, falling back to "
4377 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4378 }
4379 }
4380
4381 cfg_t *simd8_cfg;
4382 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4383 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4384 simd8_cfg = NULL;
4385 prog_data->no_8 = true;
4386 } else {
4387 simd8_cfg = v.cfg;
4388 prog_data->no_8 = false;
4389 }
4390
4391 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4392 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4393
4394 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4395 char *name;
4396 if (prog)
4397 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4398 prog->Label ? prog->Label : "unnamed",
4399 prog->Name);
4400 else
4401 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4402
4403 g.enable_debug(name);
4404 }
4405
4406 if (simd8_cfg)
4407 g.generate_code(simd8_cfg, 8);
4408 if (simd16_cfg)
4409 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4410
4411 if (unlikely(brw->perf_debug) && shader) {
4412 if (shader->compiled_once)
4413 brw_wm_debug_recompile(brw, prog, key);
4414 shader->compiled_once = true;
4415
4416 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4417 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4418 (get_time() - start_time) * 1000);
4419 }
4420 }
4421
4422 return g.get_assembly(final_assembly_size);
4423 }
4424
4425 extern "C" bool
4426 brw_fs_precompile(struct gl_context *ctx,
4427 struct gl_shader_program *shader_prog,
4428 struct gl_program *prog)
4429 {
4430 struct brw_context *brw = brw_context(ctx);
4431 struct brw_wm_prog_key key;
4432
4433 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4434 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4435 bool program_uses_dfdy = fp->UsesDFdy;
4436
4437 memset(&key, 0, sizeof(key));
4438
4439 if (brw->gen < 6) {
4440 if (fp->UsesKill)
4441 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4442
4443 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4444 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4445
4446 /* Just assume depth testing. */
4447 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4448 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4449 }
4450
4451 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4452 BRW_FS_VARYING_INPUT_MASK) > 16)
4453 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4454
4455 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4456
4457 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4458 key.drawable_height = ctx->DrawBuffer->Height;
4459 }
4460
4461 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4462 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4463 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4464
4465 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4466 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4467 key.nr_color_regions > 1;
4468 }
4469
4470 key.program_string_id = bfp->id;
4471
4472 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4473 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4474
4475 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4476
4477 brw->wm.base.prog_offset = old_prog_offset;
4478 brw->wm.prog_data = old_prog_data;
4479
4480 return success;
4481 }
4482
4483 void
4484 brw_setup_tex_for_precompile(struct brw_context *brw,
4485 struct brw_sampler_prog_key_data *tex,
4486 struct gl_program *prog)
4487 {
4488 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4489 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4490 for (unsigned i = 0; i < sampler_count; i++) {
4491 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4492 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4493 tex->swizzles[i] =
4494 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4495 } else {
4496 /* Color sampler: assume no swizzling. */
4497 tex->swizzles[i] = SWIZZLE_XYZW;
4498 }
4499 }
4500 }