i965/fs: Migrate FS gl_SamplePosition/ID computation code to the IR builder.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 using namespace brw;
53
54 void
55 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
56 const fs_reg *src, unsigned sources)
57 {
58 memset(this, 0, sizeof(*this));
59
60 this->src = new fs_reg[MAX2(sources, 3)];
61 for (unsigned i = 0; i < sources; i++)
62 this->src[i] = src[i];
63
64 this->opcode = opcode;
65 this->dst = dst;
66 this->sources = sources;
67 this->exec_size = exec_size;
68
69 assert(dst.file != IMM && dst.file != UNIFORM);
70
71 /* If exec_size == 0, try to guess it from the registers. Since all
72 * manner of things may use hardware registers, we first try to guess
73 * based on GRF registers. If this fails, we will go ahead and take the
74 * width from the destination register.
75 */
76 if (this->exec_size == 0) {
77 if (dst.file == GRF) {
78 this->exec_size = dst.width;
79 } else {
80 for (unsigned i = 0; i < sources; ++i) {
81 if (src[i].file != GRF && src[i].file != ATTR)
82 continue;
83
84 if (this->exec_size <= 1)
85 this->exec_size = src[i].width;
86 assert(src[i].width == 1 || src[i].width == this->exec_size);
87 }
88 }
89
90 if (this->exec_size == 0 && dst.file != BAD_FILE)
91 this->exec_size = dst.width;
92 }
93 assert(this->exec_size != 0);
94
95 this->conditional_mod = BRW_CONDITIONAL_NONE;
96
97 /* This will be the case for almost all instructions. */
98 switch (dst.file) {
99 case GRF:
100 case HW_REG:
101 case MRF:
102 case ATTR:
103 this->regs_written =
104 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
105 break;
106 case BAD_FILE:
107 this->regs_written = 0;
108 break;
109 case IMM:
110 case UNIFORM:
111 unreachable("Invalid destination register file");
112 default:
113 unreachable("Invalid register file");
114 }
115
116 this->writes_accumulator = false;
117 }
118
119 fs_inst::fs_inst()
120 {
121 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
122 }
123
124 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
125 {
126 init(opcode, exec_size, reg_undef, NULL, 0);
127 }
128
129 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
130 {
131 init(opcode, 0, dst, NULL, 0);
132 }
133
134 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
135 const fs_reg &src0)
136 {
137 const fs_reg src[1] = { src0 };
138 init(opcode, exec_size, dst, src, 1);
139 }
140
141 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
142 {
143 const fs_reg src[1] = { src0 };
144 init(opcode, 0, dst, src, 1);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
148 const fs_reg &src0, const fs_reg &src1)
149 {
150 const fs_reg src[2] = { src0, src1 };
151 init(opcode, exec_size, dst, src, 2);
152 }
153
154 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
155 const fs_reg &src1)
156 {
157 const fs_reg src[2] = { src0, src1 };
158 init(opcode, 0, dst, src, 2);
159 }
160
161 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
162 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
163 {
164 const fs_reg src[3] = { src0, src1, src2 };
165 init(opcode, exec_size, dst, src, 3);
166 }
167
168 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
169 const fs_reg &src1, const fs_reg &src2)
170 {
171 const fs_reg src[3] = { src0, src1, src2 };
172 init(opcode, 0, dst, src, 3);
173 }
174
175 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
176 const fs_reg src[], unsigned sources)
177 {
178 init(opcode, 0, dst, src, sources);
179 }
180
181 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
182 const fs_reg src[], unsigned sources)
183 {
184 init(opcode, exec_width, dst, src, sources);
185 }
186
187 fs_inst::fs_inst(const fs_inst &that)
188 {
189 memcpy(this, &that, sizeof(that));
190
191 this->src = new fs_reg[MAX2(that.sources, 3)];
192
193 for (unsigned i = 0; i < that.sources; i++)
194 this->src[i] = that.src[i];
195 }
196
197 fs_inst::~fs_inst()
198 {
199 delete[] this->src;
200 }
201
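/**
 * Resize the source array of this instruction to \p num_sources, preserving
 * as many of the existing sources as fit in the new count.
 */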
202 void
203 fs_inst::resize_sources(uint8_t num_sources)
204 {
205 if (this->sources != num_sources) {
206 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
207
208 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
209 src[i] = this->src[i];
210
211 delete[] this->src;
212 this->src = src;
213 this->sources = num_sources;
214 }
215 }
216
217 #define ALU1(op) \
218 fs_inst * \
219 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
220 { \
221 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
222 }
223
224 #define ALU2(op) \
225 fs_inst * \
226 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
227 const fs_reg &src1) \
228 { \
229 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
230 }
231
232 #define ALU2_ACC(op) \
233 fs_inst * \
234 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
235 const fs_reg &src1) \
236 { \
237 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
238 inst->writes_accumulator = true; \
239 return inst; \
240 }
241
242 #define ALU3(op) \
243 fs_inst * \
244 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
245 const fs_reg &src1, const fs_reg &src2) \
246 { \
247 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
248 }
249
250 ALU1(NOT)
251 ALU1(MOV)
252 ALU1(FRC)
253 ALU1(RNDD)
254 ALU1(RNDE)
255 ALU1(RNDZ)
256 ALU2(ADD)
257 ALU2(MUL)
258 ALU2_ACC(MACH)
259 ALU2(AND)
260 ALU2(OR)
261 ALU2(XOR)
262 ALU2(SHL)
263 ALU2(SHR)
264 ALU2(ASR)
265 ALU3(LRP)
266 ALU1(BFREV)
267 ALU3(BFE)
268 ALU2(BFI1)
269 ALU3(BFI2)
270 ALU1(FBH)
271 ALU1(FBL)
272 ALU1(CBIT)
273 ALU3(MAD)
274 ALU2_ACC(ADDC)
275 ALU2_ACC(SUBB)
276 ALU2(SEL)
277 ALU2(MAC)
278
279 /** Gen4 predicated IF. */
280 fs_inst *
281 fs_visitor::IF(enum brw_predicate predicate)
282 {
283 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
284 inst->predicate = predicate;
285 return inst;
286 }
287
288 /** Gen6 IF with embedded comparison. */
289 fs_inst *
290 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
291 enum brw_conditional_mod condition)
292 {
293 assert(devinfo->gen == 6);
294 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
295 reg_null_d, src0, src1);
296 inst->conditional_mod = condition;
297 return inst;
298 }
299
300 /**
301 * CMP: Sets the low bit of the destination channels with the result
302 * of the comparison, while the upper bits are undefined, and updates
303 * the flag register with the packed 16 bits of the result.
304 */
305 fs_inst *
306 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
307 enum brw_conditional_mod condition)
308 {
309 fs_inst *inst;
310
311 /* Take the instruction:
312 *
313 * CMP null<d> src0<f> src1<f>
314 *
315 * Original gen4 does type conversion to the destination type before
316 * comparison, producing garbage results for floating point comparisons.
317 *
318 * The destination type doesn't matter on newer generations, so we set the
319 * type to match src0 so we can compact the instruction.
320 */
321 dst.type = src0.type;
322 if (dst.file == HW_REG)
323 dst.fixed_hw_reg.type = dst.type;
324
325 resolve_ud_negate(&src0);
326 resolve_ud_negate(&src1);
327
328 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
329 inst->conditional_mod = condition;
330
331 return inst;
332 }
333
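/**
 * Collect a set of registers into a single message payload.
 *
 * The first \p header_size sources are header registers and each take one
 * full register regardless of the destination width; every remaining source
 * occupies dst.width / 8 registers of the payload.
 */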
334 fs_inst *
335 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
336 int header_size)
337 {
338 assert(dst.width % 8 == 0);
339 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
340 dst, src, sources);
341 inst->header_size = header_size;
342
343 for (int i = 0; i < header_size; i++)
344 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
345 inst->regs_written = header_size;
346
347 for (int i = header_size; i < sources; ++i)
348 assert(src[i].file != GRF || src[i].width == dst.width);
349 inst->regs_written += (sources - header_size) * (dst.width / 8);
350
351 return inst;
352 }
353
354 void
355 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
356 const fs_reg &dst,
357 const fs_reg &surf_index,
358 const fs_reg &varying_offset,
359 uint32_t const_offset)
360 {
361 /* We have our constant surface use a pitch of 4 bytes, so our index can
362 * be any component of a vector, and then we load 4 contiguous
363 * components starting from that.
364 *
365 * We break down the const_offset to a portion added to the variable
366 * offset and a portion done using reg_offset, which means that if you
367 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
368 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
369 * CSE can later notice that those loads are all the same and eliminate
370 * the redundant ones.
371 */
372 fs_reg vec4_offset = vgrf(glsl_type::int_type);
373 bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
374
375 int scale = 1;
376 if (devinfo->gen == 4 && dst.width == 8) {
377 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
378 * u, v, r) as parameters, or we can just use the SIMD16 message
379 * consisting of (header, u). We choose the second, at the cost of a
380 * longer return length.
381 */
382 scale = 2;
383 }
384
385 enum opcode op;
386 if (devinfo->gen >= 7)
387 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
388 else
389 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
390
391 assert(dst.width % 8 == 0);
392 int regs_written = 4 * (dst.width / 8) * scale;
393 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
394 dst.type, dst.width);
395 fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
396 inst->regs_written = regs_written;
397
398 if (devinfo->gen < 7) {
399 inst->base_mrf = 13;
400 inst->header_size = 1;
401 if (devinfo->gen == 4)
402 inst->mlen = 3;
403 else
404 inst->mlen = 1 + dispatch_width / 8;
405 }
406
407 bld.MOV(dst, offset(vec4_result, (const_offset & 3) * scale));
408 }
409
410 /**
411 * A helper for MOV generation for fixing up broken hardware SEND dependency
412 * handling.
413 */
414 void
415 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
416 {
417 /* The caller always wants uncompressed to emit the minimal extra
418 * dependencies, and to avoid having to deal with aligning its regs to 2.
419 */
420 const fs_builder ubld = bld.annotate("send dependency resolve")
421 .half(0);
422
423 ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
424 }
425
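/**
 * Field-by-field comparison of two instructions.  Note that only the first
 * three sources are compared, so the result is only meaningful for
 * instructions with at most three sources.
 */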
426 bool
427 fs_inst::equals(fs_inst *inst) const
428 {
429 return (opcode == inst->opcode &&
430 dst.equals(inst->dst) &&
431 src[0].equals(inst->src[0]) &&
432 src[1].equals(inst->src[1]) &&
433 src[2].equals(inst->src[2]) &&
434 saturate == inst->saturate &&
435 predicate == inst->predicate &&
436 conditional_mod == inst->conditional_mod &&
437 mlen == inst->mlen &&
438 base_mrf == inst->base_mrf &&
439 target == inst->target &&
440 eot == inst->eot &&
441 header_size == inst->header_size &&
442 shadow_compare == inst->shadow_compare &&
443 exec_size == inst->exec_size &&
444 offset == inst->offset);
445 }
446
447 bool
448 fs_inst::overwrites_reg(const fs_reg &reg) const
449 {
450 return reg.in_range(dst, regs_written);
451 }
452
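/**
 * Returns true if this instruction is a SEND-like message whose payload is
 * sourced from the GRF rather than from MRFs.
 */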
453 bool
454 fs_inst::is_send_from_grf() const
455 {
456 switch (opcode) {
457 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
458 case SHADER_OPCODE_SHADER_TIME_ADD:
459 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
460 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
461 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
462 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
463 case SHADER_OPCODE_UNTYPED_ATOMIC:
464 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
465 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
466 case SHADER_OPCODE_TYPED_ATOMIC:
467 case SHADER_OPCODE_TYPED_SURFACE_READ:
468 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
469 case SHADER_OPCODE_URB_WRITE_SIMD8:
470 return true;
471 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
472 return src[1].file == GRF;
473 case FS_OPCODE_FB_WRITE:
474 return src[0].file == GRF;
475 default:
476 if (is_tex())
477 return src[0].file == GRF;
478
479 return false;
480 }
481 }
482
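/**
 * Returns true if this LOAD_PAYLOAD is a plain copy of a single contiguous
 * virtual GRF, i.e. its sources are consecutive offsets of one register and
 * together cover the whole allocation.
 */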
483 bool
484 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
485 {
486 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
487 return false;
488
489 fs_reg reg = this->src[0];
490 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
491 return false;
492
493 if (grf_alloc.sizes[reg.reg] != this->regs_written)
494 return false;
495
496 for (int i = 0; i < this->sources; i++) {
497 reg.type = this->src[i].type;
498 reg.width = this->src[i].width;
499 if (!this->src[i].equals(reg))
500 return false;
501 reg = ::offset(reg, 1);
502 }
503
504 return true;
505 }
506
507 bool
508 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
509 {
510 if (devinfo->gen == 6 && is_math())
511 return false;
512
513 if (is_send_from_grf())
514 return false;
515
516 if (!backend_instruction::can_do_source_mods())
517 return false;
518
519 return true;
520 }
521
522 bool
523 fs_inst::has_side_effects() const
524 {
525 return this->eot || backend_instruction::has_side_effects();
526 }
527
528 void
529 fs_reg::init()
530 {
531 memset(this, 0, sizeof(*this));
532 stride = 1;
533 }
534
535 /** Generic unset register constructor. */
536 fs_reg::fs_reg()
537 {
538 init();
539 this->file = BAD_FILE;
540 }
541
542 /** Immediate value constructor. */
543 fs_reg::fs_reg(float f)
544 {
545 init();
546 this->file = IMM;
547 this->type = BRW_REGISTER_TYPE_F;
548 this->fixed_hw_reg.dw1.f = f;
549 this->width = 1;
550 }
551
552 /** Immediate value constructor. */
553 fs_reg::fs_reg(int32_t i)
554 {
555 init();
556 this->file = IMM;
557 this->type = BRW_REGISTER_TYPE_D;
558 this->fixed_hw_reg.dw1.d = i;
559 this->width = 1;
560 }
561
562 /** Immediate value constructor. */
563 fs_reg::fs_reg(uint32_t u)
564 {
565 init();
566 this->file = IMM;
567 this->type = BRW_REGISTER_TYPE_UD;
568 this->fixed_hw_reg.dw1.ud = u;
569 this->width = 1;
570 }
571
572 /** Vector float immediate value constructor. */
573 fs_reg::fs_reg(uint8_t vf[4])
574 {
575 init();
576 this->file = IMM;
577 this->type = BRW_REGISTER_TYPE_VF;
578 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
588 (vf1 << 8) |
589 (vf2 << 16) |
590 (vf3 << 24);
591 }
592
593 /** Fixed brw_reg. */
594 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
595 {
596 init();
597 this->file = HW_REG;
598 this->fixed_hw_reg = fixed_hw_reg;
599 this->type = fixed_hw_reg.type;
600 this->width = 1 << fixed_hw_reg.width;
601 }
602
603 bool
604 fs_reg::equals(const fs_reg &r) const
605 {
606 return (file == r.file &&
607 reg == r.reg &&
608 reg_offset == r.reg_offset &&
609 subreg_offset == r.subreg_offset &&
610 type == r.type &&
611 negate == r.negate &&
612 abs == r.abs &&
613 !reladdr && !r.reladdr &&
614 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
615 width == r.width &&
616 stride == r.stride);
617 }
618
619 fs_reg &
620 fs_reg::set_smear(unsigned subreg)
621 {
622 assert(file != HW_REG && file != IMM);
623 subreg_offset = subreg * type_sz(type);
624 stride = 0;
625 return *this;
626 }
627
628 bool
629 fs_reg::is_contiguous() const
630 {
631 return stride == 1;
632 }
633
634 int
635 fs_visitor::type_size(const struct glsl_type *type)
636 {
637 unsigned int size, i;
638
639 switch (type->base_type) {
640 case GLSL_TYPE_UINT:
641 case GLSL_TYPE_INT:
642 case GLSL_TYPE_FLOAT:
643 case GLSL_TYPE_BOOL:
644 return type->components();
645 case GLSL_TYPE_ARRAY:
646 return type_size(type->fields.array) * type->length;
647 case GLSL_TYPE_STRUCT:
648 size = 0;
649 for (i = 0; i < type->length; i++) {
650 size += type_size(type->fields.structure[i].type);
651 }
652 return size;
653 case GLSL_TYPE_SAMPLER:
654 /* Samplers take up no register space, since they're baked in at
655 * link time.
656 */
657 return 0;
658 case GLSL_TYPE_ATOMIC_UINT:
659 return 0;
660 case GLSL_TYPE_IMAGE:
661 case GLSL_TYPE_VOID:
662 case GLSL_TYPE_ERROR:
663 case GLSL_TYPE_INTERFACE:
664 case GLSL_TYPE_DOUBLE:
665 unreachable("not reached");
666 }
667
668 return 0;
669 }
670
671 /**
672 * Create a MOV to read the timestamp register.
673 *
674 * The caller is responsible for emitting the MOV. The return value is
675 * the destination of the MOV, with extra parameters set.
676 */
677 fs_reg
678 fs_visitor::get_timestamp(const fs_builder &bld)
679 {
680 assert(devinfo->gen >= 7);
681
682 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
683 BRW_ARF_TIMESTAMP,
684 0),
685 BRW_REGISTER_TYPE_UD));
686
687 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
688
689    /* We want to read the 3 fields we care about even if they're not enabled
690     * in the dispatch.
691 */
692 bld.exec_all().MOV(dst, ts);
693
694 /* The caller wants the low 32 bits of the timestamp. Since it's running
695     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
696 * which is plenty of time for our purposes. It is identical across the
697 * EUs, but since it's tracking GPU core speed it will increment at a
698 * varying rate as render P-states change.
699 *
700 * The caller could also check if render P-states have changed (or anything
701 * else that might disrupt timing) by setting smear to 2 and checking if
702 * that field is != 0.
703 */
704 dst.set_smear(0);
705
706 return dst;
707 }
708
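/**
 * Read the timestamp register at the start of the shader so the total
 * execution time can be computed in emit_shader_time_end().
 */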
709 void
710 fs_visitor::emit_shader_time_begin()
711 {
712 shader_start_time = get_timestamp(bld.annotate("shader time start"));
713 }
714
715 void
716 fs_visitor::emit_shader_time_end()
717 {
718 enum shader_time_shader_type type, written_type, reset_type;
719 switch (stage) {
720 case MESA_SHADER_VERTEX:
721 type = ST_VS;
722 written_type = ST_VS_WRITTEN;
723 reset_type = ST_VS_RESET;
724 break;
725 case MESA_SHADER_GEOMETRY:
726 type = ST_GS;
727 written_type = ST_GS_WRITTEN;
728 reset_type = ST_GS_RESET;
729 break;
730 case MESA_SHADER_FRAGMENT:
731 if (dispatch_width == 8) {
732 type = ST_FS8;
733 written_type = ST_FS8_WRITTEN;
734 reset_type = ST_FS8_RESET;
735 } else {
736 assert(dispatch_width == 16);
737 type = ST_FS16;
738 written_type = ST_FS16_WRITTEN;
739 reset_type = ST_FS16_RESET;
740 }
741 break;
742 case MESA_SHADER_COMPUTE:
743 type = ST_CS;
744 written_type = ST_CS_WRITTEN;
745 reset_type = ST_CS_RESET;
746 break;
747 default:
748 unreachable("fs_visitor::emit_shader_time_end missing code");
749 }
750
751 /* Insert our code just before the final SEND with EOT. */
752 exec_node *end = this->instructions.get_tail();
753 assert(end && ((fs_inst *) end)->eot);
754 const fs_builder ibld = bld.annotate("shader time end")
755 .exec_all().at(NULL, end);
756
757 fs_reg shader_end_time = get_timestamp(ibld);
758
759 /* Check that there weren't any timestamp reset events (assuming these
760 * were the only two timestamp reads that happened).
761 */
762 fs_reg reset = shader_end_time;
763 reset.set_smear(2);
764 set_condmod(BRW_CONDITIONAL_Z,
765 ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
766 ibld.IF(BRW_PREDICATE_NORMAL);
767
768 fs_reg start = shader_start_time;
769 start.negate = true;
770 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
771 diff.set_smear(0);
772 ibld.ADD(diff, start, shader_end_time);
773
774 /* If there were no instructions between the two timestamp gets, the diff
775     * is 2 cycles.  Remove that overhead so it doesn't get counted when
776     * measuring the time taken by individual instructions.
777 */
778 ibld.ADD(diff, diff, fs_reg(-2u));
779 SHADER_TIME_ADD(ibld, type, diff);
780 SHADER_TIME_ADD(ibld, written_type, fs_reg(1u));
781 ibld.emit(BRW_OPCODE_ELSE);
782 SHADER_TIME_ADD(ibld, reset_type, fs_reg(1u));
783 ibld.emit(BRW_OPCODE_ENDIF);
784 }
785
786 void
787 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
788 enum shader_time_shader_type type, fs_reg value)
789 {
790 int shader_time_index =
791 brw_get_shader_time_index(brw, shader_prog, prog, type);
792 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
793
794 fs_reg payload;
795 if (dispatch_width == 8)
796 payload = vgrf(glsl_type::uvec2_type);
797 else
798 payload = vgrf(glsl_type::uint_type);
799
800 bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
801 }
802
803 void
804 fs_visitor::vfail(const char *format, va_list va)
805 {
806 char *msg;
807
808 if (failed)
809 return;
810
811 failed = true;
812
813 msg = ralloc_vasprintf(mem_ctx, format, va);
814 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
815
816 this->fail_msg = msg;
817
818 if (debug_enabled) {
819 fprintf(stderr, "%s", msg);
820 }
821 }
822
823 void
824 fs_visitor::fail(const char *format, ...)
825 {
826 va_list va;
827
828 va_start(va, format);
829 vfail(format, va);
830 va_end(va);
831 }
832
833 /**
834 * Mark this program as impossible to compile in SIMD16 mode.
835 *
836 * During the SIMD8 compile (which happens first), we can detect and flag
837 * things that are unsupported in SIMD16 mode, so the compiler can skip
838 * the SIMD16 compile altogether.
839 *
840 * During a SIMD16 compile (if one happens anyway), this just calls fail().
841 */
842 void
843 fs_visitor::no16(const char *format, ...)
844 {
845 va_list va;
846
847 va_start(va, format);
848
849 if (dispatch_width == 16) {
850 vfail(format, va);
851 } else {
852 simd16_unsupported = true;
853
854 if (brw->perf_debug) {
855 if (no16_msg)
856 ralloc_vasprintf_append(&no16_msg, format, va);
857 else
858 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
859 }
860 }
861
862 va_end(va);
863 }
864
865 fs_inst *
866 fs_visitor::emit(enum opcode opcode)
867 {
868 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
869 }
870
871 fs_inst *
872 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
873 {
874 return emit(new(mem_ctx) fs_inst(opcode, dst));
875 }
876
877 fs_inst *
878 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
879 {
880 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
881 }
882
883 fs_inst *
884 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
885 const fs_reg &src1)
886 {
887 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
888 }
889
890 fs_inst *
891 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
892 const fs_reg &src1, const fs_reg &src2)
893 {
894 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
895 }
896
897 fs_inst *
898 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
899 fs_reg src[], int sources)
900 {
901 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
902 }
903
904 /**
905 * Returns true if the instruction has a flag that means it won't
906 * update an entire destination register.
907 *
908 * For example, dead code elimination and live variable analysis want to know
909 * when a write to a variable screens off any preceding values that were in
910 * it.
911 */
912 bool
913 fs_inst::is_partial_write() const
914 {
915 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
916 (this->dst.width * type_sz(this->dst.type)) < 32 ||
917 !this->dst.is_contiguous());
918 }
919
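/**
 * Returns the number of registers read by source \p arg.
 *
 * Message-style opcodes that take their whole payload from the first source
 * report the message length; other sources are sized from their width,
 * stride and type.
 */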
920 int
921 fs_inst::regs_read(int arg) const
922 {
923 if (is_tex() && arg == 0 && src[0].file == GRF) {
924 return mlen;
925 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
926 return mlen;
927 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
928 return mlen;
929 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
930 return mlen;
931 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
932 return mlen;
933 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
934 return mlen;
935 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
936 return mlen;
937 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
938 return mlen;
939 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
940 return mlen;
941 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
942 return mlen;
943 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
944 return exec_size / 4;
945 }
946
947 switch (src[arg].file) {
948 case BAD_FILE:
949 case UNIFORM:
950 case IMM:
951 return 1;
952 case GRF:
953 case HW_REG:
954 if (src[arg].stride == 0) {
955 return 1;
956 } else {
957 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
958 return (size + 31) / 32;
959 }
960 case MRF:
961 unreachable("MRF registers are not allowed as sources");
962 default:
963 unreachable("Invalid register file");
964 }
965 }
966
967 bool
968 fs_inst::reads_flag() const
969 {
970 return predicate;
971 }
972
973 bool
974 fs_inst::writes_flag() const
975 {
976 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
977 opcode != BRW_OPCODE_IF &&
978 opcode != BRW_OPCODE_WHILE)) ||
979 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
980 }
981
982 /**
983 * Returns how many MRFs an FS opcode will write over.
984 *
985 * Note that this is not the 0 or 1 implied writes in an actual gen
986 * instruction -- the FS opcodes often generate MOVs in addition.
987 */
988 int
989 fs_visitor::implied_mrf_writes(fs_inst *inst)
990 {
991 if (inst->mlen == 0)
992 return 0;
993
994 if (inst->base_mrf == -1)
995 return 0;
996
997 switch (inst->opcode) {
998 case SHADER_OPCODE_RCP:
999 case SHADER_OPCODE_RSQ:
1000 case SHADER_OPCODE_SQRT:
1001 case SHADER_OPCODE_EXP2:
1002 case SHADER_OPCODE_LOG2:
1003 case SHADER_OPCODE_SIN:
1004 case SHADER_OPCODE_COS:
1005 return 1 * dispatch_width / 8;
1006 case SHADER_OPCODE_POW:
1007 case SHADER_OPCODE_INT_QUOTIENT:
1008 case SHADER_OPCODE_INT_REMAINDER:
1009 return 2 * dispatch_width / 8;
1010 case SHADER_OPCODE_TEX:
1011 case FS_OPCODE_TXB:
1012 case SHADER_OPCODE_TXD:
1013 case SHADER_OPCODE_TXF:
1014 case SHADER_OPCODE_TXF_CMS:
1015 case SHADER_OPCODE_TXF_MCS:
1016 case SHADER_OPCODE_TG4:
1017 case SHADER_OPCODE_TG4_OFFSET:
1018 case SHADER_OPCODE_TXL:
1019 case SHADER_OPCODE_TXS:
1020 case SHADER_OPCODE_LOD:
1021 return 1;
1022 case FS_OPCODE_FB_WRITE:
1023 return 2;
1024 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1025 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1026 return 1;
1027 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1028 return inst->mlen;
1029 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1030 return inst->mlen;
1031 case SHADER_OPCODE_UNTYPED_ATOMIC:
1032 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1033 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1034 case SHADER_OPCODE_TYPED_ATOMIC:
1035 case SHADER_OPCODE_TYPED_SURFACE_READ:
1036 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1037 case SHADER_OPCODE_URB_WRITE_SIMD8:
1038 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1039 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1040 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1041 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1042 return 0;
1043 default:
1044 unreachable("not reached");
1045 }
1046 }
1047
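/**
 * Allocate a virtual GRF large enough to hold one value of the given GLSL
 * type (or the given number of float components) per channel at the current
 * dispatch width.
 */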
1048 fs_reg
1049 fs_visitor::vgrf(const glsl_type *const type)
1050 {
1051 int reg_width = dispatch_width / 8;
1052 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1053 brw_type_for_base_type(type), dispatch_width);
1054 }
1055
1056 fs_reg
1057 fs_visitor::vgrf(int num_components)
1058 {
1059 int reg_width = dispatch_width / 8;
1060 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1061 BRW_REGISTER_TYPE_F, dispatch_width);
1062 }
1063
1064 /** Fixed HW reg constructor. */
1065 fs_reg::fs_reg(enum register_file file, int reg)
1066 {
1067 init();
1068 this->file = file;
1069 this->reg = reg;
1070 this->type = BRW_REGISTER_TYPE_F;
1071
1072 switch (file) {
1073 case UNIFORM:
1074 this->width = 1;
1075 break;
1076 default:
1077 this->width = 8;
1078 }
1079 }
1080
1081 /** Fixed HW reg constructor. */
1082 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1083 {
1084 init();
1085 this->file = file;
1086 this->reg = reg;
1087 this->type = type;
1088
1089 switch (file) {
1090 case UNIFORM:
1091 this->width = 1;
1092 break;
1093 default:
1094 this->width = 8;
1095 }
1096 }
1097
1098 /** Fixed HW reg constructor. */
1099 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1100 uint8_t width)
1101 {
1102 init();
1103 this->file = file;
1104 this->reg = reg;
1105 this->type = type;
1106 this->width = width;
1107 }
1108
1109 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1110  * This brings in those uniform definitions.
1111 */
1112 void
1113 fs_visitor::import_uniforms(fs_visitor *v)
1114 {
1115 this->push_constant_loc = v->push_constant_loc;
1116 this->pull_constant_loc = v->pull_constant_loc;
1117 this->uniforms = v->uniforms;
1118 this->param_size = v->param_size;
1119 }
1120
1121 fs_reg *
1122 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1123 bool origin_upper_left)
1124 {
1125 assert(stage == MESA_SHADER_FRAGMENT);
1126 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1127 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1128 fs_reg wpos = *reg;
1129 bool flip = !origin_upper_left ^ key->render_to_fbo;
1130
1131 /* gl_FragCoord.x */
1132 if (pixel_center_integer) {
1133 bld.MOV(wpos, this->pixel_x);
1134 } else {
1135 bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
1136 }
1137 wpos = offset(wpos, 1);
1138
1139 /* gl_FragCoord.y */
1140 if (!flip && pixel_center_integer) {
1141 bld.MOV(wpos, this->pixel_y);
1142 } else {
1143 fs_reg pixel_y = this->pixel_y;
1144 float offset = (pixel_center_integer ? 0.0 : 0.5);
1145
1146 if (flip) {
1147 pixel_y.negate = true;
1148 offset += key->drawable_height - 1.0;
1149 }
1150
1151 bld.ADD(wpos, pixel_y, fs_reg(offset));
1152 }
1153 wpos = offset(wpos, 1);
1154
1155 /* gl_FragCoord.z */
1156 if (devinfo->gen >= 6) {
1157 bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
1158 } else {
1159 bld.emit(FS_OPCODE_LINTERP, wpos,
1160 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1161 interp_reg(VARYING_SLOT_POS, 2));
1162 }
1163 wpos = offset(wpos, 1);
1164
1165 /* gl_FragCoord.w: Already set up in emit_interpolation */
1166 bld.MOV(wpos, this->wpos_w);
1167
1168 return reg;
1169 }
1170
1171 fs_inst *
1172 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1173 glsl_interp_qualifier interpolation_mode,
1174 bool is_centroid, bool is_sample)
1175 {
1176 brw_wm_barycentric_interp_mode barycoord_mode;
1177 if (devinfo->gen >= 6) {
1178 if (is_centroid) {
1179 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1180 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1181 else
1182 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1183 } else if (is_sample) {
1184 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1185 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1186 else
1187 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1188 } else {
1189 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1190 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1191 else
1192 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1193 }
1194 } else {
1195 /* On Ironlake and below, there is only one interpolation mode.
1196 * Centroid interpolation doesn't mean anything on this hardware --
1197 * there is no multisampling.
1198 */
1199 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1200 }
1201 return bld.emit(FS_OPCODE_LINTERP, attr,
1202 this->delta_xy[barycoord_mode], interp);
1203 }
1204
1205 void
1206 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1207 const glsl_type *type,
1208 glsl_interp_qualifier interpolation_mode,
1209 int location, bool mod_centroid,
1210 bool mod_sample)
1211 {
1212 attr.type = brw_type_for_base_type(type->get_scalar_type());
1213
1214 assert(stage == MESA_SHADER_FRAGMENT);
1215 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1216 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1217
1218 unsigned int array_elements;
1219
1220 if (type->is_array()) {
1221 array_elements = type->length;
1222 if (array_elements == 0) {
1223 fail("dereferenced array '%s' has length 0\n", name);
1224 }
1225 type = type->fields.array;
1226 } else {
1227 array_elements = 1;
1228 }
1229
1230 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1231 bool is_gl_Color =
1232 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1233 if (key->flat_shade && is_gl_Color) {
1234 interpolation_mode = INTERP_QUALIFIER_FLAT;
1235 } else {
1236 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1237 }
1238 }
1239
1240 for (unsigned int i = 0; i < array_elements; i++) {
1241 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1242 if (prog_data->urb_setup[location] == -1) {
1243 /* If there's no incoming setup data for this slot, don't
1244 * emit interpolation for it.
1245 */
1246 attr = offset(attr, type->vector_elements);
1247 location++;
1248 continue;
1249 }
1250
1251 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1252 /* Constant interpolation (flat shading) case. The SF has
1253 * handed us defined values in only the constant offset
1254 * field of the setup reg.
1255 */
1256 for (unsigned int k = 0; k < type->vector_elements; k++) {
1257 struct brw_reg interp = interp_reg(location, k);
1258 interp = suboffset(interp, 3);
1259 interp.type = attr.type;
1260 bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1261 attr = offset(attr, 1);
1262 }
1263 } else {
1264 /* Smooth/noperspective interpolation case. */
1265 for (unsigned int k = 0; k < type->vector_elements; k++) {
1266 struct brw_reg interp = interp_reg(location, k);
1267 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1268 /* Get the pixel/sample mask into f0 so that we know
1269 * which pixels are lit. Then, for each channel that is
1270 * unlit, replace the centroid data with non-centroid
1271 * data.
1272 */
1273 bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1274
1275 fs_inst *inst;
1276 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1277 false, false);
1278 inst->predicate = BRW_PREDICATE_NORMAL;
1279 inst->predicate_inverse = true;
1280 if (devinfo->has_pln)
1281 inst->no_dd_clear = true;
1282
1283 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1284 mod_centroid && !key->persample_shading,
1285 mod_sample || key->persample_shading);
1286 inst->predicate = BRW_PREDICATE_NORMAL;
1287 inst->predicate_inverse = false;
1288 if (devinfo->has_pln)
1289 inst->no_dd_check = true;
1290
1291 } else {
1292 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1293 mod_centroid && !key->persample_shading,
1294 mod_sample || key->persample_shading);
1295 }
1296 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1297 bld.MUL(attr, attr, this->pixel_w);
1298 }
1299 attr = offset(attr, 1);
1300 }
1301
1302 }
1303 location++;
1304 }
1305 }
1306 }
1307
1308 fs_reg *
1309 fs_visitor::emit_frontfacing_interpolation()
1310 {
1311 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1312
1313 if (devinfo->gen >= 6) {
1314 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1315 * a boolean result from this (~0/true or 0/false).
1316 *
1317 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1318 * this task in only one instruction:
1319 * - a negation source modifier will flip the bit; and
1320 * - a W -> D type conversion will sign extend the bit into the high
1321 * word of the destination.
1322 *
1323 * An ASR 15 fills the low word of the destination.
1324 */
1325 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1326 g0.negate = true;
1327
1328 bld.ASR(*reg, g0, fs_reg(15));
1329 } else {
1330 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1331 * a boolean result from this (1/true or 0/false).
1332 *
1333 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1334 * the negation source modifier to flip it. Unfortunately the SHR
1335 * instruction only operates on UD (or D with an abs source modifier)
1336 * sources without negation.
1337 *
1338 * Instead, use ASR (which will give ~0/true or 0/false).
1339 */
1340 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1341 g1_6.negate = true;
1342
1343 bld.ASR(*reg, g1_6, fs_reg(31));
1344 }
1345
1346 return reg;
1347 }
1348
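/**
 * Convert the integer sample position payload into the floating-point
 * gl_SamplePosition value in the [0, 1] range, or 0.5 when per-sample
 * position offsets aren't being computed.
 */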
1349 void
1350 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1351 {
1352 assert(stage == MESA_SHADER_FRAGMENT);
1353 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1354 assert(dst.type == BRW_REGISTER_TYPE_F);
1355
1356 if (key->compute_pos_offset) {
1357 /* Convert int_sample_pos to floating point */
1358 bld.MOV(dst, int_sample_pos);
1359 /* Scale to the range [0, 1] */
1360 bld.MUL(dst, dst, fs_reg(1 / 16.0f));
1361 }
1362 else {
1363       /* From the ARB_sample_shading specification:
1364 * "When rendering to a non-multisample buffer, or if multisample
1365 * rasterization is disabled, gl_SamplePosition will always be
1366        *  (0.5, 0.5)."
1367 */
1368 bld.MOV(dst, fs_reg(0.5f));
1369 }
1370 }
1371
1372 fs_reg *
1373 fs_visitor::emit_samplepos_setup()
1374 {
1375 assert(devinfo->gen >= 6);
1376
1377 const fs_builder abld = bld.annotate("compute sample position");
1378 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1379 fs_reg pos = *reg;
1380 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1381 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1382
1383 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1384 * mode will be enabled.
1385 *
1386 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1387 * R31.1:0 Position Offset X/Y for Slot[3:0]
1388 * R31.3:2 Position Offset X/Y for Slot[7:4]
1389 * .....
1390 *
1391 * The X, Y sample positions come in as bytes in thread payload. So, read
1392 * the positions using vstride=16, width=8, hstride=2.
1393 */
1394 struct brw_reg sample_pos_reg =
1395 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1396 BRW_REGISTER_TYPE_B), 16, 8, 2);
1397
1398 if (dispatch_width == 8) {
1399 abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
1400 } else {
1401 abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
1402 abld.half(1).MOV(half(int_sample_x, 1),
1403 fs_reg(suboffset(sample_pos_reg, 16)));
1404 }
1405 /* Compute gl_SamplePosition.x */
1406 compute_sample_position(pos, int_sample_x);
1407 pos = offset(pos, 1);
1408 if (dispatch_width == 8) {
1409 abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
1410 } else {
1411 abld.half(0).MOV(half(int_sample_y, 0),
1412 fs_reg(suboffset(sample_pos_reg, 1)));
1413 abld.half(1).MOV(half(int_sample_y, 1),
1414 fs_reg(suboffset(sample_pos_reg, 17)));
1415 }
1416 /* Compute gl_SamplePosition.y */
1417 compute_sample_position(pos, int_sample_y);
1418 return reg;
1419 }
1420
1421 fs_reg *
1422 fs_visitor::emit_sampleid_setup()
1423 {
1424 assert(stage == MESA_SHADER_FRAGMENT);
1425 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1426 assert(devinfo->gen >= 6);
1427
1428 const fs_builder abld = bld.annotate("compute sample id");
1429 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1430
1431 if (key->compute_sample_id) {
1432 fs_reg t1 = vgrf(glsl_type::int_type);
1433 fs_reg t2 = vgrf(glsl_type::int_type);
1434 t2.type = BRW_REGISTER_TYPE_UW;
1435
1436 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1437 * 8x multisampling, subspan 0 will represent sample N (where N
1438 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1439 * 7. We can find the value of N by looking at R0.0 bits 7:6
1440 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1441 * (since samples are always delivered in pairs). That is, we
1442 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1443 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1444 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1445 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1446 * populating a temporary variable with the sequence (0, 1, 2, 3),
1447 * and then reading from it using vstride=1, width=4, hstride=0.
1448 * These computations hold good for 4x multisampling as well.
1449 *
1450 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1451 * the first four slots are sample 0 of subspan 0; the next four
1452 * are sample 1 of subspan 0; the third group is sample 0 of
1453 * subspan 1, and finally sample 1 of subspan 1.
1454 */
1455 abld.exec_all()
1456 .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1457 fs_reg(0xc0));
1458 abld.exec_all().SHR(t1, t1, fs_reg(5));
1459
1460 /* This works for both SIMD8 and SIMD16 */
1461 abld.exec_all()
1462 .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
1463
1464 /* This special instruction takes care of setting vstride=1,
1465 * width=4, hstride=0 of t2 during an ADD instruction.
1466 */
1467 abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1468 } else {
1469 /* As per GL_ARB_sample_shading specification:
1470 * "When rendering to a non-multisample buffer, or if multisample
1471 * rasterization is disabled, gl_SampleID will always be zero."
1472 */
1473 abld.MOV(*reg, fs_reg(0));
1474 }
1475
1476 return reg;
1477 }
1478
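/**
 * Resolve any abs/negate source modifiers on \p src by copying it into a
 * fresh temporary, leaving \p src modifier-free.
 */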
1479 void
1480 fs_visitor::resolve_source_modifiers(fs_reg *src)
1481 {
1482 if (!src->abs && !src->negate)
1483 return;
1484
1485 fs_reg temp = retype(vgrf(1), src->type);
1486 emit(MOV(temp, *src));
1487 *src = temp;
1488 }
1489
1490 fs_reg
1491 fs_visitor::fix_math_operand(fs_reg src)
1492 {
1493 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1494 * might be able to do better by doing execsize = 1 math and then
1495 * expanding that result out, but we would need to be careful with
1496 * masking.
1497 *
1498 * The hardware ignores source modifiers (negate and abs) on math
1499 * instructions, so we also move to a temp to set those up.
1500 */
1501 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1502 !src.abs && !src.negate)
1503 return src;
1504
1505 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1506 * operands to math
1507 */
1508 if (devinfo->gen >= 7 && src.file != IMM)
1509 return src;
1510
1511 fs_reg expanded = vgrf(glsl_type::float_type);
1512 expanded.type = src.type;
1513 emit(BRW_OPCODE_MOV, expanded, src);
1514 return expanded;
1515 }
1516
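/**
 * Emit a single-operand math instruction, applying the operand restrictions
 * and message setup required on older hardware generations.
 */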
1517 fs_inst *
1518 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1519 {
1520 switch (opcode) {
1521 case SHADER_OPCODE_RCP:
1522 case SHADER_OPCODE_RSQ:
1523 case SHADER_OPCODE_SQRT:
1524 case SHADER_OPCODE_EXP2:
1525 case SHADER_OPCODE_LOG2:
1526 case SHADER_OPCODE_SIN:
1527 case SHADER_OPCODE_COS:
1528 break;
1529 default:
1530 unreachable("not reached: bad math opcode");
1531 }
1532
1533 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1534 * might be able to do better by doing execsize = 1 math and then
1535 * expanding that result out, but we would need to be careful with
1536 * masking.
1537 *
1538 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1539 * instructions, so we also move to a temp to set those up.
1540 */
1541 if (devinfo->gen == 6 || devinfo->gen == 7)
1542 src = fix_math_operand(src);
1543
1544 fs_inst *inst = emit(opcode, dst, src);
1545
1546 if (devinfo->gen < 6) {
1547 inst->base_mrf = 2;
1548 inst->mlen = dispatch_width / 8;
1549 }
1550
1551 return inst;
1552 }
1553
1554 fs_inst *
1555 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1556 {
1557 int base_mrf = 2;
1558 fs_inst *inst;
1559
1560 if (devinfo->gen >= 8) {
1561 inst = emit(opcode, dst, src0, src1);
1562 } else if (devinfo->gen >= 6) {
1563 src0 = fix_math_operand(src0);
1564 src1 = fix_math_operand(src1);
1565
1566 inst = emit(opcode, dst, src0, src1);
1567 } else {
1568 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1569 * "Message Payload":
1570 *
1571 * "Operand0[7]. For the INT DIV functions, this operand is the
1572 * denominator."
1573 * ...
1574 * "Operand1[7]. For the INT DIV functions, this operand is the
1575 * numerator."
1576 */
1577 bool is_int_div = opcode != SHADER_OPCODE_POW;
1578 fs_reg &op0 = is_int_div ? src1 : src0;
1579 fs_reg &op1 = is_int_div ? src0 : src1;
1580
1581 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1582 inst = emit(opcode, dst, op0, reg_null_f);
1583
1584 inst->base_mrf = base_mrf;
1585 inst->mlen = 2 * dispatch_width / 8;
1586 }
1587 return inst;
1588 }
1589
1590 void
1591 fs_visitor::emit_discard_jump()
1592 {
1593 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1594
1595 /* For performance, after a discard, jump to the end of the
1596 * shader if all relevant channels have been discarded.
1597 */
1598 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1599 discard_jump->flag_subreg = 1;
1600
1601 discard_jump->predicate = (dispatch_width == 8)
1602 ? BRW_PREDICATE_ALIGN1_ANY8H
1603 : BRW_PREDICATE_ALIGN1_ANY16H;
1604 discard_jump->predicate_inverse = true;
1605 }
1606
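/**
 * Assign UNIFORM file registers to their fixed push constant (CURB)
 * locations in the thread payload.
 */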
1607 void
1608 fs_visitor::assign_curb_setup()
1609 {
1610 if (dispatch_width == 8) {
1611 prog_data->dispatch_grf_start_reg = payload.num_regs;
1612 } else {
1613 if (stage == MESA_SHADER_FRAGMENT) {
1614 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1615 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1616 } else if (stage == MESA_SHADER_COMPUTE) {
1617 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1618 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1619 } else {
1620 unreachable("Unsupported shader type!");
1621 }
1622 }
1623
1624 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1625
1626 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1627 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1628 for (unsigned int i = 0; i < inst->sources; i++) {
1629 if (inst->src[i].file == UNIFORM) {
1630 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1631 int constant_nr;
1632 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1633 constant_nr = push_constant_loc[uniform_nr];
1634 } else {
1635 /* Section 5.11 of the OpenGL 4.1 spec says:
1636 * "Out-of-bounds reads return undefined values, which include
1637 * values from other variables of the active program or zero."
1638 * Just return the first push constant.
1639 */
1640 constant_nr = 0;
1641 }
1642
1643 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1644 constant_nr / 8,
1645 constant_nr % 8);
1646
1647 inst->src[i].file = HW_REG;
1648 inst->src[i].fixed_hw_reg = byte_offset(
1649 retype(brw_reg, inst->src[i].type),
1650 inst->src[i].subreg_offset);
1651 }
1652 }
1653 }
1654 }
1655
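/**
 * Decide which URB slot each varying input is read from and record the
 * mapping in prog_data->urb_setup[].
 */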
1656 void
1657 fs_visitor::calculate_urb_setup()
1658 {
1659 assert(stage == MESA_SHADER_FRAGMENT);
1660 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1661 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1662
1663 memset(prog_data->urb_setup, -1,
1664 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1665
1666 int urb_next = 0;
1667 /* Figure out where each of the incoming setup attributes lands. */
1668 if (devinfo->gen >= 6) {
1669 if (_mesa_bitcount_64(prog->InputsRead &
1670 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1671 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1672 * first 16 varying inputs, so we can put them wherever we want.
1673 * Just put them in order.
1674 *
1675 * This is useful because it means that (a) inputs not used by the
1676 * fragment shader won't take up valuable register space, and (b) we
1677 * won't have to recompile the fragment shader if it gets paired with
1678 * a different vertex (or geometry) shader.
1679 */
1680 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1681 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1682 BITFIELD64_BIT(i)) {
1683 prog_data->urb_setup[i] = urb_next++;
1684 }
1685 }
1686 } else {
1687 /* We have enough input varyings that the SF/SBE pipeline stage can't
1688 * arbitrarily rearrange them to suit our whim; we have to put them
1689 * in an order that matches the output of the previous pipeline stage
1690 * (geometry or vertex shader).
1691 */
1692 struct brw_vue_map prev_stage_vue_map;
1693 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1694 key->input_slots_valid);
1695 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1696 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1697 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1698 slot++) {
1699 int varying = prev_stage_vue_map.slot_to_varying[slot];
1700 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1701 * unused.
1702 */
1703 if (varying != BRW_VARYING_SLOT_COUNT &&
1704 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1705 BITFIELD64_BIT(varying))) {
1706 prog_data->urb_setup[varying] = slot - first_slot;
1707 }
1708 }
1709 urb_next = prev_stage_vue_map.num_slots - first_slot;
1710 }
1711 } else {
1712 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1713 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1714 /* Point size is packed into the header, not as a general attribute */
1715 if (i == VARYING_SLOT_PSIZ)
1716 continue;
1717
1718 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1719 /* The back color slot is skipped when the front color is
1720 * also written to. In addition, some slots can be
1721 * written in the vertex shader and not read in the
1722 * fragment shader. So the register number must always be
1723 * incremented, mapped or not.
1724 */
1725 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1726 prog_data->urb_setup[i] = urb_next;
1727 urb_next++;
1728 }
1729 }
1730
1731 /*
1732     * It's an FS-only attribute, and we did interpolation for this attribute
1733     * in the SF thread, so count it here, too.
1734 *
1735 * See compile_sf_prog() for more info.
1736 */
1737 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1738 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1739 }
1740
1741 prog_data->num_varying_inputs = urb_next;
1742 }
1743
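/**
 * Now that the location of the push constants is known, rewrite the fixed
 * HW register numbers used by the interpolation instructions to point at
 * the actual URB setup data.
 */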
1744 void
1745 fs_visitor::assign_urb_setup()
1746 {
1747 assert(stage == MESA_SHADER_FRAGMENT);
1748 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1749
1750 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1751
1752 /* Offset all the urb_setup[] index by the actual position of the
1753 * setup regs, now that the location of the constants has been chosen.
1754 */
1755 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1756 if (inst->opcode == FS_OPCODE_LINTERP) {
1757 assert(inst->src[1].file == HW_REG);
1758 inst->src[1].fixed_hw_reg.nr += urb_start;
1759 }
1760
1761 if (inst->opcode == FS_OPCODE_CINTERP) {
1762 assert(inst->src[0].file == HW_REG);
1763 inst->src[0].fixed_hw_reg.nr += urb_start;
1764 }
1765 }
1766
1767 /* Each attribute is 4 setup channels, each of which is half a reg. */
1768 this->first_non_payload_grf =
1769 urb_start + prog_data->num_varying_inputs * 2;
1770 }
1771
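/**
 * Map ATTR file registers to the hardware GRFs where the vertex attributes
 * are delivered in the thread payload.
 */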
1772 void
1773 fs_visitor::assign_vs_urb_setup()
1774 {
1775 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1776 int grf, count, slot, channel, attr;
1777
1778 assert(stage == MESA_SHADER_VERTEX);
1779 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1780 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1781 count++;
1782
1783 /* Each attribute is 4 regs. */
1784 this->first_non_payload_grf =
1785 payload.num_regs + prog_data->curb_read_length + count * 4;
1786
1787 unsigned vue_entries =
1788 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1789
1790 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1791 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1792
1793 assert(vs_prog_data->base.urb_read_length <= 15);
1794
1795 /* Rewrite all ATTR file references to the hw grf that they land in. */
1796 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1797 for (int i = 0; i < inst->sources; i++) {
1798 if (inst->src[i].file == ATTR) {
1799
1800 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1801 slot = count - 1;
1802 } else {
1803              /* Attributes arrive in a contiguous block, ordered by their
1804 * gl_vert_attrib value. That means we can compute the slot
1805 * number for an attribute by masking out the enabled
1806 * attributes before it and counting the bits.
1807 */
1808 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1809 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1810 BITFIELD64_MASK(attr));
1811 }
1812
1813 channel = inst->src[i].reg_offset & 3;
1814
1815 grf = payload.num_regs +
1816 prog_data->curb_read_length +
1817 slot * 4 + channel;
1818
1819 inst->src[i].file = HW_REG;
1820 inst->src[i].fixed_hw_reg =
1821 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1822 }
1823 }
1824 }
1825 }
1826
1827 /**
1828 * Split large virtual GRFs into separate components if we can.
1829 *
1830 * This is mostly duplicated with what brw_fs_vector_splitting does,
1831 * but that's really conservative because it's afraid of doing
1832 * splitting that doesn't result in real progress after the rest of
1833 * the optimization phases, which would cause infinite looping in
1834 * optimization. We can do it once here, safely. This also has the
1835 * opportunity to split interpolated values, or maybe even uniforms,
1836 * which we don't have at the IR level.
1837 *
1838 * We want to split, because virtual GRFs are what we register
1839 * allocate and spill (due to contiguousness requirements for some
1840 * instructions), and they're what we naturally generate in the
1841 * codegen process, but most virtual GRFs don't actually need to be
1842 * contiguous sets of GRFs. If we split, we'll end up with reduced
1843 * live intervals and better dead code elimination and coalescing.
1844 */
1845 void
1846 fs_visitor::split_virtual_grfs()
1847 {
1848 int num_vars = this->alloc.count;
1849
1850 /* Count the total number of registers */
1851 int reg_count = 0;
1852 int vgrf_to_reg[num_vars];
1853 for (int i = 0; i < num_vars; i++) {
1854 vgrf_to_reg[i] = reg_count;
1855 reg_count += alloc.sizes[i];
1856 }
1857
1858 /* An array of "split points". For each register slot, this indicates
1859 * if this slot can be separated from the previous slot. Every time an
1860 * instruction uses multiple elements of a register (as a source or
1861 * destination), we mark the used slots as inseparable. Then we go
1862 * through and split the registers into the smallest pieces we can.
1863 */
1864 bool split_points[reg_count];
1865 memset(split_points, 0, sizeof(split_points));
1866
1867 /* Mark all used registers as fully splittable */
1868 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1869 if (inst->dst.file == GRF) {
1870 int reg = vgrf_to_reg[inst->dst.reg];
1871 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1872 split_points[reg + j] = true;
1873 }
1874
1875 for (int i = 0; i < inst->sources; i++) {
1876 if (inst->src[i].file == GRF) {
1877 int reg = vgrf_to_reg[inst->src[i].reg];
1878 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1879 split_points[reg + j] = true;
1880 }
1881 }
1882 }
1883
1884 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1885 if (inst->dst.file == GRF) {
1886 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1887 for (int j = 1; j < inst->regs_written; j++)
1888 split_points[reg + j] = false;
1889 }
1890 for (int i = 0; i < inst->sources; i++) {
1891 if (inst->src[i].file == GRF) {
1892 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1893 for (int j = 1; j < inst->regs_read(i); j++)
1894 split_points[reg + j] = false;
1895 }
1896 }
1897 }
1898
1899 int new_virtual_grf[reg_count];
1900 int new_reg_offset[reg_count];
1901
1902 int reg = 0;
1903 for (int i = 0; i < num_vars; i++) {
1904 /* The first one should always be 0 as a quick sanity check. */
1905 assert(split_points[reg] == false);
1906
1907 /* j = 0 case */
1908 new_reg_offset[reg] = 0;
1909 reg++;
1910 int offset = 1;
1911
1912 /* j > 0 case */
1913 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1914 /* If this is a split point, allocate a new virtual GRF for the
1915 * previous 'offset' registers and reset the offset to 0.
1916 */
1917 if (split_points[reg]) {
1918 assert(offset <= MAX_VGRF_SIZE);
1919 int grf = alloc.allocate(offset);
1920 for (int k = reg - offset; k < reg; k++)
1921 new_virtual_grf[k] = grf;
1922 offset = 0;
1923 }
1924 new_reg_offset[reg] = offset;
1925 offset++;
1926 reg++;
1927 }
1928
1929 /* The last one gets the original register number */
1930 assert(offset <= MAX_VGRF_SIZE);
1931 alloc.sizes[i] = offset;
1932 for (int k = reg - offset; k < reg; k++)
1933 new_virtual_grf[k] = i;
1934 }
1935 assert(reg == reg_count);
1936
1937 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1938 if (inst->dst.file == GRF) {
1939 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1940 inst->dst.reg = new_virtual_grf[reg];
1941 inst->dst.reg_offset = new_reg_offset[reg];
1942 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1943 }
1944 for (int i = 0; i < inst->sources; i++) {
1945 if (inst->src[i].file == GRF) {
1946 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1947 inst->src[i].reg = new_virtual_grf[reg];
1948 inst->src[i].reg_offset = new_reg_offset[reg];
1949 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1950 }
1951 }
1952 }
1953 invalidate_live_intervals();
1954 }
1955
1956 /**
1957 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1958 *
1959 * During code generation, we create tons of temporary variables, many of
1960 * which get immediately killed and are never used again. Yet, in later
1961 * optimization and analysis passes, such as compute_live_intervals, we need
1962 * to loop over all the virtual GRFs. Compacting them can save a lot of
1963 * overhead.
1964 */
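/* Small worked example (hypothetical numbering): if only vgrf0, vgrf2 and
 * vgrf5 out of six VGRFs are still referenced, the remap table built below
 * ends up as 0->0, 2->1, 5->2 with the unused entries left at -1,
 * alloc.count drops from 6 to 3, and later passes only iterate over the
 * three live VGRFs.
 */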
1965 bool
1966 fs_visitor::compact_virtual_grfs()
1967 {
1968 bool progress = false;
1969 int remap_table[this->alloc.count];
1970 memset(remap_table, -1, sizeof(remap_table));
1971
1972 /* Mark which virtual GRFs are used. */
1973 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1974 if (inst->dst.file == GRF)
1975 remap_table[inst->dst.reg] = 0;
1976
1977 for (int i = 0; i < inst->sources; i++) {
1978 if (inst->src[i].file == GRF)
1979 remap_table[inst->src[i].reg] = 0;
1980 }
1981 }
1982
1983 /* Compact the GRF arrays. */
1984 int new_index = 0;
1985 for (unsigned i = 0; i < this->alloc.count; i++) {
1986 if (remap_table[i] == -1) {
1987 /* We just found an unused register. This means that we are
1988 * actually going to compact something.
1989 */
1990 progress = true;
1991 } else {
1992 remap_table[i] = new_index;
1993 alloc.sizes[new_index] = alloc.sizes[i];
1994 invalidate_live_intervals();
1995 ++new_index;
1996 }
1997 }
1998
1999 this->alloc.count = new_index;
2000
2001 /* Patch all the instructions to use the newly renumbered registers */
2002 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2003 if (inst->dst.file == GRF)
2004 inst->dst.reg = remap_table[inst->dst.reg];
2005
2006 for (int i = 0; i < inst->sources; i++) {
2007 if (inst->src[i].file == GRF)
2008 inst->src[i].reg = remap_table[inst->src[i].reg];
2009 }
2010 }
2011
2012 /* Patch all the references to delta_xy, since they're used in register
2013 * allocation. If they're unused, switch them to BAD_FILE so we don't
2014 * think some random VGRF is delta_xy.
2015 */
2016 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2017 if (delta_xy[i].file == GRF) {
2018 if (remap_table[delta_xy[i].reg] != -1) {
2019 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2020 } else {
2021 delta_xy[i].file = BAD_FILE;
2022 }
2023 }
2024 }
2025
2026 return progress;
2027 }
2028
2029 /*
2030 * Implements array access of uniforms by inserting a
2031 * PULL_CONSTANT_LOAD instruction.
2032 *
2033 * Unlike temporary GRF array access (where we don't support it due to
2034 * the difficulty of doing relative addressing on instruction
2035 * destinations), we could potentially do array access of uniforms
2036 * that were loaded in GRF space as push constants. In real-world
2037 * usage we've seen, though, the arrays being used are always larger
2038 * than we could load as push constants, so just always move all
2039 * uniform array access out to a pull constant buffer.
2040 */
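/* Illustrative example (hypothetical GLSL, not from a real application): for
 * "uniform vec4 arr[64]; ... arr[i]", the variably-indexed UNIFORM source
 * (reg + reladdr) causes a copy of the whole array to be appended to
 * pull_param[], with pull_constant_loc[] recording where each element
 * landed so that demote_pull_constants() can later turn the access into a
 * VARYING_PULL_CONSTANT_LOAD.
 */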
2041 void
2042 fs_visitor::move_uniform_array_access_to_pull_constants()
2043 {
2044 if (dispatch_width != 8)
2045 return;
2046
2047 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2048 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2049
2050 /* Walk through and find array access of uniforms. Put a copy of that
2051 * uniform in the pull constant buffer.
2052 *
2053 * Note that we don't move constant-indexed accesses to arrays. No
2054 * testing has been done of the performance impact of this choice.
2055 */
2056 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2057 for (int i = 0 ; i < inst->sources; i++) {
2058 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2059 continue;
2060
2061 int uniform = inst->src[i].reg;
2062
2063 /* If this array isn't already present in the pull constant buffer,
2064 * add it.
2065 */
2066 if (pull_constant_loc[uniform] == -1) {
2067 const gl_constant_value **values = &stage_prog_data->param[uniform];
2068
2069 assert(param_size[uniform]);
2070
2071 for (int j = 0; j < param_size[uniform]; j++) {
2072 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2073
2074 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2075 values[j];
2076 }
2077 }
2078 }
2079 }
2080 }
2081
2082 /**
2083 * Assign UNIFORM file registers to either push constants or pull constants.
2084 *
2085 * We allow a fragment shader to have more than the minimum required
2086 * maximum number of fragment shader uniform components (64). If
2087 * there are too many of these, they'd fill up all of the register space.
2088 * So, this will push some of them out to the pull constant buffer and
2089 * update the program to load them.
2090 */
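/* Worked example under assumed numbers: with the limit of 16 registers * 8
 * components = 128 used below, a shader with 200 live uniform components
 * (none already demoted) would keep the first 128 as push constants
 * (push_constant_loc[i] = 0..127) and demote the remaining 72 to the pull
 * constant buffer (pull_constant_loc[i] set, push_constant_loc[i] = -1).
 */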
2091 void
2092 fs_visitor::assign_constant_locations()
2093 {
2094 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2095 if (dispatch_width != 8)
2096 return;
2097
2098 /* Find which UNIFORM registers are still in use. */
2099 bool is_live[uniforms];
2100 for (unsigned int i = 0; i < uniforms; i++) {
2101 is_live[i] = false;
2102 }
2103
2104 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2105 for (int i = 0; i < inst->sources; i++) {
2106 if (inst->src[i].file != UNIFORM)
2107 continue;
2108
2109 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2110 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2111 is_live[constant_nr] = true;
2112 }
2113 }
2114
2115 /* Only allow 16 registers (128 uniform components) as push constants.
2116 *
2117 * Just demote the end of the list. We could probably do better
2118 * here, demoting things that are rarely used in the program first.
2119 *
2120 * If changing this value, note the limitation about total_regs in
2121 * brw_curbe.c.
2122 */
2123 unsigned int max_push_components = 16 * 8;
2124 unsigned int num_push_constants = 0;
2125
2126 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2127
2128 for (unsigned int i = 0; i < uniforms; i++) {
2129 if (!is_live[i] || pull_constant_loc[i] != -1) {
2130 /* This UNIFORM register is either dead, or has already been demoted
2131 * to a pull const. Mark it as no longer living in the param[] array.
2132 */
2133 push_constant_loc[i] = -1;
2134 continue;
2135 }
2136
2137 if (num_push_constants < max_push_components) {
2138 /* Retain as a push constant. Record the location in the param[]
2139 * array.
2140 */
2141 push_constant_loc[i] = num_push_constants++;
2142 } else {
2143 /* Demote to a pull constant. */
2144 push_constant_loc[i] = -1;
2145
2146 int pull_index = stage_prog_data->nr_pull_params++;
2147 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2148 pull_constant_loc[i] = pull_index;
2149 }
2150 }
2151
2152 stage_prog_data->nr_params = num_push_constants;
2153
2154 /* Up until now, the param[] array has been indexed by reg + reg_offset
2155 * of UNIFORM registers. Condense it to only contain the uniforms we
2156 * chose to upload as push constants.
2157 */
2158 for (unsigned int i = 0; i < uniforms; i++) {
2159 int remapped = push_constant_loc[i];
2160
2161 if (remapped == -1)
2162 continue;
2163
2164 assert(remapped <= (int)i);
2165 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2166 }
2167 }
2168
2169 /**
2170 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2171 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2172 */
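/* Rough before/after sketch (hypothetical registers):
 *
 *    add(8) vgrf7, vgrf6, u18
 *
 * where u18 was assigned to the pull buffer becomes approximately
 *
 *    uniform_pull_constant_load(8) vgrf8, surf_index, offset
 *    add(8) vgrf7, vgrf6, vgrf8.<smear>
 *
 * i.e. the UNIFORM operand is rewritten to read the freshly loaded VGRF.
 */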
2173 void
2174 fs_visitor::demote_pull_constants()
2175 {
2176 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2177 for (int i = 0; i < inst->sources; i++) {
2178 if (inst->src[i].file != UNIFORM)
2179 continue;
2180
2181 int pull_index;
2182 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2183 if (location >= uniforms) /* Out of bounds access */
2184 pull_index = -1;
2185 else
2186 pull_index = pull_constant_loc[location];
2187
2188 if (pull_index == -1)
2189 continue;
2190
2191 /* Set up the annotation tracking for newly generated instructions. */
2192 const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
2193 .at(block, inst);
2194 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2195 fs_reg dst = vgrf(glsl_type::float_type);
2196
2197 /* Generate a pull load into dst. */
2198 if (inst->src[i].reladdr) {
2199 VARYING_PULL_CONSTANT_LOAD(ibld, dst,
2200 surf_index,
2201 *inst->src[i].reladdr,
2202 pull_index);
2203 inst->src[i].reladdr = NULL;
2204 } else {
2205 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2206 ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
2207 dst, surf_index, offset);
2208 inst->src[i].set_smear(pull_index & 3);
2209 }
2210
2211 /* Rewrite the instruction to use the temporary VGRF. */
2212 inst->src[i].file = GRF;
2213 inst->src[i].reg = dst.reg;
2214 inst->src[i].reg_offset = 0;
2215 inst->src[i].width = dispatch_width;
2216 }
2217 }
2218 invalidate_live_intervals();
2219 }
2220
2221 bool
2222 fs_visitor::opt_algebraic()
2223 {
2224 bool progress = false;
2225
2226 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2227 switch (inst->opcode) {
2228 case BRW_OPCODE_MOV:
2229 if (inst->src[0].file != IMM)
2230 break;
2231
2232 if (inst->saturate) {
2233 if (inst->dst.type != inst->src[0].type)
2234 assert(!"unimplemented: saturate mixed types");
2235
2236 if (brw_saturate_immediate(inst->dst.type,
2237 &inst->src[0].fixed_hw_reg)) {
2238 inst->saturate = false;
2239 progress = true;
2240 }
2241 }
2242 break;
2243
2244 case BRW_OPCODE_MUL:
2245 if (inst->src[1].file != IMM)
2246 continue;
2247
2248 /* a * 1.0 = a */
2249 if (inst->src[1].is_one()) {
2250 inst->opcode = BRW_OPCODE_MOV;
2251 inst->src[1] = reg_undef;
2252 progress = true;
2253 break;
2254 }
2255
2256 /* a * -1.0 = -a */
2257 if (inst->src[1].is_negative_one()) {
2258 inst->opcode = BRW_OPCODE_MOV;
2259 inst->src[0].negate = !inst->src[0].negate;
2260 inst->src[1] = reg_undef;
2261 progress = true;
2262 break;
2263 }
2264
2265 /* a * 0.0 = 0.0 */
2266 if (inst->src[1].is_zero()) {
2267 inst->opcode = BRW_OPCODE_MOV;
2268 inst->src[0] = inst->src[1];
2269 inst->src[1] = reg_undef;
2270 progress = true;
2271 break;
2272 }
2273
2274 if (inst->src[0].file == IMM) {
2275 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2276 inst->opcode = BRW_OPCODE_MOV;
2277 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2278 inst->src[1] = reg_undef;
2279 progress = true;
2280 break;
2281 }
2282 break;
2283 case BRW_OPCODE_ADD:
2284 if (inst->src[1].file != IMM)
2285 continue;
2286
2287 /* a + 0.0 = a */
2288 if (inst->src[1].is_zero()) {
2289 inst->opcode = BRW_OPCODE_MOV;
2290 inst->src[1] = reg_undef;
2291 progress = true;
2292 break;
2293 }
2294
2295 if (inst->src[0].file == IMM) {
2296 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2297 inst->opcode = BRW_OPCODE_MOV;
2298 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2299 inst->src[1] = reg_undef;
2300 progress = true;
2301 break;
2302 }
2303 break;
2304 case BRW_OPCODE_OR:
2305 if (inst->src[0].equals(inst->src[1])) {
2306 inst->opcode = BRW_OPCODE_MOV;
2307 inst->src[1] = reg_undef;
2308 progress = true;
2309 break;
2310 }
2311 break;
2312 case BRW_OPCODE_LRP:
2313 if (inst->src[1].equals(inst->src[2])) {
2314 inst->opcode = BRW_OPCODE_MOV;
2315 inst->src[0] = inst->src[1];
2316 inst->src[1] = reg_undef;
2317 inst->src[2] = reg_undef;
2318 progress = true;
2319 break;
2320 }
2321 break;
2322 case BRW_OPCODE_CMP:
2323 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2324 inst->src[0].abs &&
2325 inst->src[0].negate &&
2326 inst->src[1].is_zero()) {
2327 inst->src[0].abs = false;
2328 inst->src[0].negate = false;
2329 inst->conditional_mod = BRW_CONDITIONAL_Z;
2330 progress = true;
2331 break;
2332 }
2333 break;
2334 case BRW_OPCODE_SEL:
2335 if (inst->src[0].equals(inst->src[1])) {
2336 inst->opcode = BRW_OPCODE_MOV;
2337 inst->src[1] = reg_undef;
2338 inst->predicate = BRW_PREDICATE_NONE;
2339 inst->predicate_inverse = false;
2340 progress = true;
2341 } else if (inst->saturate && inst->src[1].file == IMM) {
2342 switch (inst->conditional_mod) {
2343 case BRW_CONDITIONAL_LE:
2344 case BRW_CONDITIONAL_L:
2345 switch (inst->src[1].type) {
2346 case BRW_REGISTER_TYPE_F:
2347 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2348 inst->opcode = BRW_OPCODE_MOV;
2349 inst->src[1] = reg_undef;
2350 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2351 progress = true;
2352 }
2353 break;
2354 default:
2355 break;
2356 }
2357 break;
2358 case BRW_CONDITIONAL_GE:
2359 case BRW_CONDITIONAL_G:
2360 switch (inst->src[1].type) {
2361 case BRW_REGISTER_TYPE_F:
2362 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2363 inst->opcode = BRW_OPCODE_MOV;
2364 inst->src[1] = reg_undef;
2365 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2366 progress = true;
2367 }
2368 break;
2369 default:
2370 break;
2371 }
2372 default:
2373 break;
2374 }
2375 }
2376 break;
2377 case BRW_OPCODE_MAD:
2378 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2379 inst->opcode = BRW_OPCODE_MOV;
2380 inst->src[1] = reg_undef;
2381 inst->src[2] = reg_undef;
2382 progress = true;
2383 } else if (inst->src[0].is_zero()) {
2384 inst->opcode = BRW_OPCODE_MUL;
2385 inst->src[0] = inst->src[2];
2386 inst->src[2] = reg_undef;
2387 progress = true;
2388 } else if (inst->src[1].is_one()) {
2389 inst->opcode = BRW_OPCODE_ADD;
2390 inst->src[1] = inst->src[2];
2391 inst->src[2] = reg_undef;
2392 progress = true;
2393 } else if (inst->src[2].is_one()) {
2394 inst->opcode = BRW_OPCODE_ADD;
2395 inst->src[2] = reg_undef;
2396 progress = true;
2397 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2398 inst->opcode = BRW_OPCODE_ADD;
2399 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2400 inst->src[2] = reg_undef;
2401 progress = true;
2402 }
2403 break;
2404 case SHADER_OPCODE_RCP: {
2405 fs_inst *prev = (fs_inst *)inst->prev;
2406 if (prev->opcode == SHADER_OPCODE_SQRT) {
2407 if (inst->src[0].equals(prev->dst)) {
2408 inst->opcode = SHADER_OPCODE_RSQ;
2409 inst->src[0] = prev->src[0];
2410 progress = true;
2411 }
2412 }
2413 break;
2414 }
2415 case SHADER_OPCODE_BROADCAST:
2416 if (is_uniform(inst->src[0])) {
2417 inst->opcode = BRW_OPCODE_MOV;
2418 inst->sources = 1;
2419 inst->force_writemask_all = true;
2420 progress = true;
2421 } else if (inst->src[1].file == IMM) {
2422 inst->opcode = BRW_OPCODE_MOV;
2423 inst->src[0] = component(inst->src[0],
2424 inst->src[1].fixed_hw_reg.dw1.ud);
2425 inst->sources = 1;
2426 inst->force_writemask_all = true;
2427 progress = true;
2428 }
2429 break;
2430
2431 default:
2432 break;
2433 }
2434
2435 /* Swap if src[0] is immediate. */
2436 if (progress && inst->is_commutative()) {
2437 if (inst->src[0].file == IMM) {
2438 fs_reg tmp = inst->src[1];
2439 inst->src[1] = inst->src[0];
2440 inst->src[0] = tmp;
2441 }
2442 }
2443 }
2444 return progress;
2445 }
2446
2447 /**
2448 * Optimize sample messages that have constant zero values for the trailing
2449 * texture coordinates. We can just reduce the message length for these
2450 * instructions instead of reserving a register for it. Trailing parameters
2451 * that aren't sent default to zero anyway. Shortening the message also lets
2452 * the dead code eliminator remove the MOV instruction that was used to
2453 * set up the zero value.
2454 */
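/* Illustrative example (hypothetical payload layout): a SIMD8 txl message
 * whose trailing LOD argument is a constant 0.0f can have its mlen reduced
 * by one register; the hardware supplies zero for the missing parameter and
 * the MOV that built that payload register is left for dead code
 * elimination.
 */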
2455 bool
2456 fs_visitor::opt_zero_samples()
2457 {
2458 /* Gen4 infers the texturing opcode based on the message length so we can't
2459 * change it.
2460 */
2461 if (devinfo->gen < 5)
2462 return false;
2463
2464 bool progress = false;
2465
2466 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2467 if (!inst->is_tex())
2468 continue;
2469
2470 fs_inst *load_payload = (fs_inst *) inst->prev;
2471
2472 if (load_payload->is_head_sentinel() ||
2473 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2474 continue;
2475
2476 /* We don't want to remove the message header or the first parameter.
2477 * Removing the first parameter is not allowed; see the Haswell PRM
2478 * volume 7, page 149:
2479 *
2480 * "Parameter 0 is required except for the sampleinfo message, which
2481 * has no parameter 0"
2482 */
2483 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2484 load_payload->src[(inst->mlen - inst->header_size) /
2485 (dispatch_width / 8) +
2486 inst->header_size - 1].is_zero()) {
2487 inst->mlen -= dispatch_width / 8;
2488 progress = true;
2489 }
2490 }
2491
2492 if (progress)
2493 invalidate_live_intervals();
2494
2495 return progress;
2496 }
2497
2498 /**
2499 * Optimize sample messages which are followed by the final RT write.
2500 *
2501 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2502 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2503 * final texturing results copied to the framebuffer write payload and modify
2504 * them to write to the framebuffer directly.
2505 */
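/* Rough before/after sketch (hypothetical registers):
 *
 *    tex(16) vgrf10, ...
 *    load_payload(16) vgrf12, ..., vgrf10, ...
 *    fb_write(16) vgrf12 (EOT)
 *
 * becomes a single texturing SEND marked EOT with the render target encoded
 * in its message offset, and the trailing fb_write is removed.
 */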
2506 bool
2507 fs_visitor::opt_sampler_eot()
2508 {
2509 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2510
2511 if (stage != MESA_SHADER_FRAGMENT)
2512 return false;
2513
2514 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2515 return false;
2516
2517 /* FINISHME: It should be possible to implement this optimization when there
2518 * are multiple drawbuffers.
2519 */
2520 if (key->nr_color_regions != 1)
2521 return false;
2522
2523 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2524 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2525 assert(fb_write->eot);
2526 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2527
2528 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2529
2530 /* There wasn't one; nothing to do. */
2531 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2532 return false;
2533
2534 /* This optimization doesn't seem to work for textureGather for some
2535 * reason. I can't find any documentation or known workarounds to indicate
2536 * that this is expected, but considering that it is probably pretty
2537 * unlikely that a shader would directly write out the results from
2538 * textureGather, we might as well just disable it.
2539 */
2540 if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2541 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2542 return false;
2543
2544 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2545 * It's very likely to be the previous instruction.
2546 */
2547 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2548 if (load_payload->is_head_sentinel() ||
2549 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2550 return false;
2551
2552 assert(!tex_inst->eot); /* We can't get here twice */
2553 assert((tex_inst->offset & (0xff << 24)) == 0);
2554
2555 tex_inst->offset |= fb_write->target << 24;
2556 tex_inst->eot = true;
2557 tex_inst->dst = bld.null_reg_ud();
2558 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2559
2560 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2561 * to create a new LOAD_PAYLOAD command with the same sources and a space
2562 * saved for the header. Using a new destination register not only makes sure
2563 * we have enough space, but it will make sure the dead code eliminator kills
2564 * the instruction that this will replace.
2565 */
2566 if (tex_inst->header_size != 0)
2567 return true;
2568
2569 fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
2570 load_payload->sources + 1);
2571 fs_reg *new_sources =
2572 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2573
2574 new_sources[0] = fs_reg();
2575 for (int i = 0; i < load_payload->sources; i++)
2576 new_sources[i+1] = load_payload->src[i];
2577
2578 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2579 * requires a lot of information about the sources to figure out how many
2580 * registers need to be used. At this stage in our
2581 * optimization, we may not have the appropriate GRFs required by
2582 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2583 * emit the instruction manually.
2584 */
2585 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2586 load_payload->exec_size,
2587 send_header,
2588 new_sources,
2589 load_payload->sources + 1);
2590
2591 new_load_payload->regs_written = load_payload->regs_written + 1;
2592 new_load_payload->header_size = 1;
2593 tex_inst->mlen++;
2594 tex_inst->header_size = 1;
2595 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2596 tex_inst->src[0] = send_header;
2597
2598 return true;
2599 }
2600
2601 bool
2602 fs_visitor::opt_register_renaming()
2603 {
2604 bool progress = false;
2605 int depth = 0;
2606
2607 int remap[alloc.count];
2608 memset(remap, -1, sizeof(int) * alloc.count);
2609
2610 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2611 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2612 depth++;
2613 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2614 inst->opcode == BRW_OPCODE_WHILE) {
2615 depth--;
2616 }
2617
2618 /* Rewrite instruction sources. */
2619 for (int i = 0; i < inst->sources; i++) {
2620 if (inst->src[i].file == GRF &&
2621 remap[inst->src[i].reg] != -1 &&
2622 remap[inst->src[i].reg] != inst->src[i].reg) {
2623 inst->src[i].reg = remap[inst->src[i].reg];
2624 progress = true;
2625 }
2626 }
2627
2628 const int dst = inst->dst.reg;
2629
2630 if (depth == 0 &&
2631 inst->dst.file == GRF &&
2632 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2633 !inst->is_partial_write()) {
2634 if (remap[dst] == -1) {
2635 remap[dst] = dst;
2636 } else {
2637 remap[dst] = alloc.allocate(inst->dst.width / 8);
2638 inst->dst.reg = remap[dst];
2639 progress = true;
2640 }
2641 } else if (inst->dst.file == GRF &&
2642 remap[dst] != -1 &&
2643 remap[dst] != dst) {
2644 inst->dst.reg = remap[dst];
2645 progress = true;
2646 }
2647 }
2648
2649 if (progress) {
2650 invalidate_live_intervals();
2651
2652 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2653 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2654 delta_xy[i].reg = remap[delta_xy[i].reg];
2655 }
2656 }
2657 }
2658
2659 return progress;
2660 }
2661
2662 /**
2663 * Remove redundant or useless discard jumps.
2664 *
2665 * For example, we can eliminate jumps in the following sequence:
2666 *
2667 * discard-jump (redundant with the next jump)
2668 * discard-jump (useless; jumps to the next instruction)
2669 * placeholder-halt
2670 */
2671 bool
2672 fs_visitor::opt_redundant_discard_jumps()
2673 {
2674 bool progress = false;
2675
2676 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2677
2678 fs_inst *placeholder_halt = NULL;
2679 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2680 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2681 placeholder_halt = inst;
2682 break;
2683 }
2684 }
2685
2686 if (!placeholder_halt)
2687 return false;
2688
2689 /* Delete any discard jumps (HALTs) immediately before the placeholder halt. */
2690 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2691 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2692 prev = (fs_inst *) placeholder_halt->prev) {
2693 prev->remove(last_bblock);
2694 progress = true;
2695 }
2696
2697 if (progress)
2698 invalidate_live_intervals();
2699
2700 return progress;
2701 }
2702
2703 bool
2704 fs_visitor::compute_to_mrf()
2705 {
2706 bool progress = false;
2707 int next_ip = 0;
2708
2709 /* No MRFs on Gen >= 7. */
2710 if (devinfo->gen >= 7)
2711 return false;
2712
2713 calculate_live_intervals();
2714
2715 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2716 int ip = next_ip;
2717 next_ip++;
2718
2719 if (inst->opcode != BRW_OPCODE_MOV ||
2720 inst->is_partial_write() ||
2721 inst->dst.file != MRF || inst->src[0].file != GRF ||
2722 inst->dst.type != inst->src[0].type ||
2723 inst->src[0].abs || inst->src[0].negate ||
2724 !inst->src[0].is_contiguous() ||
2725 inst->src[0].subreg_offset)
2726 continue;
2727
2728 /* Work out which hardware MRF registers are written by this
2729 * instruction.
2730 */
2731 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2732 int mrf_high;
2733 if (inst->dst.reg & BRW_MRF_COMPR4) {
2734 mrf_high = mrf_low + 4;
2735 } else if (inst->exec_size == 16) {
2736 mrf_high = mrf_low + 1;
2737 } else {
2738 mrf_high = mrf_low;
2739 }
2740
2741 /* Can't compute-to-MRF this GRF if someone else was going to
2742 * read it later.
2743 */
2744 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2745 continue;
2746
2747 /* Found a move of a GRF to a MRF. Let's see if we can go
2748 * rewrite the thing that made this GRF to write into the MRF.
2749 */
2750 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2751 if (scan_inst->dst.file == GRF &&
2752 scan_inst->dst.reg == inst->src[0].reg) {
2753 /* Found the last thing to write our reg we want to turn
2754 * into a compute-to-MRF.
2755 */
2756
2757 /* If this one instruction didn't populate all the
2758 * channels, bail. We might be able to rewrite everything
2759 * that writes that reg, but it would require smarter
2760 * tracking to delay the rewriting until complete success.
2761 */
2762 if (scan_inst->is_partial_write())
2763 break;
2764
2765 /* Things returning more than one register would need us to
2766 * understand coalescing out more than one MOV at a time.
2767 */
2768 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2769 break;
2770
2771 /* SEND instructions can't have MRF as a destination. */
2772 if (scan_inst->mlen)
2773 break;
2774
2775 if (devinfo->gen == 6) {
2776 /* gen6 math instructions must have the destination be
2777 * GRF, so no compute-to-MRF for them.
2778 */
2779 if (scan_inst->is_math()) {
2780 break;
2781 }
2782 }
2783
2784 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2785 /* Found the creator of our MRF's source value. */
2786 scan_inst->dst.file = MRF;
2787 scan_inst->dst.reg = inst->dst.reg;
2788 scan_inst->saturate |= inst->saturate;
2789 inst->remove(block);
2790 progress = true;
2791 }
2792 break;
2793 }
2794
2795 /* We don't handle control flow here. Most computation of
2796 * values that end up in MRFs happens shortly before the MRF
2797 * write anyway.
2798 */
2799 if (block->start() == scan_inst)
2800 break;
2801
2802 /* You can't read from an MRF, so if someone else reads our
2803 * MRF's source GRF that we wanted to rewrite, that stops us.
2804 */
2805 bool interfered = false;
2806 for (int i = 0; i < scan_inst->sources; i++) {
2807 if (scan_inst->src[i].file == GRF &&
2808 scan_inst->src[i].reg == inst->src[0].reg &&
2809 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2810 interfered = true;
2811 }
2812 }
2813 if (interfered)
2814 break;
2815
2816 if (scan_inst->dst.file == MRF) {
2817 /* If somebody else writes our MRF here, we can't
2818 * compute-to-MRF before that.
2819 */
2820 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2821 int scan_mrf_high;
2822
2823 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2824 scan_mrf_high = scan_mrf_low + 4;
2825 } else if (scan_inst->exec_size == 16) {
2826 scan_mrf_high = scan_mrf_low + 1;
2827 } else {
2828 scan_mrf_high = scan_mrf_low;
2829 }
2830
2831 if (mrf_low == scan_mrf_low ||
2832 mrf_low == scan_mrf_high ||
2833 mrf_high == scan_mrf_low ||
2834 mrf_high == scan_mrf_high) {
2835 break;
2836 }
2837 }
2838
2839 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2840 /* Found a SEND instruction, which means that there are
2841 * live values in MRFs from base_mrf to base_mrf +
2842 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2843 * above it.
2844 */
2845 if (mrf_low >= scan_inst->base_mrf &&
2846 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2847 break;
2848 }
2849 if (mrf_high >= scan_inst->base_mrf &&
2850 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2851 break;
2852 }
2853 }
2854 }
2855 }
2856
2857 if (progress)
2858 invalidate_live_intervals();
2859
2860 return progress;
2861 }
2862
2863 /**
2864 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2865 * flow. We could probably do better here with some form of divergence
2866 * analysis.
2867 */
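/* Illustrative example: outside of control flow all channels are live, so
 * channel 0 is always a valid answer and
 *
 *    find_live_channel(8) vgrf3
 *
 * can simply become
 *
 *    mov(8) vgrf3, 0d
 *
 * with force_writemask_all set, as done below.
 */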
2868 bool
2869 fs_visitor::eliminate_find_live_channel()
2870 {
2871 bool progress = false;
2872 unsigned depth = 0;
2873
2874 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2875 switch (inst->opcode) {
2876 case BRW_OPCODE_IF:
2877 case BRW_OPCODE_DO:
2878 depth++;
2879 break;
2880
2881 case BRW_OPCODE_ENDIF:
2882 case BRW_OPCODE_WHILE:
2883 depth--;
2884 break;
2885
2886 case FS_OPCODE_DISCARD_JUMP:
2887 /* This can potentially make control flow non-uniform until the end
2888 * of the program.
2889 */
2890 return progress;
2891
2892 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2893 if (depth == 0) {
2894 inst->opcode = BRW_OPCODE_MOV;
2895 inst->src[0] = fs_reg(0);
2896 inst->sources = 1;
2897 inst->force_writemask_all = true;
2898 progress = true;
2899 }
2900 break;
2901
2902 default:
2903 break;
2904 }
2905 }
2906
2907 return progress;
2908 }
2909
2910 /**
2911 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2912 * instructions to FS_OPCODE_REP_FB_WRITE.
2913 */
2914 void
2915 fs_visitor::emit_repclear_shader()
2916 {
2917 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2918 int base_mrf = 1;
2919 int color_mrf = base_mrf + 2;
2920
2921 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2922 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2923 mov->force_writemask_all = true;
2924
2925 fs_inst *write;
2926 if (key->nr_color_regions == 1) {
2927 write = emit(FS_OPCODE_REP_FB_WRITE);
2928 write->saturate = key->clamp_fragment_color;
2929 write->base_mrf = color_mrf;
2930 write->target = 0;
2931 write->header_size = 0;
2932 write->mlen = 1;
2933 } else {
2934 assume(key->nr_color_regions > 0);
2935 for (int i = 0; i < key->nr_color_regions; ++i) {
2936 write = emit(FS_OPCODE_REP_FB_WRITE);
2937 write->saturate = key->clamp_fragment_color;
2938 write->base_mrf = base_mrf;
2939 write->target = i;
2940 write->header_size = 2;
2941 write->mlen = 3;
2942 }
2943 }
2944 write->eot = true;
2945
2946 calculate_cfg();
2947
2948 assign_constant_locations();
2949 assign_curb_setup();
2950
2951 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2952 assert(mov->src[0].file == HW_REG);
2953 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2954 }
2955
2956 /**
2957 * Walks through basic blocks, looking for repeated MRF writes and
2958 * removing the later ones.
2959 */
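/* Illustrative example (hypothetical registers): given
 *
 *    mov(8) m3, vgrf4
 *    ...
 *    mov(8) m3, vgrf4
 *
 * with no intervening write to m3 or vgrf4 and no intervening control flow
 * or SEND, the second MOV can be removed since the MRF already holds the
 * value.
 */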
2960 bool
2961 fs_visitor::remove_duplicate_mrf_writes()
2962 {
2963 fs_inst *last_mrf_move[16];
2964 bool progress = false;
2965
2966 /* We'd need to update the MRF tracking to handle compressed instructions. */
2967 if (dispatch_width == 16)
2968 return false;
2969
2970 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2971
2972 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2973 if (inst->is_control_flow()) {
2974 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2975 }
2976
2977 if (inst->opcode == BRW_OPCODE_MOV &&
2978 inst->dst.file == MRF) {
2979 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2980 if (prev_inst && inst->equals(prev_inst)) {
2981 inst->remove(block);
2982 progress = true;
2983 continue;
2984 }
2985 }
2986
2987 /* Clear out the last-write records for MRFs that were overwritten. */
2988 if (inst->dst.file == MRF) {
2989 last_mrf_move[inst->dst.reg] = NULL;
2990 }
2991
2992 if (inst->mlen > 0 && inst->base_mrf != -1) {
2993 /* Found a SEND instruction, which will include two or fewer
2994 * implied MRF writes. We could do better here.
2995 */
2996 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2997 last_mrf_move[inst->base_mrf + i] = NULL;
2998 }
2999 }
3000
3001 /* Clear out any MRF move records whose sources got overwritten. */
3002 if (inst->dst.file == GRF) {
3003 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3004 if (last_mrf_move[i] &&
3005 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3006 last_mrf_move[i] = NULL;
3007 }
3008 }
3009 }
3010
3011 if (inst->opcode == BRW_OPCODE_MOV &&
3012 inst->dst.file == MRF &&
3013 inst->src[0].file == GRF &&
3014 !inst->is_partial_write()) {
3015 last_mrf_move[inst->dst.reg] = inst;
3016 }
3017 }
3018
3019 if (progress)
3020 invalidate_live_intervals();
3021
3022 return progress;
3023 }
3024
3025 static void
3026 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3027 {
3028 /* Clear the flag for registers that actually got read (as expected). */
3029 for (int i = 0; i < inst->sources; i++) {
3030 int grf;
3031 if (inst->src[i].file == GRF) {
3032 grf = inst->src[i].reg;
3033 } else if (inst->src[i].file == HW_REG &&
3034 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3035 grf = inst->src[i].fixed_hw_reg.nr;
3036 } else {
3037 continue;
3038 }
3039
3040 if (grf >= first_grf &&
3041 grf < first_grf + grf_len) {
3042 deps[grf - first_grf] = false;
3043 if (inst->exec_size == 16)
3044 deps[grf - first_grf + 1] = false;
3045 }
3046 }
3047 }
3048
3049 /**
3050 * Implements this workaround for the original 965:
3051 *
3052 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3053 * check for post destination dependencies on this instruction, software
3054 * must ensure that there is no destination hazard for the case of ‘write
3055 * followed by a posted write’ shown in the following example.
3056 *
3057 * 1. mov r3 0
3058 * 2. send r3.xy <rest of send instruction>
3059 * 3. mov r2 r3
3060 *
3061 * Due to no post-destination dependency check on the ‘send’, the above
3062 * code sequence could have two instructions (1 and 2) in flight at the
3063 * same time that both consider ‘r3’ as the target of their final writes."
3064 */
3065 void
3066 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3067 fs_inst *inst)
3068 {
3069 int write_len = inst->regs_written;
3070 int first_write_grf = inst->dst.reg;
3071 bool needs_dep[BRW_MAX_MRF];
3072 assert(write_len < (int)sizeof(needs_dep) - 1);
3073
3074 memset(needs_dep, false, sizeof(needs_dep));
3075 memset(needs_dep, true, write_len);
3076
3077 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3078
3079 /* Walk backwards looking for writes to registers we're writing which
3080 * aren't read since being written. If we hit the start of the program,
3081 * we assume that there are no outstanding dependencies on entry to the
3082 * program.
3083 */
3084 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3085 /* If we hit control flow, assume that there *are* outstanding
3086 * dependencies, and force their cleanup before our instruction.
3087 */
3088 if (block->start() == scan_inst) {
3089 for (int i = 0; i < write_len; i++) {
3090 if (needs_dep[i])
3091 DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
3092 }
3093 return;
3094 }
3095
3096 /* We insert our reads as late as possible on the assumption that any
3097 * instruction but a MOV that might have left us an outstanding
3098 * dependency has more latency than a MOV.
3099 */
3100 if (scan_inst->dst.file == GRF) {
3101 for (int i = 0; i < scan_inst->regs_written; i++) {
3102 int reg = scan_inst->dst.reg + i;
3103
3104 if (reg >= first_write_grf &&
3105 reg < first_write_grf + write_len &&
3106 needs_dep[reg - first_write_grf]) {
3107 DEP_RESOLVE_MOV(bld.at(block, inst), reg);
3108 needs_dep[reg - first_write_grf] = false;
3109 if (scan_inst->exec_size == 16)
3110 needs_dep[reg - first_write_grf + 1] = false;
3111 }
3112 }
3113 }
3114
3115 /* Clear the flag for registers that actually got read (as expected). */
3116 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3117
3118 /* Continue the loop only if we haven't resolved all the dependencies */
3119 int i;
3120 for (i = 0; i < write_len; i++) {
3121 if (needs_dep[i])
3122 break;
3123 }
3124 if (i == write_len)
3125 return;
3126 }
3127 }
3128
3129 /**
3130 * Implements this workaround for the original 965:
3131 *
3132 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3133 * used as a destination register until after it has been sourced by an
3134 * instruction with a different destination register."
3135 */
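/* Sketch of the fix (hypothetical registers): if a SEND writes g14..g17 and
 * a later instruction wants to write g14 before anything has read it, the
 * pass inserts a dependency-resolving MOV (DEP_RESOLVE_MOV) that sources g14
 * first, so the send's destination has been "sourced by an instruction with
 * a different destination register" as the erratum requires.
 */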
3136 void
3137 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3138 {
3139 int write_len = inst->regs_written;
3140 int first_write_grf = inst->dst.reg;
3141 bool needs_dep[BRW_MAX_MRF];
3142 assert(write_len < (int)sizeof(needs_dep) - 1);
3143
3144 memset(needs_dep, false, sizeof(needs_dep));
3145 memset(needs_dep, true, write_len);
3146 /* Walk forwards looking for writes to registers we're writing which aren't
3147 * read before being written.
3148 */
3149 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3150 /* If we hit control flow, force resolve all remaining dependencies. */
3151 if (block->end() == scan_inst) {
3152 for (int i = 0; i < write_len; i++) {
3153 if (needs_dep[i])
3154 DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
3155 }
3156 return;
3157 }
3158
3159 /* Clear the flag for registers that actually got read (as expected). */
3160 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3161
3162 /* We insert our reads as late as possible since they're reading the
3163 * result of a SEND, which has massive latency.
3164 */
3165 if (scan_inst->dst.file == GRF &&
3166 scan_inst->dst.reg >= first_write_grf &&
3167 scan_inst->dst.reg < first_write_grf + write_len &&
3168 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3169 DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
3170 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3171 }
3172
3173 /* Continue the loop only if we haven't resolved all the dependencies */
3174 int i;
3175 for (i = 0; i < write_len; i++) {
3176 if (needs_dep[i])
3177 break;
3178 }
3179 if (i == write_len)
3180 return;
3181 }
3182 }
3183
3184 void
3185 fs_visitor::insert_gen4_send_dependency_workarounds()
3186 {
3187 if (devinfo->gen != 4 || devinfo->is_g4x)
3188 return;
3189
3190 bool progress = false;
3191
3192 /* Note that we're done with register allocation, so GRF fs_regs always
3193 * have a .reg_offset of 0.
3194 */
3195
3196 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3197 if (inst->mlen != 0 && inst->dst.file == GRF) {
3198 insert_gen4_pre_send_dependency_workarounds(block, inst);
3199 insert_gen4_post_send_dependency_workarounds(block, inst);
3200 progress = true;
3201 }
3202 }
3203
3204 if (progress)
3205 invalidate_live_intervals();
3206 }
3207
3208 /**
3209 * Turns the generic expression-style uniform pull constant load instruction
3210 * into a hardware-specific series of instructions for loading a pull
3211 * constant.
3212 *
3213 * The expression style allows the CSE pass before this to optimize out
3214 * repeated loads from the same offset, and gives the pre-register-allocation
3215 * scheduling full flexibility, while the conversion to native instructions
3216 * allows the post-register-allocation scheduler the best information
3217 * possible.
3218 *
3219 * Note that execution masking for setting up pull constant loads is special:
3220 * the channels that need to be written are unrelated to the current execution
3221 * mask, since a later instruction will use one of the result channels as a
3222 * source operand for all 8 or 16 of its channels.
3223 */
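/* Illustrative Gen7+ lowering (hypothetical registers and offset):
 *
 *    uniform_pull_constant_load(8) vgrf6, surf_index, 48u
 *
 * becomes approximately
 *
 *    set_simd4x2_offset(8) vgrf9, 12u            (byte offset / 4)
 *    uniform_pull_constant_load_gen7(8) vgrf6, surf_index, vgrf9
 *
 * with an extra register reserved in the payload for a message header on
 * Gen9+.
 */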
3224 void
3225 fs_visitor::lower_uniform_pull_constant_loads()
3226 {
3227 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3228 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3229 continue;
3230
3231 if (devinfo->gen >= 7) {
3232 /* The offset arg before was a vec4-aligned byte offset. We need to
3233 * turn it into a dword offset.
3234 */
3235 fs_reg const_offset_reg = inst->src[1];
3236 assert(const_offset_reg.file == IMM &&
3237 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3238 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3239 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3240
3241 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3242 * Reserve space for the register.
3243 */
3244 if (devinfo->gen >= 9) {
3245 payload.reg_offset++;
3246 alloc.sizes[payload.reg] = 2;
3247 }
3248
3249 /* This is actually going to be a MOV, but since only the first dword
3250 * is accessed, we have a special opcode to do just that one. Note
3251 * that this needs to be an operation that will be considered a def
3252 * by live variable analysis, or register allocation will explode.
3253 */
3254 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3255 8, payload, const_offset_reg);
3256 setup->force_writemask_all = true;
3257
3258 setup->ir = inst->ir;
3259 setup->annotation = inst->annotation;
3260 inst->insert_before(block, setup);
3261
3262 /* Similarly, this will only populate the first 4 channels of the
3263 * result register (since we only use smear values from 0-3), but we
3264 * don't tell the optimizer.
3265 */
3266 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3267 inst->src[1] = payload;
3268
3269 invalidate_live_intervals();
3270 } else {
3271 /* Before register allocation, we didn't tell the scheduler about the
3272 * MRF we use. We know it's safe to use this MRF because nothing
3273 * else does except for register spill/unspill, which generates and
3274 * uses its MRF within a single IR instruction.
3275 */
3276 inst->base_mrf = 14;
3277 inst->mlen = 1;
3278 }
3279 }
3280 }
3281
3282 bool
3283 fs_visitor::lower_load_payload()
3284 {
3285 bool progress = false;
3286
3287 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3288 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3289 continue;
3290
3291 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3292 assert(inst->saturate == false);
3293
3294 const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
3295 .exec_all(inst->force_writemask_all)
3296 .at(block, inst);
3297 fs_reg dst = inst->dst;
3298
3299 /* Get rid of COMPR4. We'll add it back in if we need it */
3300 if (dst.file == MRF)
3301 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3302
3303 dst.width = 8;
3304 for (uint8_t i = 0; i < inst->header_size; i++) {
3305 if (inst->src[i].file != BAD_FILE) {
3306 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3307 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3308 mov_src.width = 8;
3309 ibld.exec_all().MOV(mov_dst, mov_src);
3310 }
3311 dst = offset(dst, 1);
3312 }
3313
3314 dst.width = inst->exec_size;
3315 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3316 inst->exec_size > 8) {
3317 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3318 * a straightforward copy. Instead, the result of the
3319 * LOAD_PAYLOAD is treated as interleaved and the first four
3320 * non-header sources are unpacked as:
3321 *
3322 * m + 0: r0
3323 * m + 1: g0
3324 * m + 2: b0
3325 * m + 3: a0
3326 * m + 4: r1
3327 * m + 5: g1
3328 * m + 6: b1
3329 * m + 7: a1
3330 *
3331 * This is used for gen <= 5 fb writes.
3332 */
3333 assert(inst->exec_size == 16);
3334 assert(inst->header_size + 4 <= inst->sources);
3335 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3336 if (inst->src[i].file != BAD_FILE) {
3337 if (devinfo->has_compr4) {
3338 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3339 compr4_dst.reg |= BRW_MRF_COMPR4;
3340 ibld.MOV(compr4_dst, inst->src[i]);
3341 } else {
3342 /* Platform doesn't have COMPR4. We have to fake it */
3343 fs_reg mov_dst = retype(dst, inst->src[i].type);
3344 mov_dst.width = 8;
3345 ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
3346 ibld.half(1).MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3347 }
3348 }
3349
3350 dst.reg++;
3351 }
3352
3353 /* The loop above only ever incremented us through the first set
3354 * of 4 registers. However, thanks to the magic of COMPR4, we
3355 * actually wrote to the first 8 registers, so we need to take
3356 * that into account now.
3357 */
3358 dst.reg += 4;
3359
3360 /* The COMPR4 code took care of the first 4 sources. We'll let
3361 * the regular path handle any remaining sources. Yes, we are
3362 * modifying the instruction but we're about to delete it so
3363 * this really doesn't hurt anything.
3364 */
3365 inst->header_size += 4;
3366 }
3367
3368 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3369 if (inst->src[i].file != BAD_FILE)
3370 ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
3371 dst = offset(dst, 1);
3372 }
3373
3374 inst->remove(block);
3375 progress = true;
3376 }
3377
3378 if (progress)
3379 invalidate_live_intervals();
3380
3381 return progress;
3382 }
3383
3384 bool
3385 fs_visitor::lower_integer_multiplication()
3386 {
3387 bool progress = false;
3388
3389 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3390 * directly, but Cherryview cannot.
3391 */
3392 if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3393 return false;
3394
3395 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3396 if (inst->opcode != BRW_OPCODE_MUL ||
3397 inst->dst.is_accumulator() ||
3398 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3399 inst->dst.type != BRW_REGISTER_TYPE_UD))
3400 continue;
3401
3402 const fs_builder ibld = bld.at(block, inst);
3403
3404 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3405 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3406 * src1 are used.
3407 *
3408 * If multiplying by an immediate value that fits in 16-bits, do a
3409 * single MUL instruction with that value in the proper location.
3410 */
3411 if (inst->src[1].file == IMM &&
3412 inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3413 if (devinfo->gen < 7) {
3414 fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3415 inst->dst.type, dispatch_width);
3416 ibld.MOV(imm, inst->src[1]);
3417 ibld.MUL(inst->dst, imm, inst->src[0]);
3418 } else {
3419 ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
3420 }
3421 } else {
3422 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3423 * do 32-bit integer multiplication in one instruction, but instead
3424 * must do a sequence (which actually calculates a 64-bit result):
3425 *
3426 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3427 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3428 * mov(8) g2<1>D acc0<8,8,1>D
3429 *
3430 * But on Gen > 6, the ability to use the second accumulator register
3431 * (acc1) for non-float data types was removed, preventing a simple
3432 * implementation in SIMD16. A 16-channel result can be calculated by
3433 * executing the three instructions twice in SIMD8, once with quarter
3434 * control of 1Q for the first eight channels and again with 2Q for
3435 * the second eight channels.
3436 *
3437 * Which accumulator register is implicitly accessed (by AccWrEnable
3438 * for instance) is determined by the quarter control. Unfortunately
3439 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3440 * implicit accumulator access by an instruction with 2Q will access
3441 * acc1 regardless of whether the data type is usable in acc1.
3442 *
3443 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3444 * integer data types.
3445 *
3446 * Since we only want the low 32-bits of the result, we can do two
3447 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3448 * adjust the high result and add them (like the mach is doing):
3449 *
3450 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3451 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3452 * shl(8) g9<1>D g8<8,8,1>D 16D
3453 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3454 *
3455 * We avoid the shl instruction by realizing that we only want to add
3456 * the low 16-bits of the "high" result to the high 16-bits of the
3457 * "low" result and using proper regioning on the add:
3458 *
3459 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3460 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3461 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3462 *
3463 * Since it does not use the (single) accumulator register, we can
3464 * schedule multi-component multiplications much better.
3465 */
3466
3467 if (inst->conditional_mod && inst->dst.is_null()) {
3468 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3469 inst->dst.type, dispatch_width);
3470 }
3471 fs_reg low = inst->dst;
3472 fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3473 inst->dst.type, dispatch_width);
3474
3475 if (devinfo->gen >= 7) {
3476 fs_reg src1_0_w = inst->src[1];
3477 fs_reg src1_1_w = inst->src[1];
3478
3479 if (inst->src[1].file == IMM) {
3480 src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3481 src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3482 } else {
3483 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3484 src1_0_w.stride = 2;
3485
3486 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3487 src1_1_w.stride = 2;
3488 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3489 }
3490 ibld.MUL(low, inst->src[0], src1_0_w);
3491 ibld.MUL(high, inst->src[0], src1_1_w);
3492 } else {
3493 fs_reg src0_0_w = inst->src[0];
3494 fs_reg src0_1_w = inst->src[0];
3495
3496 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3497 src0_0_w.stride = 2;
3498
3499 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3500 src0_1_w.stride = 2;
3501 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3502
3503 ibld.MUL(low, src0_0_w, inst->src[1]);
3504 ibld.MUL(high, src0_1_w, inst->src[1]);
3505 }
3506
3507 fs_reg dst = inst->dst;
3508 dst.type = BRW_REGISTER_TYPE_UW;
3509 dst.subreg_offset = 2;
3510 dst.stride = 2;
3511
3512 high.type = BRW_REGISTER_TYPE_UW;
3513 high.stride = 2;
3514
3515 low.type = BRW_REGISTER_TYPE_UW;
3516 low.subreg_offset = 2;
3517 low.stride = 2;
3518
3519 ibld.ADD(dst, low, high);
3520
3521 if (inst->conditional_mod) {
3522 fs_reg null(retype(brw_null_reg(), inst->dst.type));
3523 set_condmod(inst->conditional_mod,
3524 ibld.MOV(null, inst->dst));
3525 }
3526 }
3527
3528 inst->remove(block);
3529 progress = true;
3530 }
3531
3532 if (progress)
3533 invalidate_live_intervals();
3534
3535 return progress;
3536 }
3537
3538 void
3539 fs_visitor::dump_instructions()
3540 {
3541 dump_instructions(NULL);
3542 }
3543
3544 void
3545 fs_visitor::dump_instructions(const char *name)
3546 {
3547 FILE *file = stderr;
3548 if (name && geteuid() != 0) {
3549 file = fopen(name, "w");
3550 if (!file)
3551 file = stderr;
3552 }
3553
3554 if (cfg) {
3555 calculate_register_pressure();
3556 int ip = 0, max_pressure = 0;
3557 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3558 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3559 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3560 dump_instruction(inst, file);
3561 ip++;
3562 }
3563 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3564 } else {
3565 int ip = 0;
3566 foreach_in_list(backend_instruction, inst, &instructions) {
3567 fprintf(file, "%4d: ", ip++);
3568 dump_instruction(inst, file);
3569 }
3570 }
3571
3572 if (file != stderr) {
3573 fclose(file);
3574 }
3575 }
3576
3577 void
3578 fs_visitor::dump_instruction(backend_instruction *be_inst)
3579 {
3580 dump_instruction(be_inst, stderr);
3581 }
3582
3583 void
3584 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3585 {
3586 fs_inst *inst = (fs_inst *)be_inst;
3587
3588 if (inst->predicate) {
3589 fprintf(file, "(%cf0.%d) ",
3590 inst->predicate_inverse ? '-' : '+',
3591 inst->flag_subreg);
3592 }
3593
3594 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3595 if (inst->saturate)
3596 fprintf(file, ".sat");
3597 if (inst->conditional_mod) {
3598 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3599 if (!inst->predicate &&
3600 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3601 inst->opcode != BRW_OPCODE_IF &&
3602 inst->opcode != BRW_OPCODE_WHILE))) {
3603 fprintf(file, ".f0.%d", inst->flag_subreg);
3604 }
3605 }
3606 fprintf(file, "(%d) ", inst->exec_size);
3607
3608 if (inst->mlen) {
3609 fprintf(file, "(mlen: %d) ", inst->mlen);
3610 }
3611
3612 switch (inst->dst.file) {
3613 case GRF:
3614 fprintf(file, "vgrf%d", inst->dst.reg);
3615 if (inst->dst.width != dispatch_width)
3616 fprintf(file, "@%d", inst->dst.width);
3617 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3618 inst->dst.subreg_offset)
3619 fprintf(file, "+%d.%d",
3620 inst->dst.reg_offset, inst->dst.subreg_offset);
3621 break;
3622 case MRF:
3623 fprintf(file, "m%d", inst->dst.reg);
3624 break;
3625 case BAD_FILE:
3626 fprintf(file, "(null)");
3627 break;
3628 case UNIFORM:
3629 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3630 break;
3631 case ATTR:
3632 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3633 break;
3634 case HW_REG:
3635 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3636 switch (inst->dst.fixed_hw_reg.nr) {
3637 case BRW_ARF_NULL:
3638 fprintf(file, "null");
3639 break;
3640 case BRW_ARF_ADDRESS:
3641 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3642 break;
3643 case BRW_ARF_ACCUMULATOR:
3644 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3645 break;
3646 case BRW_ARF_FLAG:
3647 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3648 inst->dst.fixed_hw_reg.subnr);
3649 break;
3650 default:
3651 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3652 inst->dst.fixed_hw_reg.subnr);
3653 break;
3654 }
3655 } else {
3656 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3657 }
3658 if (inst->dst.fixed_hw_reg.subnr)
3659 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3660 break;
3661 default:
3662 fprintf(file, "???");
3663 break;
3664 }
3665 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3666
3667 for (int i = 0; i < inst->sources; i++) {
3668 if (inst->src[i].negate)
3669 fprintf(file, "-");
3670 if (inst->src[i].abs)
3671 fprintf(file, "|");
3672 switch (inst->src[i].file) {
3673 case GRF:
3674 fprintf(file, "vgrf%d", inst->src[i].reg);
3675 if (inst->src[i].width != dispatch_width)
3676 fprintf(file, "@%d", inst->src[i].width);
3677 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3678 inst->src[i].subreg_offset)
3679 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3680 inst->src[i].subreg_offset);
3681 break;
3682 case MRF:
3683 fprintf(file, "***m%d***", inst->src[i].reg);
3684 break;
3685 case ATTR:
3686 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3687 break;
3688 case UNIFORM:
3689 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3690 if (inst->src[i].reladdr) {
3691 fprintf(file, "+reladdr");
3692 } else if (inst->src[i].subreg_offset) {
3693 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3694 inst->src[i].subreg_offset);
3695 }
3696 break;
3697 case BAD_FILE:
3698 fprintf(file, "(null)");
3699 break;
3700 case IMM:
3701 switch (inst->src[i].type) {
3702 case BRW_REGISTER_TYPE_F:
3703 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3704 break;
3705 case BRW_REGISTER_TYPE_W:
3706 case BRW_REGISTER_TYPE_D:
3707 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3708 break;
3709 case BRW_REGISTER_TYPE_UW:
3710 case BRW_REGISTER_TYPE_UD:
3711 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3712 break;
3713 case BRW_REGISTER_TYPE_VF:
3714 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3715 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3716 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3717 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3718 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3719 break;
3720 default:
3721 fprintf(file, "???");
3722 break;
3723 }
3724 break;
3725 case HW_REG:
3726 if (inst->src[i].fixed_hw_reg.negate)
3727 fprintf(file, "-");
3728 if (inst->src[i].fixed_hw_reg.abs)
3729 fprintf(file, "|");
3730 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3731 switch (inst->src[i].fixed_hw_reg.nr) {
3732 case BRW_ARF_NULL:
3733 fprintf(file, "null");
3734 break;
3735 case BRW_ARF_ADDRESS:
3736 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3737 break;
3738 case BRW_ARF_ACCUMULATOR:
3739 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3740 break;
3741 case BRW_ARF_FLAG:
3742 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3743 inst->src[i].fixed_hw_reg.subnr);
3744 break;
3745 default:
3746 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3747 inst->src[i].fixed_hw_reg.subnr);
3748 break;
3749 }
3750 } else {
3751 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3752 }
3753 if (inst->src[i].fixed_hw_reg.subnr)
3754 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3755 if (inst->src[i].fixed_hw_reg.abs)
3756 fprintf(file, "|");
3757 break;
3758 default:
3759 fprintf(file, "???");
3760 break;
3761 }
3762 if (inst->src[i].abs)
3763 fprintf(file, "|");
3764
3765 if (inst->src[i].file != IMM) {
3766 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3767 }
3768
3769 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3770 fprintf(file, ", ");
3771 }
3772
3773 fprintf(file, " ");
3774
3775 if (dispatch_width == 16 && inst->exec_size == 8) {
3776 if (inst->force_sechalf)
3777 fprintf(file, "2ndhalf ");
3778 else
3779 fprintf(file, "1sthalf ");
3780 }
3781
3782 fprintf(file, "\n");
3783 }
3784
3785 /**
3786 * Possibly returns an instruction that set up @param reg.
3787 *
3788 * Sometimes we want to take the result of some expression/variable
3789 * dereference tree and rewrite the instruction generating the result
3790 * of the tree. When processing the tree, we know that the
3791 * instructions generated are all writing temporaries that are dead
3792 * outside of this tree. So, if we have some instructions that write
3793 * a temporary, we're free to point that temp write somewhere else.
3794 *
3795 * Note that this doesn't guarantee that the instruction wrote only reg --
3796 * it might be the size=4 destination of a texture instruction.
3797 */
3798 fs_inst *
3799 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3800 fs_inst *end,
3801 const fs_reg &reg)
3802 {
3803 if (end == start ||
3804 end->is_partial_write() ||
3805 reg.reladdr ||
3806 !reg.equals(end->dst)) {
3807 return NULL;
3808 } else {
3809 return end;
3810 }
3811 }
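/* Illustrative use only (not a verbatim caller from this file): after
 * emitting an expression tree into a temporary, a visitor can check whether
 * the last instruction emitted is the one producing the temporary and, if
 * so, retarget its write at the real destination instead of emitting an
 * extra MOV:
 *
 *    fs_inst *end = (fs_inst *) instructions.get_tail();
 *    fs_inst *gen = get_instruction_generating_reg(start, end, tmp);
 *    if (gen)
 *       gen->dst = dst;
 *    else
 *       bld.MOV(dst, tmp);
 */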
3812
3813 void
3814 fs_visitor::setup_payload_gen6()
3815 {
3816 bool uses_depth =
3817 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3818 unsigned barycentric_interp_modes =
3819 (stage == MESA_SHADER_FRAGMENT) ?
3820 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3821
3822 assert(devinfo->gen >= 6);
3823
3824 /* R0-1: masks, pixel X/Y coordinates. */
3825 payload.num_regs = 2;
3826 /* R2: only for 32-pixel dispatch. */
3827
3828 /* R3-26: barycentric interpolation coordinates. These appear in the
3829 * same order that they appear in the brw_wm_barycentric_interp_mode
3830 * enum. Each set of coordinates occupies 2 registers if dispatch width
3831 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3832 * appear if they were enabled using the "Barycentric Interpolation
3833 * Mode" bits in WM_STATE.
3834 */
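/* A worked example (hypothetical mode mask): with two barycentric modes
 * enabled, say perspective pixel and perspective centroid, a SIMD16 shader
 * gets barycentric_coord_reg[] entries of 2 and 6 and ends up with
 * payload.num_regs == 10 after this loop; in SIMD8 each set takes only two
 * registers, giving entries of 2 and 4 and a final count of 6.
 */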
3835 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3836 if (barycentric_interp_modes & (1 << i)) {
3837 payload.barycentric_coord_reg[i] = payload.num_regs;
3838 payload.num_regs += 2;
3839 if (dispatch_width == 16) {
3840 payload.num_regs += 2;
3841 }
3842 }
3843 }
3844
3845 /* R27: interpolated depth if uses source depth */
3846 if (uses_depth) {
3847 payload.source_depth_reg = payload.num_regs;
3848 payload.num_regs++;
3849 if (dispatch_width == 16) {
3850 /* R28: interpolated depth if not SIMD8. */
3851 payload.num_regs++;
3852 }
3853 }
3854 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3855 if (uses_depth) {
3856 payload.source_w_reg = payload.num_regs;
3857 payload.num_regs++;
3858 if (dispatch_width == 16) {
3859 /* R30: interpolated W if not SIMD8. */
3860 payload.num_regs++;
3861 }
3862 }
3863
3864 if (stage == MESA_SHADER_FRAGMENT) {
3865 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3866 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3867 prog_data->uses_pos_offset = key->compute_pos_offset;
3868 /* R31: MSAA position offsets. */
3869 if (prog_data->uses_pos_offset) {
3870 payload.sample_pos_reg = payload.num_regs;
3871 payload.num_regs++;
3872 }
3873 }
3874
3875 /* R32: MSAA input coverage mask */
3876 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3877 assert(devinfo->gen >= 7);
3878 payload.sample_mask_in_reg = payload.num_regs;
3879 payload.num_regs++;
3880 if (dispatch_width == 16) {
3881 /* R33: input coverage mask if not SIMD8. */
3882 payload.num_regs++;
3883 }
3884 }
3885
3886 /* R34-: bary for 32-pixel. */
3887 /* R58-59: interp W for 32-pixel. */
3888
3889 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3890 source_depth_to_render_target = true;
3891 }
3892 }
3893
3894 void
3895 fs_visitor::setup_vs_payload()
3896 {
3897 /* R0: thread header, R1: urb handles */
3898 payload.num_regs = 2;
3899 }
3900
3901 void
3902 fs_visitor::setup_cs_payload()
3903 {
3904 assert(devinfo->gen >= 7);
3905
3906 payload.num_regs = 1;
3907 }
3908
3909 void
3910 fs_visitor::assign_binding_table_offsets()
3911 {
3912 assert(stage == MESA_SHADER_FRAGMENT);
3913 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3914 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3915 uint32_t next_binding_table_offset = 0;
3916
3917 /* If there are no color regions, we still perform an FB write to a null
3918 * renderbuffer, which we place at surface index 0.
3919 */
3920 prog_data->binding_table.render_target_start = next_binding_table_offset;
3921 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3922
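/* For example (hypothetical key): with nr_color_regions == 2 the render
 * targets occupy surface indices 0 and 1 and the common entries set up
 * below (textures, UBOs, etc.) start at 2; with no color regions the null
 * render target sits alone at index 0 and the common entries start at 1.
 */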
3923 assign_common_binding_table_offsets(next_binding_table_offset);
3924 }
3925
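/**
 * Estimate register pressure by summing, at each instruction IP, the sizes
 * of all virtual GRFs whose live interval covers that IP.
 *
 * For example (hypothetical live range): a two-register VGRF live from
 * ip 4 through ip 9 adds 2 to each of regs_live_at_ip[4] .. [9].
 */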
3926 void
3927 fs_visitor::calculate_register_pressure()
3928 {
3929 invalidate_live_intervals();
3930 calculate_live_intervals();
3931
3932 unsigned num_instructions = 0;
3933 foreach_block(block, cfg)
3934 num_instructions += block->instructions.length();
3935
3936 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3937
3938 for (unsigned reg = 0; reg < alloc.count; reg++) {
3939 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3940 regs_live_at_ip[ip] += alloc.sizes[reg];
3941 }
3942 }
3943
3944 void
3945 fs_visitor::optimize()
3946 {
3947 /* bld is the common builder object we used to translate the program into
3948 * i965 IR; at this point it points at the end of the program. For the
3949 * optimization and lowering passes coming next, any code added after the
3950 * end of the program without having explicitly called fs_builder::at()
3951 * clearly points at a mistake. Ideally optimization passes wouldn't be
3952 * part of the visitor so they wouldn't have access to bld at all, but
3953 * they do, so, just in case some pass forgets to ask for a location
3954 * explicitly, set it to NULL here to make it trip.
3955 */
3956 bld = bld.at(NULL, NULL);
3957
3958 split_virtual_grfs();
3959
3960 move_uniform_array_access_to_pull_constants();
3961 assign_constant_locations();
3962 demote_pull_constants();
3963
3964 #define OPT(pass, args...) ({ \
3965 pass_num++; \
3966 bool this_progress = pass(args); \
3967 \
3968 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3969 char filename[64]; \
3970 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3971 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3972 \
3973 backend_shader::dump_instructions(filename); \
3974 } \
3975 \
3976 progress = progress || this_progress; \
3977 this_progress; \
3978 })
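/* When INTEL_DEBUG & DEBUG_OPTIMIZER is set, every pass that reports
 * progress dumps the instruction list to a file named by the snprintf()
 * above, e.g. (hypothetical values) something like
 * "FS8-0003-01-05-opt_cse" for a SIMD8 shader with program name 3,
 * iteration 1, pass 5, where the leading "FS" stands for whatever
 * stage_abbrev holds.
 */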
3979
3980 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3981 char filename[64];
3982 snprintf(filename, 64, "%s%d-%04d-00-start",
3983 stage_abbrev, dispatch_width,
3984 shader_prog ? shader_prog->Name : 0);
3985
3986 backend_shader::dump_instructions(filename);
3987 }
3988
3989 bool progress;
3990 int iteration = 0;
3991 int pass_num = 0;
3992 do {
3993 progress = false;
3994 pass_num = 0;
3995 iteration++;
3996
3997 OPT(remove_duplicate_mrf_writes);
3998
3999 OPT(opt_algebraic);
4000 OPT(opt_cse);
4001 OPT(opt_copy_propagate);
4002 OPT(opt_peephole_predicated_break);
4003 OPT(opt_cmod_propagation);
4004 OPT(dead_code_eliminate);
4005 OPT(opt_peephole_sel);
4006 OPT(dead_control_flow_eliminate, this);
4007 OPT(opt_register_renaming);
4008 OPT(opt_redundant_discard_jumps);
4009 OPT(opt_saturate_propagation);
4010 OPT(opt_zero_samples);
4011 OPT(register_coalesce);
4012 OPT(compute_to_mrf);
4013 OPT(eliminate_find_live_channel);
4014
4015 OPT(compact_virtual_grfs);
4016 } while (progress);
4017
4018 pass_num = 0;
4019
4020 OPT(opt_sampler_eot);
4021
4022 if (OPT(lower_load_payload)) {
4023 split_virtual_grfs();
4024 OPT(register_coalesce);
4025 OPT(compute_to_mrf);
4026 OPT(dead_code_eliminate);
4027 }
4028
4029 OPT(opt_combine_constants);
4030 OPT(lower_integer_multiplication);
4031
4032 lower_uniform_pull_constant_loads();
4033 }
4034
4035 /**
4036 * Three-source instructions must have a GRF/MRF destination register.
4037 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
4038 */
4039 void
4040 fs_visitor::fixup_3src_null_dest()
4041 {
4042 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4043 if (inst->is_3src() && inst->dst.is_null()) {
4044 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4045 inst->dst.type);
4046 }
4047 }
4048 }
4049
4050 void
4051 fs_visitor::allocate_registers()
4052 {
4053 bool allocated_without_spills;
4054
4055 static const enum instruction_scheduler_mode pre_modes[] = {
4056 SCHEDULE_PRE,
4057 SCHEDULE_PRE_NON_LIFO,
4058 SCHEDULE_PRE_LIFO,
4059 };
4060
4061 /* Try each scheduling heuristic to see if it can successfully register
4062 * allocate without spilling. They should be ordered by decreasing
4063 * performance but increasing likelihood of allocating.
4064 */
4065 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4066 schedule_instructions(pre_modes[i]);
4067
4068 if (0) {
4069 assign_regs_trivial();
4070 allocated_without_spills = true;
4071 } else {
4072 allocated_without_spills = assign_regs(false);
4073 }
4074 if (allocated_without_spills)
4075 break;
4076 }
4077
4078 if (!allocated_without_spills) {
4079 /* We assume that any spilling is worse than just dropping back to
4080 * SIMD8. There's probably actually some intermediate point where
4081 * SIMD16 with a couple of spills is still better.
4082 */
4083 if (dispatch_width == 16) {
4084 fail("Failure to register allocate. Reduce number of "
4085 "live scalar values to avoid this.");
4086 } else {
4087 perf_debug("%s shader triggered register spilling. "
4088 "Try reducing the number of live scalar values to "
4089 "improve performance.\n", stage_name);
4090 }
4091
4092 /* Since we're out of heuristics, just go spill registers until we
4093 * get an allocation.
4094 */
4095 while (!assign_regs(true)) {
4096 if (failed)
4097 break;
4098 }
4099 }
4100
4101 /* This must come after all optimization and register allocation, since
4102 * it inserts dead code that happens to have side effects, and it does
4103 * so based on the actual physical registers in use.
4104 */
4105 insert_gen4_send_dependency_workarounds();
4106
4107 if (failed)
4108 return;
4109
4110 if (!allocated_without_spills)
4111 schedule_instructions(SCHEDULE_POST);
4112
4113 if (last_scratch > 0)
4114 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4115 }
4116
4117 bool
4118 fs_visitor::run_vs()
4119 {
4120 assert(stage == MESA_SHADER_VERTEX);
4121
4122 assign_common_binding_table_offsets(0);
4123 setup_vs_payload();
4124
4125 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4126 emit_shader_time_begin();
4127
4128 emit_nir_code();
4129
4130 if (failed)
4131 return false;
4132
4133 emit_urb_writes();
4134
4135 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4136 emit_shader_time_end();
4137
4138 calculate_cfg();
4139
4140 optimize();
4141
4142 assign_curb_setup();
4143 assign_vs_urb_setup();
4144
4145 fixup_3src_null_dest();
4146 allocate_registers();
4147
4148 return !failed;
4149 }
4150
4151 bool
4152 fs_visitor::run_fs()
4153 {
4154 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4155 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4156
4157 assert(stage == MESA_SHADER_FRAGMENT);
4158
4159 sanity_param_count = prog->Parameters->NumParameters;
4160
4161 assign_binding_table_offsets();
4162
4163 if (devinfo->gen >= 6)
4164 setup_payload_gen6();
4165 else
4166 setup_payload_gen4();
4167
4168 if (0) {
4169 emit_dummy_fs();
4170 } else if (brw->use_rep_send && dispatch_width == 16) {
4171 emit_repclear_shader();
4172 } else {
4173 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4174 emit_shader_time_begin();
4175
4176 calculate_urb_setup();
4177 if (prog->InputsRead > 0) {
4178 if (devinfo->gen < 6)
4179 emit_interpolation_setup_gen4();
4180 else
4181 emit_interpolation_setup_gen6();
4182 }
4183
4184 /* We handle discards by keeping track of the still-live pixels in f0.1.
4185 * Initialize it with the dispatched pixels.
4186 */
4187 if (wm_prog_data->uses_kill) {
4188 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4189 discard_init->flag_subreg = 1;
4190 }
4191
4192 /* Generate FS IR for main(). (the visitor only descends into
4193 * functions called "main").
4194 */
4195 emit_nir_code();
4196
4197 if (failed)
4198 return false;
4199
4200 if (wm_prog_data->uses_kill)
4201 emit(FS_OPCODE_PLACEHOLDER_HALT);
4202
4203 if (wm_key->alpha_test_func)
4204 emit_alpha_test();
4205
4206 emit_fb_writes();
4207
4208 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4209 emit_shader_time_end();
4210
4211 calculate_cfg();
4212
4213 optimize();
4214
4215 assign_curb_setup();
4216 assign_urb_setup();
4217
4218 fixup_3src_null_dest();
4219 allocate_registers();
4220
4221 if (failed)
4222 return false;
4223 }
4224
4225 if (dispatch_width == 8)
4226 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4227 else
4228 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4229
4230 /* If any state parameters were appended, then ParameterValues could have
4231 * been realloced, in which case the driver uniform storage set up by
4232 * _mesa_associate_uniform_storage() would point to freed memory. Make
4233 * sure that didn't happen.
4234 */
4235 assert(sanity_param_count == prog->Parameters->NumParameters);
4236
4237 return !failed;
4238 }
4239
4240 bool
4241 fs_visitor::run_cs()
4242 {
4243 assert(stage == MESA_SHADER_COMPUTE);
4244 assert(shader);
4245
4246 sanity_param_count = prog->Parameters->NumParameters;
4247
4248 assign_common_binding_table_offsets(0);
4249
4250 setup_cs_payload();
4251
4252 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4253 emit_shader_time_begin();
4254
4255 emit_nir_code();
4256
4257 if (failed)
4258 return false;
4259
4260 emit_cs_terminate();
4261
4262 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4263 emit_shader_time_end();
4264
4265 calculate_cfg();
4266
4267 optimize();
4268
4269 assign_curb_setup();
4270
4271 fixup_3src_null_dest();
4272 allocate_registers();
4273
4274 if (failed)
4275 return false;
4276
4277 /* If any state parameters were appended, then ParameterValues could have
4278 * been realloced, in which case the driver uniform storage set up by
4279 * _mesa_associate_uniform_storage() would point to freed memory. Make
4280 * sure that didn't happen.
4281 */
4282 assert(sanity_param_count == prog->Parameters->NumParameters);
4283
4284 return !failed;
4285 }
4286
4287 const unsigned *
4288 brw_wm_fs_emit(struct brw_context *brw,
4289 void *mem_ctx,
4290 const struct brw_wm_prog_key *key,
4291 struct brw_wm_prog_data *prog_data,
4292 struct gl_fragment_program *fp,
4293 struct gl_shader_program *prog,
4294 unsigned *final_assembly_size)
4295 {
4296 bool start_busy = false;
4297 double start_time = 0;
4298
4299 if (unlikely(brw->perf_debug)) {
4300 start_busy = (brw->batch.last_bo &&
4301 drm_intel_bo_busy(brw->batch.last_bo));
4302 start_time = get_time();
4303 }
4304
4305 struct brw_shader *shader = NULL;
4306 if (prog)
4307 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4308
4309 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4310 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4311
4312 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4313 */
4314 fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4315 prog, &fp->Base, 8);
4316 if (!v.run_fs()) {
4317 if (prog) {
4318 prog->LinkStatus = false;
4319 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4320 }
4321
4322 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4323 v.fail_msg);
4324
4325 return NULL;
4326 }
4327
4328 cfg_t *simd16_cfg = NULL;
4329 fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4330 prog, &fp->Base, 16);
4331 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4332 if (!v.simd16_unsupported) {
4333 /* Try a SIMD16 compile */
4334 v2.import_uniforms(&v);
4335 if (!v2.run_fs()) {
4336 perf_debug("SIMD16 shader failed to compile, falling back to "
4337 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4338 } else {
4339 simd16_cfg = v2.cfg;
4340 }
4341 } else {
4342 perf_debug("SIMD16 shader unsupported, falling back to "
4343 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4344 }
4345 }
4346
4347 cfg_t *simd8_cfg;
4348 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4349 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4350 simd8_cfg = NULL;
4351 prog_data->no_8 = true;
4352 } else {
4353 simd8_cfg = v.cfg;
4354 prog_data->no_8 = false;
4355 }
4356
4357 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4358 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4359
4360 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4361 char *name;
4362 if (prog)
4363 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4364 prog->Label ? prog->Label : "unnamed",
4365 prog->Name);
4366 else
4367 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4368
4369 g.enable_debug(name);
4370 }
4371
4372 if (simd8_cfg)
4373 g.generate_code(simd8_cfg, 8);
4374 if (simd16_cfg)
4375 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4376
4377 if (unlikely(brw->perf_debug) && shader) {
4378 if (shader->compiled_once)
4379 brw_wm_debug_recompile(brw, prog, key);
4380 shader->compiled_once = true;
4381
4382 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4383 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4384 (get_time() - start_time) * 1000);
4385 }
4386 }
4387
4388 return g.get_assembly(final_assembly_size);
4389 }
4390
4391 extern "C" bool
4392 brw_fs_precompile(struct gl_context *ctx,
4393 struct gl_shader_program *shader_prog,
4394 struct gl_program *prog)
4395 {
4396 struct brw_context *brw = brw_context(ctx);
4397 struct brw_wm_prog_key key;
4398
4399 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4400 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4401 bool program_uses_dfdy = fp->UsesDFdy;
4402
4403 memset(&key, 0, sizeof(key));
4404
4405 if (brw->gen < 6) {
4406 if (fp->UsesKill)
4407 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4408
4409 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4410 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4411
4412 /* Just assume depth testing. */
4413 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4414 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4415 }
4416
4417 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4418 BRW_FS_VARYING_INPUT_MASK) > 16)
4419 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4420
4421 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4422
4423 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4424 key.drawable_height = ctx->DrawBuffer->Height;
4425 }
4426
4427 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4428 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4429 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4430
4431 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4432 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4433 key.nr_color_regions > 1;
4434 }
4435
4436 key.program_string_id = bfp->id;
4437
4438 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4439 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4440
4441 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4442
4443 brw->wm.base.prog_offset = old_prog_offset;
4444 brw->wm.prog_data = old_prog_data;
4445
4446 return success;
4447 }
4448
4449 void
4450 brw_setup_tex_for_precompile(struct brw_context *brw,
4451 struct brw_sampler_prog_key_data *tex,
4452 struct gl_program *prog)
4453 {
4454 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4455 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4456 for (unsigned i = 0; i < sampler_count; i++) {
4457 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4458 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4459 tex->swizzles[i] =
4460 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4461 } else {
4462 /* Color sampler: assume no swizzling. */
4463 tex->swizzles[i] = SWIZZLE_XYZW;
4464 }
4465 }
4466 }