i965/fs: Migrate FS interpolation code to the IR builder.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 using namespace brw;
53
54 void
55 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
56 const fs_reg *src, unsigned sources)
57 {
58 memset(this, 0, sizeof(*this));
59
60 this->src = new fs_reg[MAX2(sources, 3)];
61 for (unsigned i = 0; i < sources; i++)
62 this->src[i] = src[i];
63
64 this->opcode = opcode;
65 this->dst = dst;
66 this->sources = sources;
67 this->exec_size = exec_size;
68
69 assert(dst.file != IMM && dst.file != UNIFORM);
70
71 /* If exec_size == 0, try to guess it from the registers. Since all
72 * manner of things may use hardware registers, we first try to guess
73 * based on GRF registers. If this fails, we will go ahead and take the
74 * width from the destination register.
75 */
76 if (this->exec_size == 0) {
77 if (dst.file == GRF) {
78 this->exec_size = dst.width;
79 } else {
80 for (unsigned i = 0; i < sources; ++i) {
81 if (src[i].file != GRF && src[i].file != ATTR)
82 continue;
83
84 if (this->exec_size <= 1)
85 this->exec_size = src[i].width;
86 assert(src[i].width == 1 || src[i].width == this->exec_size);
87 }
88 }
89
90 if (this->exec_size == 0 && dst.file != BAD_FILE)
91 this->exec_size = dst.width;
92 }
93 assert(this->exec_size != 0);
94
95 this->conditional_mod = BRW_CONDITIONAL_NONE;
96
97 /* This will be the case for almost all instructions. */
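   /* As a worked example of the formula below: a 16-wide float destination
    * with stride 1 covers DIV_ROUND_UP(16 * 1 * 4, 32) == 2 GRFs, while a
    * stride-0 destination still counts as a single GRF thanks to the MAX2.
    */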
98 switch (dst.file) {
99 case GRF:
100 case HW_REG:
101 case MRF:
102 case ATTR:
103 this->regs_written =
104 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
105 break;
106 case BAD_FILE:
107 this->regs_written = 0;
108 break;
109 case IMM:
110 case UNIFORM:
111 unreachable("Invalid destination register file");
112 default:
113 unreachable("Invalid register file");
114 }
115
116 this->writes_accumulator = false;
117 }
118
119 fs_inst::fs_inst()
120 {
121 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
122 }
123
124 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
125 {
126 init(opcode, exec_size, reg_undef, NULL, 0);
127 }
128
129 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
130 {
131 init(opcode, 0, dst, NULL, 0);
132 }
133
134 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
135 const fs_reg &src0)
136 {
137 const fs_reg src[1] = { src0 };
138 init(opcode, exec_size, dst, src, 1);
139 }
140
141 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
142 {
143 const fs_reg src[1] = { src0 };
144 init(opcode, 0, dst, src, 1);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
148 const fs_reg &src0, const fs_reg &src1)
149 {
150 const fs_reg src[2] = { src0, src1 };
151 init(opcode, exec_size, dst, src, 2);
152 }
153
154 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
155 const fs_reg &src1)
156 {
157 const fs_reg src[2] = { src0, src1 };
158 init(opcode, 0, dst, src, 2);
159 }
160
161 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
162 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
163 {
164 const fs_reg src[3] = { src0, src1, src2 };
165 init(opcode, exec_size, dst, src, 3);
166 }
167
168 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
169 const fs_reg &src1, const fs_reg &src2)
170 {
171 const fs_reg src[3] = { src0, src1, src2 };
172 init(opcode, 0, dst, src, 3);
173 }
174
175 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
176 const fs_reg src[], unsigned sources)
177 {
178 init(opcode, 0, dst, src, sources);
179 }
180
181 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
182 const fs_reg src[], unsigned sources)
183 {
184 init(opcode, exec_width, dst, src, sources);
185 }
186
187 fs_inst::fs_inst(const fs_inst &that)
188 {
189 memcpy(this, &that, sizeof(that));
190
191 this->src = new fs_reg[MAX2(that.sources, 3)];
192
193 for (unsigned i = 0; i < that.sources; i++)
194 this->src[i] = that.src[i];
195 }
196
197 fs_inst::~fs_inst()
198 {
199 delete[] this->src;
200 }
201
202 void
203 fs_inst::resize_sources(uint8_t num_sources)
204 {
205 if (this->sources != num_sources) {
206 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
207
208 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
209 src[i] = this->src[i];
210
211 delete[] this->src;
212 this->src = src;
213 this->sources = num_sources;
214 }
215 }
216
217 #define ALU1(op) \
218 fs_inst * \
219 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
220 { \
221 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
222 }
223
224 #define ALU2(op) \
225 fs_inst * \
226 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
227 const fs_reg &src1) \
228 { \
229 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
230 }
231
232 #define ALU2_ACC(op) \
233 fs_inst * \
234 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
235 const fs_reg &src1) \
236 { \
237 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
238 inst->writes_accumulator = true; \
239 return inst; \
240 }
241
242 #define ALU3(op) \
243 fs_inst * \
244 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
245 const fs_reg &src1, const fs_reg &src2) \
246 { \
247 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
248 }
249
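/* For reference, ALU2(ADD) expands (modulo line breaks) to:
 *
 *    fs_inst *
 *    fs_visitor::ADD(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */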
250 ALU1(NOT)
251 ALU1(MOV)
252 ALU1(FRC)
253 ALU1(RNDD)
254 ALU1(RNDE)
255 ALU1(RNDZ)
256 ALU2(ADD)
257 ALU2(MUL)
258 ALU2_ACC(MACH)
259 ALU2(AND)
260 ALU2(OR)
261 ALU2(XOR)
262 ALU2(SHL)
263 ALU2(SHR)
264 ALU2(ASR)
265 ALU3(LRP)
266 ALU1(BFREV)
267 ALU3(BFE)
268 ALU2(BFI1)
269 ALU3(BFI2)
270 ALU1(FBH)
271 ALU1(FBL)
272 ALU1(CBIT)
273 ALU3(MAD)
274 ALU2_ACC(ADDC)
275 ALU2_ACC(SUBB)
276 ALU2(SEL)
277 ALU2(MAC)
278
279 /** Gen4 predicated IF. */
280 fs_inst *
281 fs_visitor::IF(enum brw_predicate predicate)
282 {
283 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
284 inst->predicate = predicate;
285 return inst;
286 }
287
288 /** Gen6 IF with embedded comparison. */
289 fs_inst *
290 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
291 enum brw_conditional_mod condition)
292 {
293 assert(devinfo->gen == 6);
294 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
295 reg_null_d, src0, src1);
296 inst->conditional_mod = condition;
297 return inst;
298 }
299
300 /**
301 * CMP: Sets the low bit of the destination channels with the result
302 * of the comparison, while the upper bits are undefined, and updates
303 * the flag register with the packed 16 bits of the result.
304 */
305 fs_inst *
306 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
307 enum brw_conditional_mod condition)
308 {
309 fs_inst *inst;
310
311 /* Take the instruction:
312 *
313 * CMP null<d> src0<f> src1<f>
314 *
315 * Original gen4 does type conversion to the destination type before
316 * comparison, producing garbage results for floating point comparisons.
317 *
318 * The destination type doesn't matter on newer generations, so we set the
319 * type to match src0 so we can compact the instruction.
320 */
321 dst.type = src0.type;
322 if (dst.file == HW_REG)
323 dst.fixed_hw_reg.type = dst.type;
324
325 resolve_ud_negate(&src0);
326 resolve_ud_negate(&src1);
327
328 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
329 inst->conditional_mod = condition;
330
331 return inst;
332 }
333
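/* Build a LOAD_PAYLOAD instruction and account for the GRFs it writes:
 * header_size registers for the header plus (dst.width / 8) registers per
 * remaining per-channel source.  For example, a SIMD16 payload with one
 * header register and three per-channel sources is counted as
 * 1 + 3 * (16 / 8) == 7 GRFs.
 */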
334 fs_inst *
335 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
336 int header_size)
337 {
338 assert(dst.width % 8 == 0);
339 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
340 dst, src, sources);
341 inst->header_size = header_size;
342
343 for (int i = 0; i < header_size; i++)
344 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
345 inst->regs_written = header_size;
346
347 for (int i = header_size; i < sources; ++i)
348 assert(src[i].file != GRF || src[i].width == dst.width);
349 inst->regs_written += (sources - header_size) * (dst.width / 8);
350
351 return inst;
352 }
353
354 void
355 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
356 const fs_reg &dst,
357 const fs_reg &surf_index,
358 const fs_reg &varying_offset,
359 uint32_t const_offset)
360 {
361 /* We have our constant surface use a pitch of 4 bytes, so our index can
362 * be any component of a vector, and then we load 4 contiguous
363 * components starting from that.
364 *
365 * We break down the const_offset to a portion added to the variable
366 * offset and a portion done using reg_offset, which means that if you
367 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
368 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
369 * CSE can later notice that those loads are all the same and eliminate
370 * the redundant ones.
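    *
    * As a small worked example: with const_offset == 7, vec4_offset below is
    * biased by (7 & ~3) == 4, the message loads a whole vec4 from there, and
    * the final MOV reads component (7 & 3) == 3 of the returned vec4 (scaled
    * by the longer return length used on the gen4 SIMD16 path).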
371 */
372 fs_reg vec4_offset = vgrf(glsl_type::int_type);
373 bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
374
375 int scale = 1;
376 if (devinfo->gen == 4 && dst.width == 8) {
377 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
378 * u, v, r) as parameters, or we can just use the SIMD16 message
379 * consisting of (header, u). We choose the second, at the cost of a
380 * longer return length.
381 */
382 scale = 2;
383 }
384
385 enum opcode op;
386 if (devinfo->gen >= 7)
387 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
388 else
389 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
390
391 assert(dst.width % 8 == 0);
392 int regs_written = 4 * (dst.width / 8) * scale;
393 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
394 dst.type, dst.width);
395 fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
396 inst->regs_written = regs_written;
397
398 if (devinfo->gen < 7) {
399 inst->base_mrf = 13;
400 inst->header_size = 1;
401 if (devinfo->gen == 4)
402 inst->mlen = 3;
403 else
404 inst->mlen = 1 + dispatch_width / 8;
405 }
406
407 bld.MOV(dst, offset(vec4_result, (const_offset & 3) * scale));
408 }
409
410 /**
411 * A helper for MOV generation for fixing up broken hardware SEND dependency
412 * handling.
413 */
414 void
415 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
416 {
417 /* The caller always wants the uncompressed (SIMD8) form so that it emits
418 * the minimal extra dependencies and doesn't have to align its regs to 2.
419 */
420 const fs_builder ubld = bld.annotate("send dependency resolve")
421 .half(0);
422
423 ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
424 }
425
426 bool
427 fs_inst::equals(fs_inst *inst) const
428 {
429 return (opcode == inst->opcode &&
430 dst.equals(inst->dst) &&
431 src[0].equals(inst->src[0]) &&
432 src[1].equals(inst->src[1]) &&
433 src[2].equals(inst->src[2]) &&
434 saturate == inst->saturate &&
435 predicate == inst->predicate &&
436 conditional_mod == inst->conditional_mod &&
437 mlen == inst->mlen &&
438 base_mrf == inst->base_mrf &&
439 target == inst->target &&
440 eot == inst->eot &&
441 header_size == inst->header_size &&
442 shadow_compare == inst->shadow_compare &&
443 exec_size == inst->exec_size &&
444 offset == inst->offset);
445 }
446
447 bool
448 fs_inst::overwrites_reg(const fs_reg &reg) const
449 {
450 return reg.in_range(dst, regs_written);
451 }
452
453 bool
454 fs_inst::is_send_from_grf() const
455 {
456 switch (opcode) {
457 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
458 case SHADER_OPCODE_SHADER_TIME_ADD:
459 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
460 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
461 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
462 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
463 case SHADER_OPCODE_UNTYPED_ATOMIC:
464 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
465 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
466 case SHADER_OPCODE_TYPED_ATOMIC:
467 case SHADER_OPCODE_TYPED_SURFACE_READ:
468 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
469 case SHADER_OPCODE_URB_WRITE_SIMD8:
470 return true;
471 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
472 return src[1].file == GRF;
473 case FS_OPCODE_FB_WRITE:
474 return src[0].file == GRF;
475 default:
476 if (is_tex())
477 return src[0].file == GRF;
478
479 return false;
480 }
481 }
482
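/* Roughly: returns true if this LOAD_PAYLOAD simply copies one whole,
 * contiguous virtual GRF, i.e. each source is the next consecutive register
 * of src[0]'s VGRF and that VGRF's allocated size matches regs_written.
 */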
483 bool
484 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
485 {
486 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
487 return false;
488
489 fs_reg reg = this->src[0];
490 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
491 return false;
492
493 if (grf_alloc.sizes[reg.reg] != this->regs_written)
494 return false;
495
496 for (int i = 0; i < this->sources; i++) {
497 reg.type = this->src[i].type;
498 reg.width = this->src[i].width;
499 if (!this->src[i].equals(reg))
500 return false;
501 reg = ::offset(reg, 1);
502 }
503
504 return true;
505 }
506
507 bool
508 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
509 {
510 if (devinfo->gen == 6 && is_math())
511 return false;
512
513 if (is_send_from_grf())
514 return false;
515
516 if (!backend_instruction::can_do_source_mods())
517 return false;
518
519 return true;
520 }
521
522 bool
523 fs_inst::has_side_effects() const
524 {
525 return this->eot || backend_instruction::has_side_effects();
526 }
527
528 void
529 fs_reg::init()
530 {
531 memset(this, 0, sizeof(*this));
532 stride = 1;
533 }
534
535 /** Generic unset register constructor. */
536 fs_reg::fs_reg()
537 {
538 init();
539 this->file = BAD_FILE;
540 }
541
542 /** Immediate value constructor. */
543 fs_reg::fs_reg(float f)
544 {
545 init();
546 this->file = IMM;
547 this->type = BRW_REGISTER_TYPE_F;
548 this->fixed_hw_reg.dw1.f = f;
549 this->width = 1;
550 }
551
552 /** Immediate value constructor. */
553 fs_reg::fs_reg(int32_t i)
554 {
555 init();
556 this->file = IMM;
557 this->type = BRW_REGISTER_TYPE_D;
558 this->fixed_hw_reg.dw1.d = i;
559 this->width = 1;
560 }
561
562 /** Immediate value constructor. */
563 fs_reg::fs_reg(uint32_t u)
564 {
565 init();
566 this->file = IMM;
567 this->type = BRW_REGISTER_TYPE_UD;
568 this->fixed_hw_reg.dw1.ud = u;
569 this->width = 1;
570 }
571
572 /** Vector float immediate value constructor. */
573 fs_reg::fs_reg(uint8_t vf[4])
574 {
575 init();
576 this->file = IMM;
577 this->type = BRW_REGISTER_TYPE_VF;
578 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
588 (vf1 << 8) |
589 (vf2 << 16) |
590 (vf3 << 24);
591 }
592
593 /** Fixed brw_reg. */
594 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
595 {
596 init();
597 this->file = HW_REG;
598 this->fixed_hw_reg = fixed_hw_reg;
599 this->type = fixed_hw_reg.type;
600 this->width = 1 << fixed_hw_reg.width;
601 }
602
603 bool
604 fs_reg::equals(const fs_reg &r) const
605 {
606 return (file == r.file &&
607 reg == r.reg &&
608 reg_offset == r.reg_offset &&
609 subreg_offset == r.subreg_offset &&
610 type == r.type &&
611 negate == r.negate &&
612 abs == r.abs &&
613 !reladdr && !r.reladdr &&
614 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
615 width == r.width &&
616 stride == r.stride);
617 }
618
619 fs_reg &
620 fs_reg::set_smear(unsigned subreg)
621 {
622 assert(file != HW_REG && file != IMM);
623 subreg_offset = subreg * type_sz(type);
624 stride = 0;
625 return *this;
626 }
627
628 bool
629 fs_reg::is_contiguous() const
630 {
631 return stride == 1;
632 }
633
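/* Returns the number of scalar slots a GLSL type occupies in this backend,
 * e.g. 4 for a vec4 and 4 * 20 == 80 for "uniform vec4 a[20]"; samplers and
 * atomic counters occupy no slots.
 */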
634 int
635 fs_visitor::type_size(const struct glsl_type *type)
636 {
637 unsigned int size, i;
638
639 switch (type->base_type) {
640 case GLSL_TYPE_UINT:
641 case GLSL_TYPE_INT:
642 case GLSL_TYPE_FLOAT:
643 case GLSL_TYPE_BOOL:
644 return type->components();
645 case GLSL_TYPE_ARRAY:
646 return type_size(type->fields.array) * type->length;
647 case GLSL_TYPE_STRUCT:
648 size = 0;
649 for (i = 0; i < type->length; i++) {
650 size += type_size(type->fields.structure[i].type);
651 }
652 return size;
653 case GLSL_TYPE_SAMPLER:
654 /* Samplers take up no register space, since they're baked in at
655 * link time.
656 */
657 return 0;
658 case GLSL_TYPE_ATOMIC_UINT:
659 return 0;
660 case GLSL_TYPE_IMAGE:
661 case GLSL_TYPE_VOID:
662 case GLSL_TYPE_ERROR:
663 case GLSL_TYPE_INTERFACE:
664 case GLSL_TYPE_DOUBLE:
665 unreachable("not reached");
666 }
667
668 return 0;
669 }
670
671 /**
672 * Emit a MOV to read the timestamp register and return its destination.
673 *
674 * The MOV is emitted with force_writemask_all so every channel is written,
675 * and the result is smeared so that only the low dword is read back.
676 */
677 fs_reg
678 fs_visitor::get_timestamp(const fs_builder &bld)
679 {
680 assert(devinfo->gen >= 7);
681
682 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
683 BRW_ARF_TIMESTAMP,
684 0),
685 BRW_REGISTER_TYPE_UD));
686
687 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
688
689 /* We want to read the 3 fields we care about even if it's not enabled in
690 * the dispatch.
691 */
692 bld.exec_all().MOV(dst, ts);
693
694 /* The caller wants the low 32 bits of the timestamp. Since it's running
695 * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
696 * which is plenty of time for our purposes. It is identical across the
697 * EUs, but since it's tracking GPU core speed it will increment at a
698 * varying rate as render P-states change.
699 *
700 * The caller could also check if render P-states have changed (or anything
701 * else that might disrupt timing) by setting smear to 2 and checking if
702 * that field is != 0.
703 */
704 dst.set_smear(0);
705
706 return dst;
707 }
708
709 void
710 fs_visitor::emit_shader_time_begin()
711 {
712 shader_start_time = get_timestamp(bld.annotate("shader time start"));
713 }
714
715 void
716 fs_visitor::emit_shader_time_end()
717 {
718 enum shader_time_shader_type type, written_type, reset_type;
719 switch (stage) {
720 case MESA_SHADER_VERTEX:
721 type = ST_VS;
722 written_type = ST_VS_WRITTEN;
723 reset_type = ST_VS_RESET;
724 break;
725 case MESA_SHADER_GEOMETRY:
726 type = ST_GS;
727 written_type = ST_GS_WRITTEN;
728 reset_type = ST_GS_RESET;
729 break;
730 case MESA_SHADER_FRAGMENT:
731 if (dispatch_width == 8) {
732 type = ST_FS8;
733 written_type = ST_FS8_WRITTEN;
734 reset_type = ST_FS8_RESET;
735 } else {
736 assert(dispatch_width == 16);
737 type = ST_FS16;
738 written_type = ST_FS16_WRITTEN;
739 reset_type = ST_FS16_RESET;
740 }
741 break;
742 case MESA_SHADER_COMPUTE:
743 type = ST_CS;
744 written_type = ST_CS_WRITTEN;
745 reset_type = ST_CS_RESET;
746 break;
747 default:
748 unreachable("fs_visitor::emit_shader_time_end missing code");
749 }
750
751 /* Insert our code just before the final SEND with EOT. */
752 exec_node *end = this->instructions.get_tail();
753 assert(end && ((fs_inst *) end)->eot);
754 const fs_builder ibld = bld.annotate("shader time end")
755 .exec_all().at(NULL, end);
756
757 fs_reg shader_end_time = get_timestamp(ibld);
758
759 /* Check that there weren't any timestamp reset events (assuming these
760 * were the only two timestamp reads that happened).
761 */
762 fs_reg reset = shader_end_time;
763 reset.set_smear(2);
764 set_condmod(BRW_CONDITIONAL_Z,
765 ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
766 ibld.IF(BRW_PREDICATE_NORMAL);
767
768 fs_reg start = shader_start_time;
769 start.negate = true;
770 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
771 diff.set_smear(0);
772 ibld.ADD(diff, start, shader_end_time);
773
774 /* If there were no instructions between the two timestamp gets, the diff
775 * is 2 cycles. Remove that overhead, so I can forget about that when
776 * trying to determine the time taken for single instructions.
777 */
778 ibld.ADD(diff, diff, fs_reg(-2u));
779 SHADER_TIME_ADD(ibld, type, diff);
780 SHADER_TIME_ADD(ibld, written_type, fs_reg(1u));
781 ibld.emit(BRW_OPCODE_ELSE);
782 SHADER_TIME_ADD(ibld, reset_type, fs_reg(1u));
783 ibld.emit(BRW_OPCODE_ENDIF);
784 }
785
786 void
787 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
788 enum shader_time_shader_type type, fs_reg value)
789 {
790 int shader_time_index =
791 brw_get_shader_time_index(brw, shader_prog, prog, type);
792 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
793
794 fs_reg payload;
795 if (dispatch_width == 8)
796 payload = vgrf(glsl_type::uvec2_type);
797 else
798 payload = vgrf(glsl_type::uint_type);
799
800 bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
801 }
802
803 void
804 fs_visitor::vfail(const char *format, va_list va)
805 {
806 char *msg;
807
808 if (failed)
809 return;
810
811 failed = true;
812
813 msg = ralloc_vasprintf(mem_ctx, format, va);
814 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
815
816 this->fail_msg = msg;
817
818 if (debug_enabled) {
819 fprintf(stderr, "%s", msg);
820 }
821 }
822
823 void
824 fs_visitor::fail(const char *format, ...)
825 {
826 va_list va;
827
828 va_start(va, format);
829 vfail(format, va);
830 va_end(va);
831 }
832
833 /**
834 * Mark this program as impossible to compile in SIMD16 mode.
835 *
836 * During the SIMD8 compile (which happens first), we can detect and flag
837 * things that are unsupported in SIMD16 mode, so the compiler can skip
838 * the SIMD16 compile altogether.
839 *
840 * During a SIMD16 compile (if one happens anyway), this just calls fail().
841 */
842 void
843 fs_visitor::no16(const char *format, ...)
844 {
845 va_list va;
846
847 va_start(va, format);
848
849 if (dispatch_width == 16) {
850 vfail(format, va);
851 } else {
852 simd16_unsupported = true;
853
854 if (brw->perf_debug) {
855 if (no16_msg)
856 ralloc_vasprintf_append(&no16_msg, format, va);
857 else
858 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
859 }
860 }
861
862 va_end(va);
863 }
864
865 fs_inst *
866 fs_visitor::emit(enum opcode opcode)
867 {
868 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
869 }
870
871 fs_inst *
872 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
873 {
874 return emit(new(mem_ctx) fs_inst(opcode, dst));
875 }
876
877 fs_inst *
878 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
879 {
880 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
881 }
882
883 fs_inst *
884 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
885 const fs_reg &src1)
886 {
887 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
888 }
889
890 fs_inst *
891 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
892 const fs_reg &src1, const fs_reg &src2)
893 {
894 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
895 }
896
897 fs_inst *
898 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
899 fs_reg src[], int sources)
900 {
901 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
902 }
903
904 /**
905 * Returns true if the instruction has a flag that means it won't
906 * update an entire destination register.
907 *
908 * For example, dead code elimination and live variable analysis want to know
909 * when a write to a variable screens off any preceding values that were in
910 * it.
911 */
912 bool
913 fs_inst::is_partial_write() const
914 {
915 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
916 (this->dst.width * type_sz(this->dst.type)) < 32 ||
917 !this->dst.is_contiguous());
918 }
919
920 int
921 fs_inst::regs_read(int arg) const
922 {
923 if (is_tex() && arg == 0 && src[0].file == GRF) {
924 return mlen;
925 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
926 return mlen;
927 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
928 return mlen;
929 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
930 return mlen;
931 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
932 return mlen;
933 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
934 return mlen;
935 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
936 return mlen;
937 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
938 return mlen;
939 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
940 return mlen;
941 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
942 return mlen;
943 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
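      /* The barycentric deltas occupy one GRF per four channels. */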
944 return exec_size / 4;
945 }
946
947 switch (src[arg].file) {
948 case BAD_FILE:
949 case UNIFORM:
950 case IMM:
951 return 1;
952 case GRF:
953 case HW_REG:
954 if (src[arg].stride == 0) {
955 return 1;
956 } else {
957 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
958 return (size + 31) / 32;
959 }
960 case MRF:
961 unreachable("MRF registers are not allowed as sources");
962 default:
963 unreachable("Invalid register file");
964 }
965 }
966
967 bool
968 fs_inst::reads_flag() const
969 {
970 return predicate;
971 }
972
973 bool
974 fs_inst::writes_flag() const
975 {
976 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
977 opcode != BRW_OPCODE_IF &&
978 opcode != BRW_OPCODE_WHILE)) ||
979 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
980 }
981
982 /**
983 * Returns how many MRFs an FS opcode will write over.
984 *
985 * Note that this is not the 0 or 1 implied writes in an actual gen
986 * instruction -- the FS opcodes often generate MOVs in addition.
987 */
988 int
989 fs_visitor::implied_mrf_writes(fs_inst *inst)
990 {
991 if (inst->mlen == 0)
992 return 0;
993
994 if (inst->base_mrf == -1)
995 return 0;
996
997 switch (inst->opcode) {
998 case SHADER_OPCODE_RCP:
999 case SHADER_OPCODE_RSQ:
1000 case SHADER_OPCODE_SQRT:
1001 case SHADER_OPCODE_EXP2:
1002 case SHADER_OPCODE_LOG2:
1003 case SHADER_OPCODE_SIN:
1004 case SHADER_OPCODE_COS:
1005 return 1 * dispatch_width / 8;
1006 case SHADER_OPCODE_POW:
1007 case SHADER_OPCODE_INT_QUOTIENT:
1008 case SHADER_OPCODE_INT_REMAINDER:
1009 return 2 * dispatch_width / 8;
1010 case SHADER_OPCODE_TEX:
1011 case FS_OPCODE_TXB:
1012 case SHADER_OPCODE_TXD:
1013 case SHADER_OPCODE_TXF:
1014 case SHADER_OPCODE_TXF_CMS:
1015 case SHADER_OPCODE_TXF_MCS:
1016 case SHADER_OPCODE_TG4:
1017 case SHADER_OPCODE_TG4_OFFSET:
1018 case SHADER_OPCODE_TXL:
1019 case SHADER_OPCODE_TXS:
1020 case SHADER_OPCODE_LOD:
1021 return 1;
1022 case FS_OPCODE_FB_WRITE:
1023 return 2;
1024 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1025 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1026 return 1;
1027 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1028 return inst->mlen;
1029 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1030 return inst->mlen;
1031 case SHADER_OPCODE_UNTYPED_ATOMIC:
1032 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1033 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1034 case SHADER_OPCODE_TYPED_ATOMIC:
1035 case SHADER_OPCODE_TYPED_SURFACE_READ:
1036 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1037 case SHADER_OPCODE_URB_WRITE_SIMD8:
1038 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1039 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1040 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1041 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1042 return 0;
1043 default:
1044 unreachable("not reached");
1045 }
1046 }
1047
1048 fs_reg
1049 fs_visitor::vgrf(const glsl_type *const type)
1050 {
1051 int reg_width = dispatch_width / 8;
1052 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1053 brw_type_for_base_type(type), dispatch_width);
1054 }
1055
1056 fs_reg
1057 fs_visitor::vgrf(int num_components)
1058 {
1059 int reg_width = dispatch_width / 8;
1060 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1061 BRW_REGISTER_TYPE_F, dispatch_width);
1062 }
1063
1064 /** Fixed HW reg constructor. */
1065 fs_reg::fs_reg(enum register_file file, int reg)
1066 {
1067 init();
1068 this->file = file;
1069 this->reg = reg;
1070 this->type = BRW_REGISTER_TYPE_F;
1071
1072 switch (file) {
1073 case UNIFORM:
1074 this->width = 1;
1075 break;
1076 default:
1077 this->width = 8;
1078 }
1079 }
1080
1081 /** Fixed HW reg constructor. */
1082 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1083 {
1084 init();
1085 this->file = file;
1086 this->reg = reg;
1087 this->type = type;
1088
1089 switch (file) {
1090 case UNIFORM:
1091 this->width = 1;
1092 break;
1093 default:
1094 this->width = 8;
1095 }
1096 }
1097
1098 /** Fixed HW reg constructor. */
1099 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1100 uint8_t width)
1101 {
1102 init();
1103 this->file = file;
1104 this->reg = reg;
1105 this->type = type;
1106 this->width = width;
1107 }
1108
1109 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1110 * This brings in those uniform definitions.
1111 */
1112 void
1113 fs_visitor::import_uniforms(fs_visitor *v)
1114 {
1115 this->push_constant_loc = v->push_constant_loc;
1116 this->pull_constant_loc = v->pull_constant_loc;
1117 this->uniforms = v->uniforms;
1118 this->param_size = v->param_size;
1119 }
1120
1121 fs_reg *
1122 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1123 bool origin_upper_left)
1124 {
1125 assert(stage == MESA_SHADER_FRAGMENT);
1126 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1127 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1128 fs_reg wpos = *reg;
1129 bool flip = !origin_upper_left ^ key->render_to_fbo;
1130
1131 /* gl_FragCoord.x */
1132 if (pixel_center_integer) {
1133 bld.MOV(wpos, this->pixel_x);
1134 } else {
1135 bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
1136 }
1137 wpos = offset(wpos, 1);
1138
1139 /* gl_FragCoord.y */
1140 if (!flip && pixel_center_integer) {
1141 bld.MOV(wpos, this->pixel_y);
1142 } else {
1143 fs_reg pixel_y = this->pixel_y;
1144 float offset = (pixel_center_integer ? 0.0 : 0.5);
1145
1146 if (flip) {
1147 pixel_y.negate = true;
1148 offset += key->drawable_height - 1.0;
1149 }
1150
1151 bld.ADD(wpos, pixel_y, fs_reg(offset));
1152 }
1153 wpos = offset(wpos, 1);
1154
1155 /* gl_FragCoord.z */
1156 if (devinfo->gen >= 6) {
1157 bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
1158 } else {
1159 bld.emit(FS_OPCODE_LINTERP, wpos,
1160 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1161 interp_reg(VARYING_SLOT_POS, 2));
1162 }
1163 wpos = offset(wpos, 1);
1164
1165 /* gl_FragCoord.w: Already set up in emit_interpolation */
1166 bld.MOV(wpos, this->wpos_w);
1167
1168 return reg;
1169 }
1170
1171 fs_inst *
1172 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1173 glsl_interp_qualifier interpolation_mode,
1174 bool is_centroid, bool is_sample)
1175 {
1176 brw_wm_barycentric_interp_mode barycoord_mode;
1177 if (devinfo->gen >= 6) {
1178 if (is_centroid) {
1179 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1180 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1181 else
1182 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1183 } else if (is_sample) {
1184 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1185 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1186 else
1187 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1188 } else {
1189 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1190 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1191 else
1192 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1193 }
1194 } else {
1195 /* On Ironlake and below, there is only one interpolation mode.
1196 * Centroid interpolation doesn't mean anything on this hardware --
1197 * there is no multisampling.
1198 */
1199 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1200 }
1201 return bld.emit(FS_OPCODE_LINTERP, attr,
1202 this->delta_xy[barycoord_mode], interp);
1203 }
1204
1205 void
1206 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1207 const glsl_type *type,
1208 glsl_interp_qualifier interpolation_mode,
1209 int location, bool mod_centroid,
1210 bool mod_sample)
1211 {
1212 attr.type = brw_type_for_base_type(type->get_scalar_type());
1213
1214 assert(stage == MESA_SHADER_FRAGMENT);
1215 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1216 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1217
1218 unsigned int array_elements;
1219
1220 if (type->is_array()) {
1221 array_elements = type->length;
1222 if (array_elements == 0) {
1223 fail("dereferenced array '%s' has length 0\n", name);
1224 }
1225 type = type->fields.array;
1226 } else {
1227 array_elements = 1;
1228 }
1229
1230 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1231 bool is_gl_Color =
1232 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1233 if (key->flat_shade && is_gl_Color) {
1234 interpolation_mode = INTERP_QUALIFIER_FLAT;
1235 } else {
1236 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1237 }
1238 }
1239
1240 for (unsigned int i = 0; i < array_elements; i++) {
1241 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1242 if (prog_data->urb_setup[location] == -1) {
1243 /* If there's no incoming setup data for this slot, don't
1244 * emit interpolation for it.
1245 */
1246 attr = offset(attr, type->vector_elements);
1247 location++;
1248 continue;
1249 }
1250
1251 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1252 /* Constant interpolation (flat shading) case. The SF has
1253 * handed us defined values in only the constant offset
1254 * field of the setup reg.
1255 */
1256 for (unsigned int k = 0; k < type->vector_elements; k++) {
1257 struct brw_reg interp = interp_reg(location, k);
1258 interp = suboffset(interp, 3);
1259 interp.type = attr.type;
1260 bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1261 attr = offset(attr, 1);
1262 }
1263 } else {
1264 /* Smooth/noperspective interpolation case. */
1265 for (unsigned int k = 0; k < type->vector_elements; k++) {
1266 struct brw_reg interp = interp_reg(location, k);
1267 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1268 /* Get the pixel/sample mask into f0 so that we know
1269 * which pixels are lit. Then, for each channel that is
1270 * unlit, replace the centroid data with non-centroid
1271 * data.
1272 */
1273 bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1274
1275 fs_inst *inst;
1276 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1277 false, false);
1278 inst->predicate = BRW_PREDICATE_NORMAL;
1279 inst->predicate_inverse = true;
1280 if (devinfo->has_pln)
1281 inst->no_dd_clear = true;
1282
1283 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1284 mod_centroid && !key->persample_shading,
1285 mod_sample || key->persample_shading);
1286 inst->predicate = BRW_PREDICATE_NORMAL;
1287 inst->predicate_inverse = false;
1288 if (devinfo->has_pln)
1289 inst->no_dd_check = true;
1290
1291 } else {
1292 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1293 mod_centroid && !key->persample_shading,
1294 mod_sample || key->persample_shading);
1295 }
1296 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1297 bld.MUL(attr, attr, this->pixel_w);
1298 }
1299 attr = offset(attr, 1);
1300 }
1301
1302 }
1303 location++;
1304 }
1305 }
1306 }
1307
1308 fs_reg *
1309 fs_visitor::emit_frontfacing_interpolation()
1310 {
1311 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1312
1313 if (devinfo->gen >= 6) {
1314 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1315 * a boolean result from this (~0/true or 0/false).
1316 *
1317 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1318 * this task in only one instruction:
1319 * - a negation source modifier will flip the bit; and
1320 * - a W -> D type conversion will sign extend the bit into the high
1321 * word of the destination.
1322 *
1323 * An ASR 15 fills the low word of the destination.
1324 */
1325 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1326 g0.negate = true;
1327
1328 bld.ASR(*reg, g0, fs_reg(15));
1329 } else {
1330 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1331 * a boolean result from this (1/true or 0/false).
1332 *
1333 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1334 * the negation source modifier to flip it. Unfortunately the SHR
1335 * instruction only operates on UD (or D with an abs source modifier)
1336 * sources without negation.
1337 *
1338 * Instead, use ASR (which will give ~0/true or 0/false).
1339 */
1340 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1341 g1_6.negate = true;
1342
1343 bld.ASR(*reg, g1_6, fs_reg(31));
1344 }
1345
1346 return reg;
1347 }
1348
1349 void
1350 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1351 {
1352 assert(stage == MESA_SHADER_FRAGMENT);
1353 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1354 assert(dst.type == BRW_REGISTER_TYPE_F);
1355
1356 if (key->compute_pos_offset) {
1357 /* Convert int_sample_pos to floating point */
1358 emit(MOV(dst, int_sample_pos));
1359 /* Scale to the range [0, 1] */
1360 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1361 }
1362 else {
1363 /* From ARB_sample_shading specification:
1364 * "When rendering to a non-multisample buffer, or if multisample
1365 * rasterization is disabled, gl_SamplePosition will always be
1366 * (0.5, 0.5)."
1367 */
1368 emit(MOV(dst, fs_reg(0.5f)));
1369 }
1370 }
1371
1372 fs_reg *
1373 fs_visitor::emit_samplepos_setup()
1374 {
1375 assert(devinfo->gen >= 6);
1376
1377 this->current_annotation = "compute sample position";
1378 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1379 fs_reg pos = *reg;
1380 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1381 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1382
1383 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1384 * mode will be enabled.
1385 *
1386 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1387 * R31.1:0 Position Offset X/Y for Slot[3:0]
1388 * R31.3:2 Position Offset X/Y for Slot[7:4]
1389 * .....
1390 *
1391 * The X, Y sample positions come in as bytes in thread payload. So, read
1392 * the positions using vstride=16, width=8, hstride=2.
1393 */
1394 struct brw_reg sample_pos_reg =
1395 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1396 BRW_REGISTER_TYPE_B), 16, 8, 2);
1397
1398 if (dispatch_width == 8) {
1399 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1400 } else {
1401 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1402 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1403 ->force_sechalf = true;
1404 }
1405 /* Compute gl_SamplePosition.x */
1406 compute_sample_position(pos, int_sample_x);
1407 pos = offset(pos, 1);
1408 if (dispatch_width == 8) {
1409 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1410 } else {
1411 emit(MOV(half(int_sample_y, 0),
1412 fs_reg(suboffset(sample_pos_reg, 1))));
1413 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1414 ->force_sechalf = true;
1415 }
1416 /* Compute gl_SamplePosition.y */
1417 compute_sample_position(pos, int_sample_y);
1418 return reg;
1419 }
1420
1421 fs_reg *
1422 fs_visitor::emit_sampleid_setup()
1423 {
1424 assert(stage == MESA_SHADER_FRAGMENT);
1425 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1426 assert(devinfo->gen >= 6);
1427
1428 this->current_annotation = "compute sample id";
1429 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1430
1431 if (key->compute_sample_id) {
1432 fs_reg t1 = vgrf(glsl_type::int_type);
1433 fs_reg t2 = vgrf(glsl_type::int_type);
1434 t2.type = BRW_REGISTER_TYPE_UW;
1435
1436 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1437 * 8x multisampling, subspan 0 will represent sample N (where N
1438 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1439 * 7. We can find the value of N by looking at R0.0 bits 7:6
1440 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1441 * (since samples are always delivered in pairs). That is, we
1442 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1443 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1444 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1445 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1446 * populating a temporary variable with the sequence (0, 1, 2, 3),
1447 * and then reading from it using vstride=1, width=4, hstride=0.
1448 * These computations hold good for 4x multisampling as well.
1449 *
1450 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1451 * the first four slots are sample 0 of subspan 0; the next four
1452 * are sample 1 of subspan 0; the third group is sample 0 of
1453 * subspan 1, and finally sample 1 of subspan 1.
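    *
    * As a worked example, if SSPI (R0.0 bits 7:6) is 2, then
    * (R0.0 & 0xc0) >> 5 == 4, and the SIMD8 sample IDs come out as
    * (4, 4, 4, 4, 5, 5, 5, 5).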
1454 */
1455 fs_inst *inst;
1456 inst = emit(BRW_OPCODE_AND, t1,
1457 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1458 fs_reg(0xc0));
1459 inst->force_writemask_all = true;
1460 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1461 inst->force_writemask_all = true;
1462 /* This works for both SIMD8 and SIMD16 */
1463 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1464 inst->force_writemask_all = true;
1465 /* This special instruction takes care of setting vstride=1,
1466 * width=4, hstride=0 of t2 during an ADD instruction.
1467 */
1468 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1469 } else {
1470 /* As per GL_ARB_sample_shading specification:
1471 * "When rendering to a non-multisample buffer, or if multisample
1472 * rasterization is disabled, gl_SampleID will always be zero."
1473 */
1474 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1475 }
1476
1477 return reg;
1478 }
1479
1480 void
1481 fs_visitor::resolve_source_modifiers(fs_reg *src)
1482 {
1483 if (!src->abs && !src->negate)
1484 return;
1485
1486 fs_reg temp = retype(vgrf(1), src->type);
1487 emit(MOV(temp, *src));
1488 *src = temp;
1489 }
1490
1491 fs_reg
1492 fs_visitor::fix_math_operand(fs_reg src)
1493 {
1494 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1495 * might be able to do better by doing execsize = 1 math and then
1496 * expanding that result out, but we would need to be careful with
1497 * masking.
1498 *
1499 * The hardware ignores source modifiers (negate and abs) on math
1500 * instructions, so we also move to a temp to set those up.
1501 */
1502 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1503 !src.abs && !src.negate)
1504 return src;
1505
1506 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1507 * operands to math
1508 */
1509 if (devinfo->gen >= 7 && src.file != IMM)
1510 return src;
1511
1512 fs_reg expanded = vgrf(glsl_type::float_type);
1513 expanded.type = src.type;
1514 emit(BRW_OPCODE_MOV, expanded, src);
1515 return expanded;
1516 }
1517
1518 fs_inst *
1519 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1520 {
1521 switch (opcode) {
1522 case SHADER_OPCODE_RCP:
1523 case SHADER_OPCODE_RSQ:
1524 case SHADER_OPCODE_SQRT:
1525 case SHADER_OPCODE_EXP2:
1526 case SHADER_OPCODE_LOG2:
1527 case SHADER_OPCODE_SIN:
1528 case SHADER_OPCODE_COS:
1529 break;
1530 default:
1531 unreachable("not reached: bad math opcode");
1532 }
1533
1534 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1535 * might be able to do better by doing execsize = 1 math and then
1536 * expanding that result out, but we would need to be careful with
1537 * masking.
1538 *
1539 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1540 * instructions, so we also move to a temp to set those up.
1541 */
1542 if (devinfo->gen == 6 || devinfo->gen == 7)
1543 src = fix_math_operand(src);
1544
1545 fs_inst *inst = emit(opcode, dst, src);
1546
1547 if (devinfo->gen < 6) {
1548 inst->base_mrf = 2;
1549 inst->mlen = dispatch_width / 8;
1550 }
1551
1552 return inst;
1553 }
1554
1555 fs_inst *
1556 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1557 {
1558 int base_mrf = 2;
1559 fs_inst *inst;
1560
1561 if (devinfo->gen >= 8) {
1562 inst = emit(opcode, dst, src0, src1);
1563 } else if (devinfo->gen >= 6) {
1564 src0 = fix_math_operand(src0);
1565 src1 = fix_math_operand(src1);
1566
1567 inst = emit(opcode, dst, src0, src1);
1568 } else {
1569 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1570 * "Message Payload":
1571 *
1572 * "Operand0[7]. For the INT DIV functions, this operand is the
1573 * denominator."
1574 * ...
1575 * "Operand1[7]. For the INT DIV functions, this operand is the
1576 * numerator."
1577 */
1578 bool is_int_div = opcode != SHADER_OPCODE_POW;
1579 fs_reg &op0 = is_int_div ? src1 : src0;
1580 fs_reg &op1 = is_int_div ? src0 : src1;
1581
1582 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1583 inst = emit(opcode, dst, op0, reg_null_f);
1584
1585 inst->base_mrf = base_mrf;
1586 inst->mlen = 2 * dispatch_width / 8;
1587 }
1588 return inst;
1589 }
1590
1591 void
1592 fs_visitor::emit_discard_jump()
1593 {
1594 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1595
1596 /* For performance, after a discard, jump to the end of the
1597 * shader if all relevant channels have been discarded.
1598 */
1599 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1600 discard_jump->flag_subreg = 1;
1601
1602 discard_jump->predicate = (dispatch_width == 8)
1603 ? BRW_PREDICATE_ALIGN1_ANY8H
1604 : BRW_PREDICATE_ALIGN1_ANY16H;
1605 discard_jump->predicate_inverse = true;
1606 }
1607
1608 void
1609 fs_visitor::assign_curb_setup()
1610 {
1611 if (dispatch_width == 8) {
1612 prog_data->dispatch_grf_start_reg = payload.num_regs;
1613 } else {
1614 if (stage == MESA_SHADER_FRAGMENT) {
1615 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1616 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1617 } else if (stage == MESA_SHADER_COMPUTE) {
1618 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1619 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1620 } else {
1621 unreachable("Unsupported shader type!");
1622 }
1623 }
1624
1625 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1626
1627 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1628 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1629 for (unsigned int i = 0; i < inst->sources; i++) {
1630 if (inst->src[i].file == UNIFORM) {
1631 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1632 int constant_nr;
1633 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1634 constant_nr = push_constant_loc[uniform_nr];
1635 } else {
1636 /* Section 5.11 of the OpenGL 4.1 spec says:
1637 * "Out-of-bounds reads return undefined values, which include
1638 * values from other variables of the active program or zero."
1639 * Just return the first push constant.
1640 */
1641 constant_nr = 0;
1642 }
1643
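            /* E.g. push constant 11 lands at channel 11 % 8 == 3 of GRF
             * payload.num_regs + 11 / 8 == payload.num_regs + 1.
             */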
1644 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1645 constant_nr / 8,
1646 constant_nr % 8);
1647
1648 inst->src[i].file = HW_REG;
1649 inst->src[i].fixed_hw_reg = byte_offset(
1650 retype(brw_reg, inst->src[i].type),
1651 inst->src[i].subreg_offset);
1652 }
1653 }
1654 }
1655 }
1656
1657 void
1658 fs_visitor::calculate_urb_setup()
1659 {
1660 assert(stage == MESA_SHADER_FRAGMENT);
1661 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1662 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1663
1664 memset(prog_data->urb_setup, -1,
1665 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1666
1667 int urb_next = 0;
1668 /* Figure out where each of the incoming setup attributes lands. */
1669 if (devinfo->gen >= 6) {
1670 if (_mesa_bitcount_64(prog->InputsRead &
1671 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1672 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1673 * first 16 varying inputs, so we can put them wherever we want.
1674 * Just put them in order.
1675 *
1676 * This is useful because it means that (a) inputs not used by the
1677 * fragment shader won't take up valuable register space, and (b) we
1678 * won't have to recompile the fragment shader if it gets paired with
1679 * a different vertex (or geometry) shader.
1680 */
1681 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1682 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1683 BITFIELD64_BIT(i)) {
1684 prog_data->urb_setup[i] = urb_next++;
1685 }
1686 }
1687 } else {
1688 /* We have enough input varyings that the SF/SBE pipeline stage can't
1689 * arbitrarily rearrange them to suit our whim; we have to put them
1690 * in an order that matches the output of the previous pipeline stage
1691 * (geometry or vertex shader).
1692 */
1693 struct brw_vue_map prev_stage_vue_map;
1694 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1695 key->input_slots_valid);
1696 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1697 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1698 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1699 slot++) {
1700 int varying = prev_stage_vue_map.slot_to_varying[slot];
1701 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1702 * unused.
1703 */
1704 if (varying != BRW_VARYING_SLOT_COUNT &&
1705 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1706 BITFIELD64_BIT(varying))) {
1707 prog_data->urb_setup[varying] = slot - first_slot;
1708 }
1709 }
1710 urb_next = prev_stage_vue_map.num_slots - first_slot;
1711 }
1712 } else {
1713 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1714 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1715 /* Point size is packed into the header, not as a general attribute */
1716 if (i == VARYING_SLOT_PSIZ)
1717 continue;
1718
1719 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1720 /* The back color slot is skipped when the front color is
1721 * also written to. In addition, some slots can be
1722 * written in the vertex shader and not read in the
1723 * fragment shader. So the register number must always be
1724 * incremented, mapped or not.
1725 */
1726 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1727 prog_data->urb_setup[i] = urb_next;
1728 urb_next++;
1729 }
1730 }
1731
1732 /*
1733 * It's an FS-only attribute, and we did interpolation for this attribute
1734 * in the SF thread. So, count it here, too.
1735 *
1736 * See compile_sf_prog() for more info.
1737 */
1738 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1739 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1740 }
1741
1742 prog_data->num_varying_inputs = urb_next;
1743 }
1744
1745 void
1746 fs_visitor::assign_urb_setup()
1747 {
1748 assert(stage == MESA_SHADER_FRAGMENT);
1749 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1750
1751 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1752
1753 /* Offset all the urb_setup[] indices by the actual position of the
1754 * setup regs, now that the location of the constants has been chosen.
1755 */
1756 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1757 if (inst->opcode == FS_OPCODE_LINTERP) {
1758 assert(inst->src[1].file == HW_REG);
1759 inst->src[1].fixed_hw_reg.nr += urb_start;
1760 }
1761
1762 if (inst->opcode == FS_OPCODE_CINTERP) {
1763 assert(inst->src[0].file == HW_REG);
1764 inst->src[0].fixed_hw_reg.nr += urb_start;
1765 }
1766 }
1767
1768 /* Each attribute is 4 setup channels, each of which is half a reg. */
1769 this->first_non_payload_grf =
1770 urb_start + prog_data->num_varying_inputs * 2;
1771 }
1772
1773 void
1774 fs_visitor::assign_vs_urb_setup()
1775 {
1776 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1777 int grf, count, slot, channel, attr;
1778
1779 assert(stage == MESA_SHADER_VERTEX);
1780 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1781 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1782 count++;
1783
1784 /* Each attribute is 4 regs. */
1785 this->first_non_payload_grf =
1786 payload.num_regs + prog_data->curb_read_length + count * 4;
1787
1788 unsigned vue_entries =
1789 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1790
1791 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1792 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1793
1794 assert(vs_prog_data->base.urb_read_length <= 15);
1795
1796 /* Rewrite all ATTR file references to the hw grf that they land in. */
1797 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1798 for (int i = 0; i < inst->sources; i++) {
1799 if (inst->src[i].file == ATTR) {
1800
1801 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1802 slot = count - 1;
1803 } else {
1804 /* Attributes come in a contiguous block, ordered by their
1805 * gl_vert_attrib value. That means we can compute the slot
1806 * number for an attribute by counting the enabled attributes
1807 * that precede it in inputs_read.
1808 */
1809 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1810 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1811 BITFIELD64_MASK(attr));
1812 }
1813
1814 channel = inst->src[i].reg_offset & 3;
1815
1816 grf = payload.num_regs +
1817 prog_data->curb_read_length +
1818 slot * 4 + channel;
1819
1820 inst->src[i].file = HW_REG;
1821 inst->src[i].fixed_hw_reg =
1822 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1823 }
1824 }
1825 }
1826 }
1827
1828 /**
1829 * Split large virtual GRFs into separate components if we can.
1830 *
1831 * This mostly duplicates what brw_fs_vector_splitting does, but that
1832 * pass is really conservative because it's afraid of doing
1833 * splitting that doesn't result in real progress after the rest of
1834 * the optimization phases, which would cause infinite looping in
1835 * optimization. We can do it once here, safely. This also has the
1836 * opportunity to split interpolated values, or maybe even uniforms,
1837 * which we don't have at the IR level.
1838 *
1839 * We want to split, because virtual GRFs are what we register
1840 * allocate and spill (due to contiguousness requirements for some
1841 * instructions), and they're what we naturally generate in the
1842 * codegen process, but most virtual GRFs don't actually need to be
1843 * contiguous sets of GRFs. If we split, we'll end up with reduced
1844 * live intervals and better dead code elimination and coalescing.
1845 */
1846 void
1847 fs_visitor::split_virtual_grfs()
1848 {
1849 int num_vars = this->alloc.count;
1850
1851 /* Count the total number of registers */
1852 int reg_count = 0;
1853 int vgrf_to_reg[num_vars];
1854 for (int i = 0; i < num_vars; i++) {
1855 vgrf_to_reg[i] = reg_count;
1856 reg_count += alloc.sizes[i];
1857 }
1858
1859 /* An array of "split points". For each register slot, this indicates
1860 * if this slot can be separated from the previous slot. Every time an
1861 * instruction uses multiple elements of a register (as a source or
1862 * destination), we mark the used slots as inseparable. Then we go
1863 * through and split the registers into the smallest pieces we can.
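    *
    * For example, a size-4 VGRF that is only ever accessed one register at a
    * time keeps all three interior split points and is split into four
    * size-1 VGRFs.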
1864 */
1865 bool split_points[reg_count];
1866 memset(split_points, 0, sizeof(split_points));
1867
1868 /* Mark all used registers as fully splittable */
1869 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1870 if (inst->dst.file == GRF) {
1871 int reg = vgrf_to_reg[inst->dst.reg];
1872 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1873 split_points[reg + j] = true;
1874 }
1875
1876 for (int i = 0; i < inst->sources; i++) {
1877 if (inst->src[i].file == GRF) {
1878 int reg = vgrf_to_reg[inst->src[i].reg];
1879 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1880 split_points[reg + j] = true;
1881 }
1882 }
1883 }
1884
1885 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1886 if (inst->dst.file == GRF) {
1887 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1888 for (int j = 1; j < inst->regs_written; j++)
1889 split_points[reg + j] = false;
1890 }
1891 for (int i = 0; i < inst->sources; i++) {
1892 if (inst->src[i].file == GRF) {
1893 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1894 for (int j = 1; j < inst->regs_read(i); j++)
1895 split_points[reg + j] = false;
1896 }
1897 }
1898 }
1899
1900 int new_virtual_grf[reg_count];
1901 int new_reg_offset[reg_count];
1902
1903 int reg = 0;
1904 for (int i = 0; i < num_vars; i++) {
1905 /* The first one should always be 0 as a quick sanity check. */
1906 assert(split_points[reg] == false);
1907
1908 /* j = 0 case */
1909 new_reg_offset[reg] = 0;
1910 reg++;
1911 int offset = 1;
1912
1913 /* j > 0 case */
1914 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1915 /* If this is a split point, reset the offset to 0 and allocate a
1916 * new virtual GRF for the previous offset many registers
1917 */
1918 if (split_points[reg]) {
1919 assert(offset <= MAX_VGRF_SIZE);
1920 int grf = alloc.allocate(offset);
1921 for (int k = reg - offset; k < reg; k++)
1922 new_virtual_grf[k] = grf;
1923 offset = 0;
1924 }
1925 new_reg_offset[reg] = offset;
1926 offset++;
1927 reg++;
1928 }
1929
1930 /* The last one gets the original register number */
1931 assert(offset <= MAX_VGRF_SIZE);
1932 alloc.sizes[i] = offset;
1933 for (int k = reg - offset; k < reg; k++)
1934 new_virtual_grf[k] = i;
1935 }
1936 assert(reg == reg_count);
1937
1938 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1939 if (inst->dst.file == GRF) {
1940 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1941 inst->dst.reg = new_virtual_grf[reg];
1942 inst->dst.reg_offset = new_reg_offset[reg];
1943 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1944 }
1945 for (int i = 0; i < inst->sources; i++) {
1946 if (inst->src[i].file == GRF) {
1947 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1948 inst->src[i].reg = new_virtual_grf[reg];
1949 inst->src[i].reg_offset = new_reg_offset[reg];
1950 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1951 }
1952 }
1953 }
1954 invalidate_live_intervals();
1955 }
1956
1957 /**
1958 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1959 *
1960 * During code generation, we create tons of temporary variables, many of
1961 * which get immediately killed and are never used again. Yet, in later
1962 * optimization and analysis passes, such as compute_live_intervals, we need
1963 * to loop over all the virtual GRFs. Compacting them can save a lot of
1964 * overhead.
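 *
 * Illustrative sketch: with VGRFs {0, 1, 2, 3} where only 0 and 2 are still
 * referenced, remap_table ends up as {0, -1, 1, -1} and alloc.count drops
 * from 4 to 2.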
1965 */
1966 bool
1967 fs_visitor::compact_virtual_grfs()
1968 {
1969 bool progress = false;
1970 int remap_table[this->alloc.count];
1971 memset(remap_table, -1, sizeof(remap_table));
1972
1973 /* Mark which virtual GRFs are used. */
1974 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1975 if (inst->dst.file == GRF)
1976 remap_table[inst->dst.reg] = 0;
1977
1978 for (int i = 0; i < inst->sources; i++) {
1979 if (inst->src[i].file == GRF)
1980 remap_table[inst->src[i].reg] = 0;
1981 }
1982 }
1983
1984 /* Compact the GRF arrays. */
1985 int new_index = 0;
1986 for (unsigned i = 0; i < this->alloc.count; i++) {
1987 if (remap_table[i] == -1) {
1988 /* We just found an unused register. This means that we are
1989 * actually going to compact something.
1990 */
1991 progress = true;
1992 } else {
1993 remap_table[i] = new_index;
1994 alloc.sizes[new_index] = alloc.sizes[i];
1995 invalidate_live_intervals();
1996 ++new_index;
1997 }
1998 }
1999
2000 this->alloc.count = new_index;
2001
2002 /* Patch all the instructions to use the newly renumbered registers */
2003 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2004 if (inst->dst.file == GRF)
2005 inst->dst.reg = remap_table[inst->dst.reg];
2006
2007 for (int i = 0; i < inst->sources; i++) {
2008 if (inst->src[i].file == GRF)
2009 inst->src[i].reg = remap_table[inst->src[i].reg];
2010 }
2011 }
2012
2013 /* Patch all the references to delta_xy, since they're used in register
2014 * allocation. If they're unused, switch them to BAD_FILE so we don't
2015 * think some random VGRF is delta_xy.
2016 */
2017 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2018 if (delta_xy[i].file == GRF) {
2019 if (remap_table[delta_xy[i].reg] != -1) {
2020 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2021 } else {
2022 delta_xy[i].file = BAD_FILE;
2023 }
2024 }
2025 }
2026
2027 return progress;
2028 }
2029
2030 /*
2031 * Implements array access of uniforms by inserting a
2032 * PULL_CONSTANT_LOAD instruction.
2033 *
2034 * Unlike temporary GRF array access (which we don't support, due to
2035 * the difficulty of doing relative addressing on instruction
2036 * destinations), we could potentially do array access of uniforms
2037 * that were loaded in GRF space as push constants. In real-world
2038 * usage we've seen, though, the arrays being used are always larger
2039 * than we could load as push constants, so just always move all
2040 * uniform array access out to a pull constant buffer.
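 *
 * For example (illustrative), indexing "uniform float kernel[64]" with a
 * non-constant subscript appends every element of the array to pull_param[]
 * here; the access itself is later rewritten into a
 * VARYING_PULL_CONSTANT_LOAD by demote_pull_constants().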
2041 */
2042 void
2043 fs_visitor::move_uniform_array_access_to_pull_constants()
2044 {
2045 if (dispatch_width != 8)
2046 return;
2047
2048 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2049 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2050
2051 /* Walk through and find array access of uniforms. Put a copy of that
2052 * uniform in the pull constant buffer.
2053 *
2054 * Note that we don't move constant-indexed accesses to arrays. No
2055 * testing has been done of the performance impact of this choice.
2056 */
2057 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2058 for (int i = 0 ; i < inst->sources; i++) {
2059 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2060 continue;
2061
2062 int uniform = inst->src[i].reg;
2063
2064 /* If this array isn't already present in the pull constant buffer,
2065 * add it.
2066 */
2067 if (pull_constant_loc[uniform] == -1) {
2068 const gl_constant_value **values = &stage_prog_data->param[uniform];
2069
2070 assert(param_size[uniform]);
2071
2072 for (int j = 0; j < param_size[uniform]; j++) {
2073 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2074
2075 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2076 values[j];
2077 }
2078 }
2079 }
2080 }
2081 }
2082
2083 /**
2084 * Assign UNIFORM file registers to either push constants or pull constants.
2085 *
2086 * We allow a fragment shader to have more than the spec's minimum
2087 * required maximum number of fragment shader uniform components (64).
2088 * If there are too many of these, they'd fill up all of the register space.
2089 * So, this will push some of them out to the pull constant buffer and
2090 * update the program to load them.
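 *
 * Rough example: with the 16-register (128-component) push budget below, a
 * shader with 200 live uniform components keeps the first 128 as push
 * constants and demotes the remaining 72 to the pull constant buffer.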
2091 */
2092 void
2093 fs_visitor::assign_constant_locations()
2094 {
2095 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2096 if (dispatch_width != 8)
2097 return;
2098
2099 /* Find which UNIFORM registers are still in use. */
2100 bool is_live[uniforms];
2101 for (unsigned int i = 0; i < uniforms; i++) {
2102 is_live[i] = false;
2103 }
2104
2105 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2106 for (int i = 0; i < inst->sources; i++) {
2107 if (inst->src[i].file != UNIFORM)
2108 continue;
2109
2110 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2111 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2112 is_live[constant_nr] = true;
2113 }
2114 }
2115
2116 /* Only allow 16 registers (128 uniform components) as push constants.
2117 *
2118 * Just demote the end of the list. We could probably do better
2119 * here, demoting things that are rarely used in the program first.
2120 *
2121 * If changing this value, note the limitation about total_regs in
2122 * brw_curbe.c.
2123 */
2124 unsigned int max_push_components = 16 * 8;
2125 unsigned int num_push_constants = 0;
2126
2127 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2128
2129 for (unsigned int i = 0; i < uniforms; i++) {
2130 if (!is_live[i] || pull_constant_loc[i] != -1) {
2131 /* This UNIFORM register is either dead, or has already been demoted
2132 * to a pull const. Mark it as no longer living in the param[] array.
2133 */
2134 push_constant_loc[i] = -1;
2135 continue;
2136 }
2137
2138 if (num_push_constants < max_push_components) {
2139 /* Retain as a push constant. Record the location in the params[]
2140 * array.
2141 */
2142 push_constant_loc[i] = num_push_constants++;
2143 } else {
2144 /* Demote to a pull constant. */
2145 push_constant_loc[i] = -1;
2146
2147 int pull_index = stage_prog_data->nr_pull_params++;
2148 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2149 pull_constant_loc[i] = pull_index;
2150 }
2151 }
2152
2153 stage_prog_data->nr_params = num_push_constants;
2154
2155 /* Up until now, the param[] array has been indexed by reg + reg_offset
2156 * of UNIFORM registers. Condense it to only contain the uniforms we
2157 * chose to upload as push constants.
2158 */
2159 for (unsigned int i = 0; i < uniforms; i++) {
2160 int remapped = push_constant_loc[i];
2161
2162 if (remapped == -1)
2163 continue;
2164
2165 assert(remapped <= (int)i);
2166 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2167 }
2168 }
2169
2170 /**
2171 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2172 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
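 *
 * Constant-indexed accesses load the 16-byte-aligned block containing the
 * value and then smear the right component out of it; reladdr accesses are
 * turned into VARYING_PULL_CONSTANT_LOADs instead.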
2173 */
2174 void
2175 fs_visitor::demote_pull_constants()
2176 {
2177 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2178 for (int i = 0; i < inst->sources; i++) {
2179 if (inst->src[i].file != UNIFORM)
2180 continue;
2181
2182 int pull_index;
2183 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2184 if (location >= uniforms) /* Out of bounds access */
2185 pull_index = -1;
2186 else
2187 pull_index = pull_constant_loc[location];
2188
2189 if (pull_index == -1)
2190 continue;
2191
2192 /* Set up the annotation tracking for newly generated instructions. */
2193 const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
2194 .at(block, inst);
2195 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2196 fs_reg dst = vgrf(glsl_type::float_type);
2197
2198 /* Generate a pull load into dst. */
2199 if (inst->src[i].reladdr) {
2200 VARYING_PULL_CONSTANT_LOAD(ibld, dst,
2201 surf_index,
2202 *inst->src[i].reladdr,
2203 pull_index);
2204 inst->src[i].reladdr = NULL;
2205 } else {
2206 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2207 ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
2208 dst, surf_index, offset);
2209 inst->src[i].set_smear(pull_index & 3);
2210 }
2211
2212 /* Rewrite the instruction to use the temporary VGRF. */
2213 inst->src[i].file = GRF;
2214 inst->src[i].reg = dst.reg;
2215 inst->src[i].reg_offset = 0;
2216 inst->src[i].width = dispatch_width;
2217 }
2218 }
2219 invalidate_live_intervals();
2220 }
2221
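/**
 * Local algebraic simplifications: fold immediate operands and strength-
 * reduce patterns such as "a * 1.0", "a + 0.0", MADs whose operands are
 * 0.0 or 1.0, and rcp(sqrt(x)) -> rsq(x) into cheaper instructions.
 */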
2222 bool
2223 fs_visitor::opt_algebraic()
2224 {
2225 bool progress = false;
2226
2227 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2228 switch (inst->opcode) {
2229 case BRW_OPCODE_MOV:
2230 if (inst->src[0].file != IMM)
2231 break;
2232
2233 if (inst->saturate) {
2234 if (inst->dst.type != inst->src[0].type)
2235 assert(!"unimplemented: saturate mixed types");
2236
2237 if (brw_saturate_immediate(inst->dst.type,
2238 &inst->src[0].fixed_hw_reg)) {
2239 inst->saturate = false;
2240 progress = true;
2241 }
2242 }
2243 break;
2244
2245 case BRW_OPCODE_MUL:
2246 if (inst->src[1].file != IMM)
2247 continue;
2248
2249 /* a * 1.0 = a */
2250 if (inst->src[1].is_one()) {
2251 inst->opcode = BRW_OPCODE_MOV;
2252 inst->src[1] = reg_undef;
2253 progress = true;
2254 break;
2255 }
2256
2257 /* a * -1.0 = -a */
2258 if (inst->src[1].is_negative_one()) {
2259 inst->opcode = BRW_OPCODE_MOV;
2260 inst->src[0].negate = !inst->src[0].negate;
2261 inst->src[1] = reg_undef;
2262 progress = true;
2263 break;
2264 }
2265
2266 /* a * 0.0 = 0.0 */
2267 if (inst->src[1].is_zero()) {
2268 inst->opcode = BRW_OPCODE_MOV;
2269 inst->src[0] = inst->src[1];
2270 inst->src[1] = reg_undef;
2271 progress = true;
2272 break;
2273 }
2274
2275 if (inst->src[0].file == IMM) {
2276 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2277 inst->opcode = BRW_OPCODE_MOV;
2278 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2279 inst->src[1] = reg_undef;
2280 progress = true;
2281 break;
2282 }
2283 break;
2284 case BRW_OPCODE_ADD:
2285 if (inst->src[1].file != IMM)
2286 continue;
2287
2288 /* a + 0.0 = a */
2289 if (inst->src[1].is_zero()) {
2290 inst->opcode = BRW_OPCODE_MOV;
2291 inst->src[1] = reg_undef;
2292 progress = true;
2293 break;
2294 }
2295
2296 if (inst->src[0].file == IMM) {
2297 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2298 inst->opcode = BRW_OPCODE_MOV;
2299 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2300 inst->src[1] = reg_undef;
2301 progress = true;
2302 break;
2303 }
2304 break;
2305 case BRW_OPCODE_OR:
2306 if (inst->src[0].equals(inst->src[1])) {
2307 inst->opcode = BRW_OPCODE_MOV;
2308 inst->src[1] = reg_undef;
2309 progress = true;
2310 break;
2311 }
2312 break;
2313 case BRW_OPCODE_LRP:
2314 if (inst->src[1].equals(inst->src[2])) {
2315 inst->opcode = BRW_OPCODE_MOV;
2316 inst->src[0] = inst->src[1];
2317 inst->src[1] = reg_undef;
2318 inst->src[2] = reg_undef;
2319 progress = true;
2320 break;
2321 }
2322 break;
2323 case BRW_OPCODE_CMP:
2324 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2325 inst->src[0].abs &&
2326 inst->src[0].negate &&
2327 inst->src[1].is_zero()) {
2328 inst->src[0].abs = false;
2329 inst->src[0].negate = false;
2330 inst->conditional_mod = BRW_CONDITIONAL_Z;
2331 progress = true;
2332 break;
2333 }
2334 break;
2335 case BRW_OPCODE_SEL:
2336 if (inst->src[0].equals(inst->src[1])) {
2337 inst->opcode = BRW_OPCODE_MOV;
2338 inst->src[1] = reg_undef;
2339 inst->predicate = BRW_PREDICATE_NONE;
2340 inst->predicate_inverse = false;
2341 progress = true;
2342 } else if (inst->saturate && inst->src[1].file == IMM) {
2343 switch (inst->conditional_mod) {
2344 case BRW_CONDITIONAL_LE:
2345 case BRW_CONDITIONAL_L:
2346 switch (inst->src[1].type) {
2347 case BRW_REGISTER_TYPE_F:
2348 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2349 inst->opcode = BRW_OPCODE_MOV;
2350 inst->src[1] = reg_undef;
2351 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2352 progress = true;
2353 }
2354 break;
2355 default:
2356 break;
2357 }
2358 break;
2359 case BRW_CONDITIONAL_GE:
2360 case BRW_CONDITIONAL_G:
2361 switch (inst->src[1].type) {
2362 case BRW_REGISTER_TYPE_F:
2363 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2364 inst->opcode = BRW_OPCODE_MOV;
2365 inst->src[1] = reg_undef;
2366 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2367 progress = true;
2368 }
2369 break;
2370 default:
2371 break;
2372 }
2373 default:
2374 break;
2375 }
2376 }
2377 break;
2378 case BRW_OPCODE_MAD:
2379 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2380 inst->opcode = BRW_OPCODE_MOV;
2381 inst->src[1] = reg_undef;
2382 inst->src[2] = reg_undef;
2383 progress = true;
2384 } else if (inst->src[0].is_zero()) {
2385 inst->opcode = BRW_OPCODE_MUL;
2386 inst->src[0] = inst->src[2];
2387 inst->src[2] = reg_undef;
2388 progress = true;
2389 } else if (inst->src[1].is_one()) {
2390 inst->opcode = BRW_OPCODE_ADD;
2391 inst->src[1] = inst->src[2];
2392 inst->src[2] = reg_undef;
2393 progress = true;
2394 } else if (inst->src[2].is_one()) {
2395 inst->opcode = BRW_OPCODE_ADD;
2396 inst->src[2] = reg_undef;
2397 progress = true;
2398 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2399 inst->opcode = BRW_OPCODE_ADD;
2400 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2401 inst->src[2] = reg_undef;
2402 progress = true;
2403 }
2404 break;
2405 case SHADER_OPCODE_RCP: {
2406 fs_inst *prev = (fs_inst *)inst->prev;
2407 if (prev->opcode == SHADER_OPCODE_SQRT) {
2408 if (inst->src[0].equals(prev->dst)) {
2409 inst->opcode = SHADER_OPCODE_RSQ;
2410 inst->src[0] = prev->src[0];
2411 progress = true;
2412 }
2413 }
2414 break;
2415 }
2416 case SHADER_OPCODE_BROADCAST:
2417 if (is_uniform(inst->src[0])) {
2418 inst->opcode = BRW_OPCODE_MOV;
2419 inst->sources = 1;
2420 inst->force_writemask_all = true;
2421 progress = true;
2422 } else if (inst->src[1].file == IMM) {
2423 inst->opcode = BRW_OPCODE_MOV;
2424 inst->src[0] = component(inst->src[0],
2425 inst->src[1].fixed_hw_reg.dw1.ud);
2426 inst->sources = 1;
2427 inst->force_writemask_all = true;
2428 progress = true;
2429 }
2430 break;
2431
2432 default:
2433 break;
2434 }
2435
2436 /* Swap if src[0] is immediate. */
2437 if (progress && inst->is_commutative()) {
2438 if (inst->src[0].file == IMM) {
2439 fs_reg tmp = inst->src[1];
2440 inst->src[1] = inst->src[0];
2441 inst->src[0] = tmp;
2442 }
2443 }
2444 }
2445 return progress;
2446 }
2447
2448 /**
2449 * Optimize sample messages that have constant zero values for the trailing
2450 * texture coordinates. We can just reduce the message length for these
2451 * instructions instead of reserving a register for it. Trailing parameters
2452 * that aren't sent default to zero anyway. This will cause the dead code
2453 * eliminator to remove the MOV instruction that would otherwise be emitted to
2454 * set up the zero value.
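 *
 * E.g. (illustrative) a sample message whose last coordinate registers hold
 * the constant 0.0 can have those registers dropped from the payload, with
 * mlen reduced to match.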
2455 */
2456 bool
2457 fs_visitor::opt_zero_samples()
2458 {
2459 /* Gen4 infers the texturing opcode based on the message length so we can't
2460 * change it.
2461 */
2462 if (devinfo->gen < 5)
2463 return false;
2464
2465 bool progress = false;
2466
2467 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2468 if (!inst->is_tex())
2469 continue;
2470
2471 fs_inst *load_payload = (fs_inst *) inst->prev;
2472
2473 if (load_payload->is_head_sentinel() ||
2474 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2475 continue;
2476
2477 /* We don't want to remove the message header or the first parameter.
2478 * Removing the first parameter is not allowed, see the Haswell PRM
2479 * volume 7, page 149:
2480 *
2481 * "Parameter 0 is required except for the sampleinfo message, which
2482 * has no parameter 0"
2483 */
2484 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2485 load_payload->src[(inst->mlen - inst->header_size) /
2486 (dispatch_width / 8) +
2487 inst->header_size - 1].is_zero()) {
2488 inst->mlen -= dispatch_width / 8;
2489 progress = true;
2490 }
2491 }
2492
2493 if (progress)
2494 invalidate_live_intervals();
2495
2496 return progress;
2497 }
2498
2499 /**
2500 * Optimize sample messages which are followed by the final RT write.
2501 *
2502 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2503 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2504 * final texturing results copied to the framebuffer write payload and modify
2505 * them to write to the framebuffer directly.
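 *
 * Roughly: "texture into tmp; fb_write tmp" becomes a single texturing SEND
 * with EOT set and the render target number folded into the message offset.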
2506 */
2507 bool
2508 fs_visitor::opt_sampler_eot()
2509 {
2510 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2511
2512 if (stage != MESA_SHADER_FRAGMENT)
2513 return false;
2514
2515 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2516 return false;
2517
2518 /* FINISHME: It should be possible to implement this optimization when there
2519 * are multiple drawbuffers.
2520 */
2521 if (key->nr_color_regions != 1)
2522 return false;
2523
2524 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2525 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2526 assert(fb_write->eot);
2527 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2528
2529 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2530
2531 /* There wasn't one; nothing to do. */
2532 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2533 return false;
2534
2535 /* This optimisation doesn't seem to work for textureGather for some
2536 * reason. I can't find any documentation or known workarounds to indicate
2537 * that this is expected, but considering that it is probably pretty
2538 * unlikely that a shader would directly write out the results from
2539 * textureGather we might as well just disable it.
2540 */
2541 if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2542 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2543 return false;
2544
2545 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2546 * It's very likely to be the previous instruction.
2547 */
2548 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2549 if (load_payload->is_head_sentinel() ||
2550 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2551 return false;
2552
2553 assert(!tex_inst->eot); /* We can't get here twice */
2554 assert((tex_inst->offset & (0xff << 24)) == 0);
2555
2556 tex_inst->offset |= fb_write->target << 24;
2557 tex_inst->eot = true;
2558 tex_inst->dst = bld.null_reg_ud();
2559 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2560
2561 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2562 * to create a new LOAD_PAYLOAD command with the same sources and a space
2563 * saved for the header. Using a new destination register not only makes sure
2564 * we have enough space, but it will make sure the dead code eliminator kills
2565 * the instruction that this will replace.
2566 */
2567 if (tex_inst->header_size != 0)
2568 return true;
2569
2570 fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
2571 load_payload->sources + 1);
2572 fs_reg *new_sources =
2573 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2574
2575 new_sources[0] = fs_reg();
2576 for (int i = 0; i < load_payload->sources; i++)
2577 new_sources[i+1] = load_payload->src[i];
2578
2579 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2580 * requires a lot of information about the sources to appropriately figure
2581 * out the number of registers needed to be used. Given this stage in our
2582 * optimization, we may not have the appropriate GRFs required by
2583 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2584 * manually emit the instruction.
2585 */
2586 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2587 load_payload->exec_size,
2588 send_header,
2589 new_sources,
2590 load_payload->sources + 1);
2591
2592 new_load_payload->regs_written = load_payload->regs_written + 1;
2593 new_load_payload->header_size = 1;
2594 tex_inst->mlen++;
2595 tex_inst->header_size = 1;
2596 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2597 tex_inst->src[0] = send_header;
2598
2599 return true;
2600 }
2601
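/**
 * Give each complete, unconditional redefinition of a VGRF outside of
 * control flow a fresh register number, so that the old and new values get
 * separate live intervals (a lightweight form of register renaming).
 */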
2602 bool
2603 fs_visitor::opt_register_renaming()
2604 {
2605 bool progress = false;
2606 int depth = 0;
2607
2608 int remap[alloc.count];
2609 memset(remap, -1, sizeof(int) * alloc.count);
2610
2611 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2612 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2613 depth++;
2614 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2615 inst->opcode == BRW_OPCODE_WHILE) {
2616 depth--;
2617 }
2618
2619 /* Rewrite instruction sources. */
2620 for (int i = 0; i < inst->sources; i++) {
2621 if (inst->src[i].file == GRF &&
2622 remap[inst->src[i].reg] != -1 &&
2623 remap[inst->src[i].reg] != inst->src[i].reg) {
2624 inst->src[i].reg = remap[inst->src[i].reg];
2625 progress = true;
2626 }
2627 }
2628
2629 const int dst = inst->dst.reg;
2630
2631 if (depth == 0 &&
2632 inst->dst.file == GRF &&
2633 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2634 !inst->is_partial_write()) {
2635 if (remap[dst] == -1) {
2636 remap[dst] = dst;
2637 } else {
2638 remap[dst] = alloc.allocate(inst->dst.width / 8);
2639 inst->dst.reg = remap[dst];
2640 progress = true;
2641 }
2642 } else if (inst->dst.file == GRF &&
2643 remap[dst] != -1 &&
2644 remap[dst] != dst) {
2645 inst->dst.reg = remap[dst];
2646 progress = true;
2647 }
2648 }
2649
2650 if (progress) {
2651 invalidate_live_intervals();
2652
2653 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2654 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2655 delta_xy[i].reg = remap[delta_xy[i].reg];
2656 }
2657 }
2658 }
2659
2660 return progress;
2661 }
2662
2663 /**
2664 * Remove redundant or useless discard jumps.
2665 *
2666 * For example, we can eliminate jumps in the following sequence:
2667 *
2668 * discard-jump (redundant with the next jump)
2669 * discard-jump (useless; jumps to the next instruction)
2670 * placeholder-halt
2671 */
2672 bool
2673 fs_visitor::opt_redundant_discard_jumps()
2674 {
2675 bool progress = false;
2676
2677 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2678
2679 fs_inst *placeholder_halt = NULL;
2680 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2681 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2682 placeholder_halt = inst;
2683 break;
2684 }
2685 }
2686
2687 if (!placeholder_halt)
2688 return false;
2689
2690 /* Delete any discard jumps (HALTs) immediately before the placeholder halt. */
2691 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2692 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2693 prev = (fs_inst *) placeholder_halt->prev) {
2694 prev->remove(last_bblock);
2695 progress = true;
2696 }
2697
2698 if (progress)
2699 invalidate_live_intervals();
2700
2701 return progress;
2702 }
2703
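/**
 * Try to turn "compute into a GRF, then MOV the GRF into an MRF" into the
 * computation writing the MRF directly, eliminating the copy (only relevant
 * on Gen4-6, since Gen7+ has no MRFs).
 */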
2704 bool
2705 fs_visitor::compute_to_mrf()
2706 {
2707 bool progress = false;
2708 int next_ip = 0;
2709
2710 /* No MRFs on Gen >= 7. */
2711 if (devinfo->gen >= 7)
2712 return false;
2713
2714 calculate_live_intervals();
2715
2716 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2717 int ip = next_ip;
2718 next_ip++;
2719
2720 if (inst->opcode != BRW_OPCODE_MOV ||
2721 inst->is_partial_write() ||
2722 inst->dst.file != MRF || inst->src[0].file != GRF ||
2723 inst->dst.type != inst->src[0].type ||
2724 inst->src[0].abs || inst->src[0].negate ||
2725 !inst->src[0].is_contiguous() ||
2726 inst->src[0].subreg_offset)
2727 continue;
2728
2729 /* Work out which hardware MRF registers are written by this
2730 * instruction.
2731 */
2732 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2733 int mrf_high;
2734 if (inst->dst.reg & BRW_MRF_COMPR4) {
2735 mrf_high = mrf_low + 4;
2736 } else if (inst->exec_size == 16) {
2737 mrf_high = mrf_low + 1;
2738 } else {
2739 mrf_high = mrf_low;
2740 }
2741
2742 /* Can't compute-to-MRF this GRF if someone else was going to
2743 * read it later.
2744 */
2745 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2746 continue;
2747
2748 /* Found a move of a GRF to a MRF. Let's see if we can go
2749 * rewrite the thing that made this GRF to write into the MRF.
2750 */
2751 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2752 if (scan_inst->dst.file == GRF &&
2753 scan_inst->dst.reg == inst->src[0].reg) {
2754 /* Found the last thing to write our reg we want to turn
2755 * into a compute-to-MRF.
2756 */
2757
2758 /* If this one instruction didn't populate all the
2759 * channels, bail. We might be able to rewrite everything
2760 * that writes that reg, but it would require smarter
2761 * tracking to delay the rewriting until complete success.
2762 */
2763 if (scan_inst->is_partial_write())
2764 break;
2765
2766 /* Things returning more than one register would need us to
2767 * understand coalescing out more than one MOV at a time.
2768 */
2769 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2770 break;
2771
2772 /* SEND instructions can't have MRF as a destination. */
2773 if (scan_inst->mlen)
2774 break;
2775
2776 if (devinfo->gen == 6) {
2777 /* gen6 math instructions must have the destination be
2778 * GRF, so no compute-to-MRF for them.
2779 */
2780 if (scan_inst->is_math()) {
2781 break;
2782 }
2783 }
2784
2785 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2786 /* Found the creator of our MRF's source value. */
2787 scan_inst->dst.file = MRF;
2788 scan_inst->dst.reg = inst->dst.reg;
2789 scan_inst->saturate |= inst->saturate;
2790 inst->remove(block);
2791 progress = true;
2792 }
2793 break;
2794 }
2795
2796 /* We don't handle control flow here. Most computation of
2797 * values that end up in MRFs are shortly before the MRF
2798 * write anyway.
2799 */
2800 if (block->start() == scan_inst)
2801 break;
2802
2803 /* You can't read from an MRF, so if someone else reads our
2804 * MRF's source GRF that we wanted to rewrite, that stops us.
2805 */
2806 bool interfered = false;
2807 for (int i = 0; i < scan_inst->sources; i++) {
2808 if (scan_inst->src[i].file == GRF &&
2809 scan_inst->src[i].reg == inst->src[0].reg &&
2810 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2811 interfered = true;
2812 }
2813 }
2814 if (interfered)
2815 break;
2816
2817 if (scan_inst->dst.file == MRF) {
2818 /* If somebody else writes our MRF here, we can't
2819 * compute-to-MRF before that.
2820 */
2821 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2822 int scan_mrf_high;
2823
2824 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2825 scan_mrf_high = scan_mrf_low + 4;
2826 } else if (scan_inst->exec_size == 16) {
2827 scan_mrf_high = scan_mrf_low + 1;
2828 } else {
2829 scan_mrf_high = scan_mrf_low;
2830 }
2831
2832 if (mrf_low == scan_mrf_low ||
2833 mrf_low == scan_mrf_high ||
2834 mrf_high == scan_mrf_low ||
2835 mrf_high == scan_mrf_high) {
2836 break;
2837 }
2838 }
2839
2840 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2841 /* Found a SEND instruction, which means that there are
2842 * live values in MRFs from base_mrf to base_mrf +
2843 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2844 * above it.
2845 */
2846 if (mrf_low >= scan_inst->base_mrf &&
2847 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2848 break;
2849 }
2850 if (mrf_high >= scan_inst->base_mrf &&
2851 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2852 break;
2853 }
2854 }
2855 }
2856 }
2857
2858 if (progress)
2859 invalidate_live_intervals();
2860
2861 return progress;
2862 }
2863
2864 /**
2865 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2866 * flow. We could probably do better here with some form of divergence
2867 * analysis.
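 *
 * Outside of control flow the pass assumes channel 0 is enabled and simply
 * replaces the instruction with "mov dst, 0".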
2868 */
2869 bool
2870 fs_visitor::eliminate_find_live_channel()
2871 {
2872 bool progress = false;
2873 unsigned depth = 0;
2874
2875 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2876 switch (inst->opcode) {
2877 case BRW_OPCODE_IF:
2878 case BRW_OPCODE_DO:
2879 depth++;
2880 break;
2881
2882 case BRW_OPCODE_ENDIF:
2883 case BRW_OPCODE_WHILE:
2884 depth--;
2885 break;
2886
2887 case FS_OPCODE_DISCARD_JUMP:
2888 /* This can potentially make control flow non-uniform until the end
2889 * of the program.
2890 */
2891 return progress;
2892
2893 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2894 if (depth == 0) {
2895 inst->opcode = BRW_OPCODE_MOV;
2896 inst->src[0] = fs_reg(0);
2897 inst->sources = 1;
2898 inst->force_writemask_all = true;
2899 progress = true;
2900 }
2901 break;
2902
2903 default:
2904 break;
2905 }
2906 }
2907
2908 return progress;
2909 }
2910
2911 /**
2912 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2913 * instructions to FS_OPCODE_REP_FB_WRITE.
2914 */
2915 void
2916 fs_visitor::emit_repclear_shader()
2917 {
2918 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2919 int base_mrf = 1;
2920 int color_mrf = base_mrf + 2;
2921
2922 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2923 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2924 mov->force_writemask_all = true;
2925
2926 fs_inst *write;
2927 if (key->nr_color_regions == 1) {
2928 write = emit(FS_OPCODE_REP_FB_WRITE);
2929 write->saturate = key->clamp_fragment_color;
2930 write->base_mrf = color_mrf;
2931 write->target = 0;
2932 write->header_size = 0;
2933 write->mlen = 1;
2934 } else {
2935 assume(key->nr_color_regions > 0);
2936 for (int i = 0; i < key->nr_color_regions; ++i) {
2937 write = emit(FS_OPCODE_REP_FB_WRITE);
2938 write->saturate = key->clamp_fragment_color;
2939 write->base_mrf = base_mrf;
2940 write->target = i;
2941 write->header_size = 2;
2942 write->mlen = 3;
2943 }
2944 }
2945 write->eot = true;
2946
2947 calculate_cfg();
2948
2949 assign_constant_locations();
2950 assign_curb_setup();
2951
2952 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2953 assert(mov->src[0].file == HW_REG);
2954 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2955 }
2956
2957 /**
2958 * Walks through basic blocks, looking for repeated MRF writes and
2959 * removing the later ones.
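 *
 * E.g. (illustrative) two identical "mov m3, vgrf5" writes with no
 * intervening write to m3 or to vgrf5: the second MOV is removed.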
2960 */
2961 bool
2962 fs_visitor::remove_duplicate_mrf_writes()
2963 {
2964 fs_inst *last_mrf_move[16];
2965 bool progress = false;
2966
2967 /* Need to update the MRF tracking for compressed instructions. */
2968 if (dispatch_width == 16)
2969 return false;
2970
2971 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2972
2973 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2974 if (inst->is_control_flow()) {
2975 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2976 }
2977
2978 if (inst->opcode == BRW_OPCODE_MOV &&
2979 inst->dst.file == MRF) {
2980 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2981 if (prev_inst && inst->equals(prev_inst)) {
2982 inst->remove(block);
2983 progress = true;
2984 continue;
2985 }
2986 }
2987
2988 /* Clear out the last-write records for MRFs that were overwritten. */
2989 if (inst->dst.file == MRF) {
2990 last_mrf_move[inst->dst.reg] = NULL;
2991 }
2992
2993 if (inst->mlen > 0 && inst->base_mrf != -1) {
2994 /* Found a SEND instruction, which will include two or fewer
2995 * implied MRF writes. We could do better here.
2996 */
2997 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2998 last_mrf_move[inst->base_mrf + i] = NULL;
2999 }
3000 }
3001
3002 /* Clear out any MRF move records whose sources got overwritten. */
3003 if (inst->dst.file == GRF) {
3004 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3005 if (last_mrf_move[i] &&
3006 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3007 last_mrf_move[i] = NULL;
3008 }
3009 }
3010 }
3011
3012 if (inst->opcode == BRW_OPCODE_MOV &&
3013 inst->dst.file == MRF &&
3014 inst->src[0].file == GRF &&
3015 !inst->is_partial_write()) {
3016 last_mrf_move[inst->dst.reg] = inst;
3017 }
3018 }
3019
3020 if (progress)
3021 invalidate_live_intervals();
3022
3023 return progress;
3024 }
3025
3026 static void
3027 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3028 {
3029 /* Clear the flag for registers that actually got read (as expected). */
3030 for (int i = 0; i < inst->sources; i++) {
3031 int grf;
3032 if (inst->src[i].file == GRF) {
3033 grf = inst->src[i].reg;
3034 } else if (inst->src[i].file == HW_REG &&
3035 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3036 grf = inst->src[i].fixed_hw_reg.nr;
3037 } else {
3038 continue;
3039 }
3040
3041 if (grf >= first_grf &&
3042 grf < first_grf + grf_len) {
3043 deps[grf - first_grf] = false;
3044 if (inst->exec_size == 16)
3045 deps[grf - first_grf + 1] = false;
3046 }
3047 }
3048 }
3049
3050 /**
3051 * Implements this workaround for the original 965:
3052 *
3053 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3054 * check for post destination dependencies on this instruction, software
3055 * must ensure that there is no destination hazard for the case of ‘write
3056 * followed by a posted write’ shown in the following example.
3057 *
3058 * 1. mov r3 0
3059 * 2. send r3.xy <rest of send instruction>
3060 * 3. mov r2 r3
3061 *
3062 * Due to no post-destination dependency check on the ‘send’, the above
3063 * code sequence could have two instructions (1 and 2) in flight at the
3064 * same time that both consider ‘r3’ as the target of their final writes.
3065 */
3066 void
3067 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3068 fs_inst *inst)
3069 {
3070 int write_len = inst->regs_written;
3071 int first_write_grf = inst->dst.reg;
3072 bool needs_dep[BRW_MAX_MRF];
3073 assert(write_len < (int)sizeof(needs_dep) - 1);
3074
3075 memset(needs_dep, false, sizeof(needs_dep));
3076 memset(needs_dep, true, write_len);
3077
3078 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3079
3080 /* Walk backwards looking for writes to registers we're writing which
3081 * aren't read since being written. If we hit the start of the program,
3082 * we assume that there are no outstanding dependencies on entry to the
3083 * program.
3084 */
3085 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3086 /* If we hit control flow, assume that there *are* outstanding
3087 * dependencies, and force their cleanup before our instruction.
3088 */
3089 if (block->start() == scan_inst) {
3090 for (int i = 0; i < write_len; i++) {
3091 if (needs_dep[i])
3092 DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
3093 }
3094 return;
3095 }
3096
3097 /* We insert our reads as late as possible on the assumption that any
3098 * instruction but a MOV that might have left us an outstanding
3099 * dependency has more latency than a MOV.
3100 */
3101 if (scan_inst->dst.file == GRF) {
3102 for (int i = 0; i < scan_inst->regs_written; i++) {
3103 int reg = scan_inst->dst.reg + i;
3104
3105 if (reg >= first_write_grf &&
3106 reg < first_write_grf + write_len &&
3107 needs_dep[reg - first_write_grf]) {
3108 DEP_RESOLVE_MOV(bld.at(block, inst), reg);
3109 needs_dep[reg - first_write_grf] = false;
3110 if (scan_inst->exec_size == 16)
3111 needs_dep[reg - first_write_grf + 1] = false;
3112 }
3113 }
3114 }
3115
3116 /* Clear the flag for registers that actually got read (as expected). */
3117 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3118
3119 /* Continue the loop only if we haven't resolved all the dependencies */
3120 int i;
3121 for (i = 0; i < write_len; i++) {
3122 if (needs_dep[i])
3123 break;
3124 }
3125 if (i == write_len)
3126 return;
3127 }
3128 }
3129
3130 /**
3131 * Implements this workaround for the original 965:
3132 *
3133 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3134 * used as a destination register until after it has been sourced by an
3135 * instruction with a different destination register.
3136 */
3137 void
3138 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3139 {
3140 int write_len = inst->regs_written;
3141 int first_write_grf = inst->dst.reg;
3142 bool needs_dep[BRW_MAX_MRF];
3143 assert(write_len < (int)sizeof(needs_dep) - 1);
3144
3145 memset(needs_dep, false, sizeof(needs_dep));
3146 memset(needs_dep, true, write_len);
3147 /* Walk forwards looking for writes to registers we're writing which aren't
3148 * read before being written.
3149 */
3150 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3151 /* If we hit control flow, force resolve all remaining dependencies. */
3152 if (block->end() == scan_inst) {
3153 for (int i = 0; i < write_len; i++) {
3154 if (needs_dep[i])
3155 DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
3156 }
3157 return;
3158 }
3159
3160 /* Clear the flag for registers that actually got read (as expected). */
3161 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3162
3163 /* We insert our reads as late as possible since they're reading the
3164 * result of a SEND, which has massive latency.
3165 */
3166 if (scan_inst->dst.file == GRF &&
3167 scan_inst->dst.reg >= first_write_grf &&
3168 scan_inst->dst.reg < first_write_grf + write_len &&
3169 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3170 DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
3171 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3172 }
3173
3174 /* Continue the loop only if we haven't resolved all the dependencies */
3175 int i;
3176 for (i = 0; i < write_len; i++) {
3177 if (needs_dep[i])
3178 break;
3179 }
3180 if (i == write_len)
3181 return;
3182 }
3183 }
3184
3185 void
3186 fs_visitor::insert_gen4_send_dependency_workarounds()
3187 {
3188 if (devinfo->gen != 4 || devinfo->is_g4x)
3189 return;
3190
3191 bool progress = false;
3192
3193 /* Note that we're done with register allocation, so GRF fs_regs always
3194 * have a .reg_offset of 0.
3195 */
3196
3197 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3198 if (inst->mlen != 0 && inst->dst.file == GRF) {
3199 insert_gen4_pre_send_dependency_workarounds(block, inst);
3200 insert_gen4_post_send_dependency_workarounds(block, inst);
3201 progress = true;
3202 }
3203 }
3204
3205 if (progress)
3206 invalidate_live_intervals();
3207 }
3208
3209 /**
3210 * Turns the generic expression-style uniform pull constant load instruction
3211 * into a hardware-specific series of instructions for loading a pull
3212 * constant.
3213 *
3214 * The expression style allows the CSE pass before this to optimize out
3215 * repeated loads from the same offset, and gives the pre-register-allocation
3216 * scheduling full flexibility, while the conversion to native instructions
3217 * allows the post-register-allocation scheduler the best information
3218 * possible.
3219 *
3220 * Note that execution masking for setting up pull constant loads is special:
3221 * the channels that need to be written are unrelated to the current execution
3222 * mask, since a later instruction will use one of the result channels as a
3223 * source operand for all 8 or 16 of its channels.
3224 */
3225 void
3226 fs_visitor::lower_uniform_pull_constant_loads()
3227 {
3228 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3229 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3230 continue;
3231
3232 if (devinfo->gen >= 7) {
3233 /* The offset arg before was a vec4-aligned byte offset. We need to
3234 * turn it into a dword offset.
3235 */
3236 fs_reg const_offset_reg = inst->src[1];
3237 assert(const_offset_reg.file == IMM &&
3238 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3239 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3240 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3241
3242 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3243 * Reserve space for the register.
3244 */
3245 if (devinfo->gen >= 9) {
3246 payload.reg_offset++;
3247 alloc.sizes[payload.reg] = 2;
3248 }
3249
3250 /* This is actually going to be a MOV, but since only the first dword
3251 * is accessed, we have a special opcode to do just that one. Note
3252 * that this needs to be an operation that will be considered a def
3253 * by live variable analysis, or register allocation will explode.
3254 */
3255 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3256 8, payload, const_offset_reg);
3257 setup->force_writemask_all = true;
3258
3259 setup->ir = inst->ir;
3260 setup->annotation = inst->annotation;
3261 inst->insert_before(block, setup);
3262
3263 /* Similarly, this will only populate the first 4 channels of the
3264 * result register (since we only use smear values from 0-3), but we
3265 * don't tell the optimizer.
3266 */
3267 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3268 inst->src[1] = payload;
3269
3270 invalidate_live_intervals();
3271 } else {
3272 /* Before register allocation, we didn't tell the scheduler about the
3273 * MRF we use. We know it's safe to use this MRF because nothing
3274 * else does except for register spill/unspill, which generates and
3275 * uses its MRF within a single IR instruction.
3276 */
3277 inst->base_mrf = 14;
3278 inst->mlen = 1;
3279 }
3280 }
3281 }
3282
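/**
 * Expand SHADER_OPCODE_LOAD_PAYLOAD into the series of MOVs that actually
 * assemble the message payload, handling header sources and the COMPR4
 * interleaving used for Gen4-5 framebuffer writes.
 */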
3283 bool
3284 fs_visitor::lower_load_payload()
3285 {
3286 bool progress = false;
3287
3288 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3289 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3290 continue;
3291
3292 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3293 assert(inst->saturate == false);
3294
3295 const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
3296 .exec_all(inst->force_writemask_all)
3297 .at(block, inst);
3298 fs_reg dst = inst->dst;
3299
3300 /* Get rid of COMPR4. We'll add it back in if we need it */
3301 if (dst.file == MRF)
3302 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3303
3304 dst.width = 8;
3305 for (uint8_t i = 0; i < inst->header_size; i++) {
3306 if (inst->src[i].file != BAD_FILE) {
3307 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3308 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3309 mov_src.width = 8;
3310 ibld.exec_all().MOV(mov_dst, mov_src);
3311 }
3312 dst = offset(dst, 1);
3313 }
3314
3315 dst.width = inst->exec_size;
3316 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3317 inst->exec_size > 8) {
3318 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3319 * a straightforward copy. Instead, the result of the
3320 * LOAD_PAYLOAD is treated as interleaved and the first four
3321 * non-header sources are unpacked as:
3322 *
3323 * m + 0: r0
3324 * m + 1: g0
3325 * m + 2: b0
3326 * m + 3: a0
3327 * m + 4: r1
3328 * m + 5: g1
3329 * m + 6: b1
3330 * m + 7: a1
3331 *
3332 * This is used for gen <= 5 fb writes.
3333 */
3334 assert(inst->exec_size == 16);
3335 assert(inst->header_size + 4 <= inst->sources);
3336 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3337 if (inst->src[i].file != BAD_FILE) {
3338 if (devinfo->has_compr4) {
3339 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3340 compr4_dst.reg |= BRW_MRF_COMPR4;
3341 ibld.MOV(compr4_dst, inst->src[i]);
3342 } else {
3343 /* Platform doesn't have COMPR4. We have to fake it */
3344 fs_reg mov_dst = retype(dst, inst->src[i].type);
3345 mov_dst.width = 8;
3346 ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
3347 ibld.half(1).MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3348 }
3349 }
3350
3351 dst.reg++;
3352 }
3353
3354 /* The loop above only ever incremented us through the first set
3355 * of 4 registers. However, thanks to the magic of COMPR4, we
3356 * actually wrote to the first 8 registers, so we need to take
3357 * that into account now.
3358 */
3359 dst.reg += 4;
3360
3361 /* The COMPR4 code took care of the first 4 sources. We'll let
3362 * the regular path handle any remaining sources. Yes, we are
3363 * modifying the instruction but we're about to delete it so
3364 * this really doesn't hurt anything.
3365 */
3366 inst->header_size += 4;
3367 }
3368
3369 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3370 if (inst->src[i].file != BAD_FILE)
3371 ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
3372 dst = offset(dst, 1);
3373 }
3374
3375 inst->remove(block);
3376 progress = true;
3377 }
3378
3379 if (progress)
3380 invalidate_live_intervals();
3381
3382 return progress;
3383 }
3384
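/**
 * Lower 32-bit x 32-bit integer MULs that the hardware can't do in a single
 * instruction into either a single 16-bit-source MUL (when multiplying by a
 * small immediate) or the partial-product sequence described in the
 * comments below.
 */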
3385 bool
3386 fs_visitor::lower_integer_multiplication()
3387 {
3388 bool progress = false;
3389
3390 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3391 * directly, but Cherryview cannot.
3392 */
3393 if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3394 return false;
3395
3396 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3397 if (inst->opcode != BRW_OPCODE_MUL ||
3398 inst->dst.is_accumulator() ||
3399 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3400 inst->dst.type != BRW_REGISTER_TYPE_UD))
3401 continue;
3402
3403 const fs_builder ibld = bld.at(block, inst);
3404
3405 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3406 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3407 * src1 are used.
3408 *
3409 * If multiplying by an immediate value that fits in 16-bits, do a
3410 * single MUL instruction with that value in the proper location.
3411 */
3412 if (inst->src[1].file == IMM &&
3413 inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3414 if (devinfo->gen < 7) {
3415 fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3416 inst->dst.type, dispatch_width);
3417 ibld.MOV(imm, inst->src[1]);
3418 ibld.MUL(inst->dst, imm, inst->src[0]);
3419 } else {
3420 ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
3421 }
3422 } else {
3423 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3424 * do 32-bit integer multiplication in one instruction, but instead
3425 * must do a sequence (which actually calculates a 64-bit result):
3426 *
3427 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3428 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3429 * mov(8) g2<1>D acc0<8,8,1>D
3430 *
3431 * But on Gen > 6, the ability to use second accumulator register
3432 * (acc1) for non-float data types was removed, preventing a simple
3433 * implementation in SIMD16. A 16-channel result can be calculated by
3434 * executing the three instructions twice in SIMD8, once with quarter
3435 * control of 1Q for the first eight channels and again with 2Q for
3436 * the second eight channels.
3437 *
3438 * Which accumulator register is implicitly accessed (by AccWrEnable
3439 * for instance) is determined by the quarter control. Unfortunately
3440 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3441 * implicit accumulator access by an instruction with 2Q will access
3442 * acc1 regardless of whether the data type is usable in acc1.
3443 *
3444 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3445 * integer data types.
3446 *
3447 * Since we only want the low 32-bits of the result, we can do two
3448 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3449 * adjust the high result and add them (like the mach is doing):
3450 *
3451 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3452 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3453 * shl(8) g9<1>D g8<8,8,1>D 16D
3454 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3455 *
3456 * We avoid the shl instruction by realizing that we only want to add
3457 * the low 16-bits of the "high" result to the high 16-bits of the
3458 * "low" result and using proper regioning on the add:
3459 *
3460 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3461 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3462 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3463 *
3464 * Since it does not use the (single) accumulator register, we can
3465 * schedule multi-component multiplications much better.
3466 */
3467
3468 if (inst->conditional_mod && inst->dst.is_null()) {
3469 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3470 inst->dst.type, dispatch_width);
3471 }
3472 fs_reg low = inst->dst;
3473 fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3474 inst->dst.type, dispatch_width);
3475
3476 if (devinfo->gen >= 7) {
3477 fs_reg src1_0_w = inst->src[1];
3478 fs_reg src1_1_w = inst->src[1];
3479
3480 if (inst->src[1].file == IMM) {
3481 src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3482 src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3483 } else {
3484 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3485 src1_0_w.stride = 2;
3486
3487 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3488 src1_1_w.stride = 2;
3489 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3490 }
3491 ibld.MUL(low, inst->src[0], src1_0_w);
3492 ibld.MUL(high, inst->src[0], src1_1_w);
3493 } else {
3494 fs_reg src0_0_w = inst->src[0];
3495 fs_reg src0_1_w = inst->src[0];
3496
3497 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3498 src0_0_w.stride = 2;
3499
3500 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3501 src0_1_w.stride = 2;
3502 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3503
3504 ibld.MUL(low, src0_0_w, inst->src[1]);
3505 ibld.MUL(high, src0_1_w, inst->src[1]);
3506 }
3507
3508 fs_reg dst = inst->dst;
3509 dst.type = BRW_REGISTER_TYPE_UW;
3510 dst.subreg_offset = 2;
3511 dst.stride = 2;
3512
3513 high.type = BRW_REGISTER_TYPE_UW;
3514 high.stride = 2;
3515
3516 low.type = BRW_REGISTER_TYPE_UW;
3517 low.subreg_offset = 2;
3518 low.stride = 2;
3519
3520 ibld.ADD(dst, low, high);
3521
3522 if (inst->conditional_mod) {
3523 fs_reg null(retype(brw_null_reg(), inst->dst.type));
3524 set_condmod(inst->conditional_mod,
3525 ibld.MOV(null, inst->dst));
3526 }
3527 }
3528
3529 inst->remove(block);
3530 progress = true;
3531 }
3532
3533 if (progress)
3534 invalidate_live_intervals();
3535
3536 return progress;
3537 }
3538
3539 void
3540 fs_visitor::dump_instructions()
3541 {
3542 dump_instructions(NULL);
3543 }
3544
3545 void
3546 fs_visitor::dump_instructions(const char *name)
3547 {
3548 FILE *file = stderr;
3549 if (name && geteuid() != 0) {
3550 file = fopen(name, "w");
3551 if (!file)
3552 file = stderr;
3553 }
3554
3555 if (cfg) {
3556 calculate_register_pressure();
3557 int ip = 0, max_pressure = 0;
3558 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3559 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3560 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3561 dump_instruction(inst, file);
3562 ip++;
3563 }
3564 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3565 } else {
3566 int ip = 0;
3567 foreach_in_list(backend_instruction, inst, &instructions) {
3568 fprintf(file, "%4d: ", ip++);
3569 dump_instruction(inst, file);
3570 }
3571 }
3572
3573 if (file != stderr) {
3574 fclose(file);
3575 }
3576 }
3577
3578 void
3579 fs_visitor::dump_instruction(backend_instruction *be_inst)
3580 {
3581 dump_instruction(be_inst, stderr);
3582 }
3583
3584 void
3585 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3586 {
3587 fs_inst *inst = (fs_inst *)be_inst;
3588
3589 if (inst->predicate) {
3590 fprintf(file, "(%cf0.%d) ",
3591 inst->predicate_inverse ? '-' : '+',
3592 inst->flag_subreg);
3593 }
3594
3595 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3596 if (inst->saturate)
3597 fprintf(file, ".sat");
3598 if (inst->conditional_mod) {
3599 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3600 if (!inst->predicate &&
3601 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3602 inst->opcode != BRW_OPCODE_IF &&
3603 inst->opcode != BRW_OPCODE_WHILE))) {
3604 fprintf(file, ".f0.%d", inst->flag_subreg);
3605 }
3606 }
3607 fprintf(file, "(%d) ", inst->exec_size);
3608
3609 if (inst->mlen) {
3610 fprintf(file, "(mlen: %d) ", inst->mlen);
3611 }
3612
3613 switch (inst->dst.file) {
3614 case GRF:
3615 fprintf(file, "vgrf%d", inst->dst.reg);
3616 if (inst->dst.width != dispatch_width)
3617 fprintf(file, "@%d", inst->dst.width);
3618 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3619 inst->dst.subreg_offset)
3620 fprintf(file, "+%d.%d",
3621 inst->dst.reg_offset, inst->dst.subreg_offset);
3622 break;
3623 case MRF:
3624 fprintf(file, "m%d", inst->dst.reg);
3625 break;
3626 case BAD_FILE:
3627 fprintf(file, "(null)");
3628 break;
3629 case UNIFORM:
3630 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3631 break;
3632 case ATTR:
3633 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3634 break;
3635 case HW_REG:
3636 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3637 switch (inst->dst.fixed_hw_reg.nr) {
3638 case BRW_ARF_NULL:
3639 fprintf(file, "null");
3640 break;
3641 case BRW_ARF_ADDRESS:
3642 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3643 break;
3644 case BRW_ARF_ACCUMULATOR:
3645 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3646 break;
3647 case BRW_ARF_FLAG:
3648 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3649 inst->dst.fixed_hw_reg.subnr);
3650 break;
3651 default:
3652 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3653 inst->dst.fixed_hw_reg.subnr);
3654 break;
3655 }
3656 } else {
3657 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3658 }
3659 if (inst->dst.fixed_hw_reg.subnr)
3660 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3661 break;
3662 default:
3663 fprintf(file, "???");
3664 break;
3665 }
3666 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3667
3668 for (int i = 0; i < inst->sources; i++) {
3669 if (inst->src[i].negate)
3670 fprintf(file, "-");
3671 if (inst->src[i].abs)
3672 fprintf(file, "|");
3673 switch (inst->src[i].file) {
3674 case GRF:
3675 fprintf(file, "vgrf%d", inst->src[i].reg);
3676 if (inst->src[i].width != dispatch_width)
3677 fprintf(file, "@%d", inst->src[i].width);
3678 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3679 inst->src[i].subreg_offset)
3680 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3681 inst->src[i].subreg_offset);
3682 break;
3683 case MRF:
3684 fprintf(file, "***m%d***", inst->src[i].reg);
3685 break;
3686 case ATTR:
3687 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3688 break;
3689 case UNIFORM:
3690 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3691 if (inst->src[i].reladdr) {
3692 fprintf(file, "+reladdr");
3693 } else if (inst->src[i].subreg_offset) {
3694 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3695 inst->src[i].subreg_offset);
3696 }
3697 break;
3698 case BAD_FILE:
3699 fprintf(file, "(null)");
3700 break;
3701 case IMM:
3702 switch (inst->src[i].type) {
3703 case BRW_REGISTER_TYPE_F:
3704 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3705 break;
3706 case BRW_REGISTER_TYPE_W:
3707 case BRW_REGISTER_TYPE_D:
3708 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3709 break;
3710 case BRW_REGISTER_TYPE_UW:
3711 case BRW_REGISTER_TYPE_UD:
3712 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3713 break;
3714 case BRW_REGISTER_TYPE_VF:
3715 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3716 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3717 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3718 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3719 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3720 break;
3721 default:
3722 fprintf(file, "???");
3723 break;
3724 }
3725 break;
3726 case HW_REG:
3727 if (inst->src[i].fixed_hw_reg.negate)
3728 fprintf(file, "-");
3729 if (inst->src[i].fixed_hw_reg.abs)
3730 fprintf(file, "|");
3731 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3732 switch (inst->src[i].fixed_hw_reg.nr) {
3733 case BRW_ARF_NULL:
3734 fprintf(file, "null");
3735 break;
3736 case BRW_ARF_ADDRESS:
3737 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3738 break;
3739 case BRW_ARF_ACCUMULATOR:
3740 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3741 break;
3742 case BRW_ARF_FLAG:
3743 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3744 inst->src[i].fixed_hw_reg.subnr);
3745 break;
3746 default:
3747 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3748 inst->src[i].fixed_hw_reg.subnr);
3749 break;
3750 }
3751 } else {
3752 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3753 }
3754 if (inst->src[i].fixed_hw_reg.subnr)
3755 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3756 if (inst->src[i].fixed_hw_reg.abs)
3757 fprintf(file, "|");
3758 break;
3759 default:
3760 fprintf(file, "???");
3761 break;
3762 }
3763 if (inst->src[i].abs)
3764 fprintf(file, "|");
3765
3766 if (inst->src[i].file != IMM) {
3767 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3768 }
3769
3770 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3771 fprintf(file, ", ");
3772 }
3773
3774 fprintf(file, " ");
3775
3776 if (dispatch_width == 16 && inst->exec_size == 8) {
3777 if (inst->force_sechalf)
3778 fprintf(file, "2ndhalf ");
3779 else
3780 fprintf(file, "1sthalf ");
3781 }
3782
3783 fprintf(file, "\n");
3784 }
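
/* Illustrative only -- a few lines of dump_instruction() output might look
 * like the following (the register numbers are made up, but the fields appear
 * exactly in the order printed above):
 *
 *   (+f0.1) sel(16) vgrf8:F, vgrf8:F, vgrf9:F
 *   cmp.ge.f0.1(16) null:F, vgrf12:F, 0.000000f
 *   mov(8) vgrf7:F, u3:F 2ndhalf
 */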
3785
3786 /**
3787 * Possibly returns an instruction that set up @param reg.
3788 *
3789 * Sometimes we want to take the result of some expression/variable
3790 * dereference tree and rewrite the instruction generating the result
3791 * of the tree. When processing the tree, we know that the
3792 * instructions generated are all writing temporaries that are dead
3793 * outside of this tree. So, if we have some instructions that write
3794 * a temporary, we're free to point that temp write somewhere else.
3795 *
3796 * Note that this doesn't guarantee that the returned instruction wrote
3797 * only @param reg -- it might be the size=4 destination of a texture instruction.
3798 */
3799 fs_inst *
3800 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3801 fs_inst *end,
3802 const fs_reg &reg)
3803 {
3804 if (end == start ||
3805 end->is_partial_write() ||
3806 reg.reladdr ||
3807 !reg.equals(end->dst)) {
3808 return NULL;
3809 } else {
3810 return end;
3811 }
3812 }
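
/* Hypothetical usage sketch (not taken from any specific caller): after
 * emitting the instructions for an expression into a temporary, a caller can
 * retarget the final write instead of emitting an extra MOV:
 *
 *   fs_inst *write = get_instruction_generating_reg(start, end, temp);
 *   if (write)
 *      write->dst = real_dst;            // rewrite the dead temporary write
 *   else
 *      ...emit a MOV from temp to real_dst instead...
 */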
3813
3814 void
3815 fs_visitor::setup_payload_gen6()
3816 {
3817 bool uses_depth =
3818 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3819 unsigned barycentric_interp_modes =
3820 (stage == MESA_SHADER_FRAGMENT) ?
3821 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3822
3823 assert(devinfo->gen >= 6);
3824
3825 /* R0-1: masks, pixel X/Y coordinates. */
3826 payload.num_regs = 2;
3827 /* R2: only for 32-pixel dispatch. */
3828
3829 /* R3-26: barycentric interpolation coordinates. These appear in the
3830 * same order that they appear in the brw_wm_barycentric_interp_mode
3831 * enum. Each set of coordinates occupies 2 registers if dispatch width
3832 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3833 * appear if they were enabled using the "Barycentric Interpolation
3834 * Mode" bits in WM_STATE.
3835 */
3836 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3837 if (barycentric_interp_modes & (1 << i)) {
3838 payload.barycentric_coord_reg[i] = payload.num_regs;
3839 payload.num_regs += 2;
3840 if (dispatch_width == 16) {
3841 payload.num_regs += 2;
3842 }
3843 }
3844 }
3845
3846 /* R27: interpolated depth if uses source depth */
3847 if (uses_depth) {
3848 payload.source_depth_reg = payload.num_regs;
3849 payload.num_regs++;
3850 if (dispatch_width == 16) {
3851 /* R28: interpolated depth if not SIMD8. */
3852 payload.num_regs++;
3853 }
3854 }
3855 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3856 if (uses_depth) {
3857 payload.source_w_reg = payload.num_regs;
3858 payload.num_regs++;
3859 if (dispatch_width == 16) {
3860 /* R30: interpolated W if not SIMD8. */
3861 payload.num_regs++;
3862 }
3863 }
3864
3865 if (stage == MESA_SHADER_FRAGMENT) {
3866 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3867 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3868 prog_data->uses_pos_offset = key->compute_pos_offset;
3869 /* R31: MSAA position offsets. */
3870 if (prog_data->uses_pos_offset) {
3871 payload.sample_pos_reg = payload.num_regs;
3872 payload.num_regs++;
3873 }
3874 }
3875
3876 /* R32: MSAA input coverage mask */
3877 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3878 assert(devinfo->gen >= 7);
3879 payload.sample_mask_in_reg = payload.num_regs;
3880 payload.num_regs++;
3881 if (dispatch_width == 16) {
3882 /* R33: input coverage mask if not SIMD8. */
3883 payload.num_regs++;
3884 }
3885 }
3886
3887 /* R34-: bary for 32-pixel. */
3888 /* R58-59: interp W for 32-pixel. */
3889
3890 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3891 source_depth_to_render_target = true;
3892 }
3893 }
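
/* Worked example (hypothetical shader): a SIMD16 program using only
 * perspective pixel barycentrics plus source depth and W would be laid out as
 *
 *   R0-R1  masks, pixel X/Y coordinates      (payload.num_regs starts at 2)
 *   R2-R5  perspective pixel barycentrics    (4 registers in SIMD16)
 *   R6-R7  interpolated source depth
 *   R8-R9  interpolated source W
 *
 * leaving payload.num_regs == 10 when this function returns.
 */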
3894
3895 void
3896 fs_visitor::setup_vs_payload()
3897 {
3898 /* R0: thread header, R1: urb handles */
3899 payload.num_regs = 2;
3900 }
3901
3902 void
3903 fs_visitor::setup_cs_payload()
3904 {
3905 assert(brw->gen >= 7);
3906
3907 payload.num_regs = 1;
3908 }
3909
3910 void
3911 fs_visitor::assign_binding_table_offsets()
3912 {
3913 assert(stage == MESA_SHADER_FRAGMENT);
3914 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3915 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3916 uint32_t next_binding_table_offset = 0;
3917
3918 /* If there are no color regions, we still perform an FB write to a null
3919 * renderbuffer, which we place at surface index 0.
3920 */
3921 prog_data->binding_table.render_target_start = next_binding_table_offset;
3922 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3923
3924 assign_common_binding_table_offsets(next_binding_table_offset);
3925 }
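
/* For example (hypothetical key): with key->nr_color_regions == 2 the render
 * targets occupy surface indices 0 and 1 and the common entries laid out by
 * assign_common_binding_table_offsets() (textures, pull constants, etc.)
 * start at index 2; with no color regions the null render target still
 * claims index 0.
 */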
3926
3927 void
3928 fs_visitor::calculate_register_pressure()
3929 {
3930 invalidate_live_intervals();
3931 calculate_live_intervals();
3932
3933 unsigned num_instructions = 0;
3934 foreach_block(block, cfg)
3935 num_instructions += block->instructions.length();
3936
3937 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3938
3939 for (unsigned reg = 0; reg < alloc.count; reg++) {
3940 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3941 regs_live_at_ip[ip] += alloc.sizes[reg];
3942 }
3943 }
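
/* Rough sketch of the result (hypothetical numbers): a virtual GRF of size 2
 * that is live from ip 10 through ip 14 adds 2 to each of
 * regs_live_at_ip[10]..regs_live_at_ip[14]; dump_instructions() prints these
 * per-ip sums in the "{%3d}" column next to each instruction.
 */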
3944
3945 void
3946 fs_visitor::optimize()
3947 {
3948 /* bld is the common builder object pointing at the end of the program we
3949 * used to translate it into i965 IR. For the optimization and lowering
3950 * passes coming next, any code added after the end of the program without
3951 * having explicitly called fs_builder::at() clearly points at a mistake.
3952 * Ideally optimization passes wouldn't be part of the visitor so they
3953 * wouldn't have access to bld at all, but they do, so in case some pass
3954 * forgets to ask for a location explicitly, set it to NULL here to make
3955 * it trip.
3956 */
3957 bld = bld.at(NULL, NULL);
3958
3959 split_virtual_grfs();
3960
3961 move_uniform_array_access_to_pull_constants();
3962 assign_constant_locations();
3963 demote_pull_constants();
3964
3965 #define OPT(pass, args...) ({ \
3966 pass_num++; \
3967 bool this_progress = pass(args); \
3968 \
3969 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3970 char filename[64]; \
3971 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3972 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3973 \
3974 backend_shader::dump_instructions(filename); \
3975 } \
3976 \
3977 progress = progress || this_progress; \
3978 this_progress; \
3979 })
3980
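/* With the optimizer debug flag set, each pass that makes progress dumps the
 * IR to a file named by the snprintf() above; a hypothetical example would be
 * "FS16-0003-01-05-opt_cse" for a SIMD16 fragment shader of program 3,
 * iteration 1, pass 5 (the "-00-start" dump below uses the same prefix).
 */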
3981 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3982 char filename[64];
3983 snprintf(filename, 64, "%s%d-%04d-00-start",
3984 stage_abbrev, dispatch_width,
3985 shader_prog ? shader_prog->Name : 0);
3986
3987 backend_shader::dump_instructions(filename);
3988 }
3989
3990 bool progress;
3991 int iteration = 0;
3992 int pass_num = 0;
3993 do {
3994 progress = false;
3995 pass_num = 0;
3996 iteration++;
3997
3998 OPT(remove_duplicate_mrf_writes);
3999
4000 OPT(opt_algebraic);
4001 OPT(opt_cse);
4002 OPT(opt_copy_propagate);
4003 OPT(opt_peephole_predicated_break);
4004 OPT(opt_cmod_propagation);
4005 OPT(dead_code_eliminate);
4006 OPT(opt_peephole_sel);
4007 OPT(dead_control_flow_eliminate, this);
4008 OPT(opt_register_renaming);
4009 OPT(opt_redundant_discard_jumps);
4010 OPT(opt_saturate_propagation);
4011 OPT(opt_zero_samples);
4012 OPT(register_coalesce);
4013 OPT(compute_to_mrf);
4014 OPT(eliminate_find_live_channel);
4015
4016 OPT(compact_virtual_grfs);
4017 } while (progress);
4018
4019 pass_num = 0;
4020
4021 OPT(opt_sampler_eot);
4022
4023 if (OPT(lower_load_payload)) {
4024 split_virtual_grfs();
4025 OPT(register_coalesce);
4026 OPT(compute_to_mrf);
4027 OPT(dead_code_eliminate);
4028 }
4029
4030 OPT(opt_combine_constants);
4031 OPT(lower_integer_multiplication);
4032
4033 lower_uniform_pull_constant_loads();
4034 }
4035
4036 /**
4037 * Three-source instructions must have a GRF/MRF destination register.
4038 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
4039 */
4040 void
4041 fs_visitor::fixup_3src_null_dest()
4042 {
4043 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4044 if (inst->is_3src() && inst->dst.is_null()) {
4045 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4046 inst->dst.type);
4047 }
4048 }
4049 }
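
/* For instance (hypothetical case), a MAD whose result became dead but which
 * is still needed for its conditional-modifier flag write may have had its
 * destination nulled out; it gets a freshly allocated vgrf here so the
 * three-source restriction is satisfied without changing its behavior.
 */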
4050
4051 void
4052 fs_visitor::allocate_registers()
4053 {
4054 bool allocated_without_spills;
4055
4056 static const enum instruction_scheduler_mode pre_modes[] = {
4057 SCHEDULE_PRE,
4058 SCHEDULE_PRE_NON_LIFO,
4059 SCHEDULE_PRE_LIFO,
4060 };
4061
4062 /* Try each scheduling heuristic to see if it can successfully register
4063 * allocate without spilling. They should be ordered by decreasing
4064 * performance but increasing likelihood of allocating.
4065 */
4066 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4067 schedule_instructions(pre_modes[i]);
4068
4069 if (0) {
4070 assign_regs_trivial();
4071 allocated_without_spills = true;
4072 } else {
4073 allocated_without_spills = assign_regs(false);
4074 }
4075 if (allocated_without_spills)
4076 break;
4077 }
4078
4079 if (!allocated_without_spills) {
4080 /* We assume that any spilling is worse than just dropping back to
4081 * SIMD8. There's probably actually some intermediate point where
4082 * SIMD16 with a couple of spills is still better.
4083 */
4084 if (dispatch_width == 16) {
4085 fail("Failure to register allocate. Reduce number of "
4086 "live scalar values to avoid this.");
4087 } else {
4088 perf_debug("%s shader triggered register spilling. "
4089 "Try reducing the number of live scalar values to "
4090 "improve performance.\n", stage_name);
4091 }
4092
4093 /* Since we're out of heuristics, just go spill registers until we
4094 * get an allocation.
4095 */
4096 while (!assign_regs(true)) {
4097 if (failed)
4098 break;
4099 }
4100 }
4101
4102 /* This must come after all optimization and register allocation, since
4103 * it inserts dead code that happens to have side effects, and it does
4104 * so based on the actual physical registers in use.
4105 */
4106 insert_gen4_send_dependency_workarounds();
4107
4108 if (failed)
4109 return;
4110
4111 if (!allocated_without_spills)
4112 schedule_instructions(SCHEDULE_POST);
4113
4114 if (last_scratch > 0)
4115 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4116 }
4117
4118 bool
4119 fs_visitor::run_vs()
4120 {
4121 assert(stage == MESA_SHADER_VERTEX);
4122
4123 assign_common_binding_table_offsets(0);
4124 setup_vs_payload();
4125
4126 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4127 emit_shader_time_begin();
4128
4129 emit_nir_code();
4130
4131 if (failed)
4132 return false;
4133
4134 emit_urb_writes();
4135
4136 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4137 emit_shader_time_end();
4138
4139 calculate_cfg();
4140
4141 optimize();
4142
4143 assign_curb_setup();
4144 assign_vs_urb_setup();
4145
4146 fixup_3src_null_dest();
4147 allocate_registers();
4148
4149 return !failed;
4150 }
4151
4152 bool
4153 fs_visitor::run_fs()
4154 {
4155 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4156 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4157
4158 assert(stage == MESA_SHADER_FRAGMENT);
4159
4160 sanity_param_count = prog->Parameters->NumParameters;
4161
4162 assign_binding_table_offsets();
4163
4164 if (devinfo->gen >= 6)
4165 setup_payload_gen6();
4166 else
4167 setup_payload_gen4();
4168
4169 if (0) {
4170 emit_dummy_fs();
4171 } else if (brw->use_rep_send && dispatch_width == 16) {
4172 emit_repclear_shader();
4173 } else {
4174 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4175 emit_shader_time_begin();
4176
4177 calculate_urb_setup();
4178 if (prog->InputsRead > 0) {
4179 if (devinfo->gen < 6)
4180 emit_interpolation_setup_gen4();
4181 else
4182 emit_interpolation_setup_gen6();
4183 }
4184
4185 /* We handle discards by keeping track of the still-live pixels in f0.1.
4186 * Initialize it with the dispatched pixels.
4187 */
4188 if (wm_prog_data->uses_kill) {
4189 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4190 discard_init->flag_subreg = 1;
4191 }
4192
4193 /* Generate FS IR for main(). (the visitor only descends into
4194 * functions called "main").
4195 */
4196 emit_nir_code();
4197
4198 if (failed)
4199 return false;
4200
4201 if (wm_prog_data->uses_kill)
4202 emit(FS_OPCODE_PLACEHOLDER_HALT);
4203
4204 if (wm_key->alpha_test_func)
4205 emit_alpha_test();
4206
4207 emit_fb_writes();
4208
4209 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4210 emit_shader_time_end();
4211
4212 calculate_cfg();
4213
4214 optimize();
4215
4216 assign_curb_setup();
4217 assign_urb_setup();
4218
4219 fixup_3src_null_dest();
4220 allocate_registers();
4221
4222 if (failed)
4223 return false;
4224 }
4225
4226 if (dispatch_width == 8)
4227 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4228 else
4229 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4230
4231 /* If any state parameters were appended, then ParameterValues could have
4232 * been realloced, in which case the driver uniform storage set up by
4233 * _mesa_associate_uniform_storage() would point to freed memory. Make
4234 * sure that didn't happen.
4235 */
4236 assert(sanity_param_count == prog->Parameters->NumParameters);
4237
4238 return !failed;
4239 }
4240
4241 bool
4242 fs_visitor::run_cs()
4243 {
4244 assert(stage == MESA_SHADER_COMPUTE);
4245 assert(shader);
4246
4247 sanity_param_count = prog->Parameters->NumParameters;
4248
4249 assign_common_binding_table_offsets(0);
4250
4251 setup_cs_payload();
4252
4253 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4254 emit_shader_time_begin();
4255
4256 emit_nir_code();
4257
4258 if (failed)
4259 return false;
4260
4261 emit_cs_terminate();
4262
4263 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4264 emit_shader_time_end();
4265
4266 calculate_cfg();
4267
4268 optimize();
4269
4270 assign_curb_setup();
4271
4272 fixup_3src_null_dest();
4273 allocate_registers();
4274
4275 if (failed)
4276 return false;
4277
4278 /* If any state parameters were appended, then ParameterValues could have
4279 * been realloced, in which case the driver uniform storage set up by
4280 * _mesa_associate_uniform_storage() would point to freed memory. Make
4281 * sure that didn't happen.
4282 */
4283 assert(sanity_param_count == prog->Parameters->NumParameters);
4284
4285 return !failed;
4286 }
4287
4288 const unsigned *
4289 brw_wm_fs_emit(struct brw_context *brw,
4290 void *mem_ctx,
4291 const struct brw_wm_prog_key *key,
4292 struct brw_wm_prog_data *prog_data,
4293 struct gl_fragment_program *fp,
4294 struct gl_shader_program *prog,
4295 unsigned *final_assembly_size)
4296 {
4297 bool start_busy = false;
4298 double start_time = 0;
4299
4300 if (unlikely(brw->perf_debug)) {
4301 start_busy = (brw->batch.last_bo &&
4302 drm_intel_bo_busy(brw->batch.last_bo));
4303 start_time = get_time();
4304 }
4305
4306 struct brw_shader *shader = NULL;
4307 if (prog)
4308 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4309
4310 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4311 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4312
4313 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4314 */
4315 fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4316 prog, &fp->Base, 8);
4317 if (!v.run_fs()) {
4318 if (prog) {
4319 prog->LinkStatus = false;
4320 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4321 }
4322
4323 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4324 v.fail_msg);
4325
4326 return NULL;
4327 }
4328
4329 cfg_t *simd16_cfg = NULL;
4330 fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4331 prog, &fp->Base, 16);
4332 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4333 if (!v.simd16_unsupported) {
4334 /* Try a SIMD16 compile */
4335 v2.import_uniforms(&v);
4336 if (!v2.run_fs()) {
4337 perf_debug("SIMD16 shader failed to compile, falling back to "
4338 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4339 } else {
4340 simd16_cfg = v2.cfg;
4341 }
4342 } else {
4343 perf_debug("SIMD16 shader unsupported, falling back to "
4344 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4345 }
4346 }
4347
4348 cfg_t *simd8_cfg;
4349 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4350 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4351 simd8_cfg = NULL;
4352 prog_data->no_8 = true;
4353 } else {
4354 simd8_cfg = v.cfg;
4355 prog_data->no_8 = false;
4356 }
4357
4358 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4359 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4360
4361 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4362 char *name;
4363 if (prog)
4364 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4365 prog->Label ? prog->Label : "unnamed",
4366 prog->Name);
4367 else
4368 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4369
4370 g.enable_debug(name);
4371 }
4372
4373 if (simd8_cfg)
4374 g.generate_code(simd8_cfg, 8);
4375 if (simd16_cfg)
4376 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4377
4378 if (unlikely(brw->perf_debug) && shader) {
4379 if (shader->compiled_once)
4380 brw_wm_debug_recompile(brw, prog, key);
4381 shader->compiled_once = true;
4382
4383 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4384 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4385 (get_time() - start_time) * 1000);
4386 }
4387 }
4388
4389 return g.get_assembly(final_assembly_size);
4390 }
4391
4392 extern "C" bool
4393 brw_fs_precompile(struct gl_context *ctx,
4394 struct gl_shader_program *shader_prog,
4395 struct gl_program *prog)
4396 {
4397 struct brw_context *brw = brw_context(ctx);
4398 struct brw_wm_prog_key key;
4399
4400 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4401 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4402 bool program_uses_dfdy = fp->UsesDFdy;
4403
4404 memset(&key, 0, sizeof(key));
4405
4406 if (brw->gen < 6) {
4407 if (fp->UsesKill)
4408 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4409
4410 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4411 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4412
4413 /* Just assume depth testing. */
4414 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4415 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4416 }
4417
4418 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4419 BRW_FS_VARYING_INPUT_MASK) > 16)
4420 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4421
4422 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4423
4424 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4425 key.drawable_height = ctx->DrawBuffer->Height;
4426 }
4427
4428 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4429 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4430 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4431
4432 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4433 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4434 key.nr_color_regions > 1;
4435 }
4436
4437 key.program_string_id = bfp->id;
4438
4439 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4440 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4441
4442 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4443
4444 brw->wm.base.prog_offset = old_prog_offset;
4445 brw->wm.prog_data = old_prog_data;
4446
4447 return success;
4448 }
4449
4450 void
4451 brw_setup_tex_for_precompile(struct brw_context *brw,
4452 struct brw_sampler_prog_key_data *tex,
4453 struct gl_program *prog)
4454 {
4455 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4456 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4457 for (unsigned i = 0; i < sampler_count; i++) {
4458 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4459 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4460 tex->swizzles[i] =
4461 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4462 } else {
4463 /* Color sampler: assume no swizzling. */
4464 tex->swizzles[i] = SWIZZLE_XYZW;
4465 }
4466 }
4467 }