i965/fs: Migrate FS framebuffer writes to the IR builder.
mesa.git: src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 using namespace brw;
53
54 void
55 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
56 const fs_reg *src, unsigned sources)
57 {
58 memset(this, 0, sizeof(*this));
59
60 this->src = new fs_reg[MAX2(sources, 3)];
61 for (unsigned i = 0; i < sources; i++)
62 this->src[i] = src[i];
63
64 this->opcode = opcode;
65 this->dst = dst;
66 this->sources = sources;
67 this->exec_size = exec_size;
68
69 assert(dst.file != IMM && dst.file != UNIFORM);
70
71 /* If exec_size == 0, try to guess it from the registers. Since all
72 * manner of things may use hardware registers, we first try to guess
73 * based on GRF registers. If this fails, we will go ahead and take the
74 * width from the destination register.
75 */
76 if (this->exec_size == 0) {
77 if (dst.file == GRF) {
78 this->exec_size = dst.width;
79 } else {
80 for (unsigned i = 0; i < sources; ++i) {
81 if (src[i].file != GRF && src[i].file != ATTR)
82 continue;
83
84 if (this->exec_size <= 1)
85 this->exec_size = src[i].width;
86 assert(src[i].width == 1 || src[i].width == this->exec_size);
87 }
88 }
89
90 if (this->exec_size == 0 && dst.file != BAD_FILE)
91 this->exec_size = dst.width;
92 }
93 assert(this->exec_size != 0);
94
95 this->conditional_mod = BRW_CONDITIONAL_NONE;
96
97 /* This will be the case for almost all instructions. */
98 switch (dst.file) {
99 case GRF:
100 case HW_REG:
101 case MRF:
102 case ATTR:
103 this->regs_written =
104 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
105 break;
106 case BAD_FILE:
107 this->regs_written = 0;
108 break;
109 case IMM:
110 case UNIFORM:
111 unreachable("Invalid destination register file");
112 default:
113 unreachable("Invalid register file");
114 }
115
116 this->writes_accumulator = false;
117 }
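/* Illustrative arithmetic for the regs_written calculation above: an 8-wide
 * float destination with stride 1 covers 8 * 1 * 4 = 32 bytes, i.e. one
 * register, while the same destination at 16-wide covers 64 bytes, i.e. two.
 */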
118
119 fs_inst::fs_inst()
120 {
121 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
122 }
123
124 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
125 {
126 init(opcode, exec_size, reg_undef, NULL, 0);
127 }
128
129 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
130 {
131 init(opcode, 0, dst, NULL, 0);
132 }
133
134 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
135 const fs_reg &src0)
136 {
137 const fs_reg src[1] = { src0 };
138 init(opcode, exec_size, dst, src, 1);
139 }
140
141 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
142 {
143 const fs_reg src[1] = { src0 };
144 init(opcode, 0, dst, src, 1);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
148 const fs_reg &src0, const fs_reg &src1)
149 {
150 const fs_reg src[2] = { src0, src1 };
151 init(opcode, exec_size, dst, src, 2);
152 }
153
154 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
155 const fs_reg &src1)
156 {
157 const fs_reg src[2] = { src0, src1 };
158 init(opcode, 0, dst, src, 2);
159 }
160
161 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
162 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
163 {
164 const fs_reg src[3] = { src0, src1, src2 };
165 init(opcode, exec_size, dst, src, 3);
166 }
167
168 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
169 const fs_reg &src1, const fs_reg &src2)
170 {
171 const fs_reg src[3] = { src0, src1, src2 };
172 init(opcode, 0, dst, src, 3);
173 }
174
175 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
176 const fs_reg src[], unsigned sources)
177 {
178 init(opcode, 0, dst, src, sources);
179 }
180
181 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
182 const fs_reg src[], unsigned sources)
183 {
184 init(opcode, exec_width, dst, src, sources);
185 }
186
187 fs_inst::fs_inst(const fs_inst &that)
188 {
189 memcpy(this, &that, sizeof(that));
190
191 this->src = new fs_reg[MAX2(that.sources, 3)];
192
193 for (unsigned i = 0; i < that.sources; i++)
194 this->src[i] = that.src[i];
195 }
196
197 fs_inst::~fs_inst()
198 {
199 delete[] this->src;
200 }
201
202 void
203 fs_inst::resize_sources(uint8_t num_sources)
204 {
205 if (this->sources != num_sources) {
206 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
207
208 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
209 src[i] = this->src[i];
210
211 delete[] this->src;
212 this->src = src;
213 this->sources = num_sources;
214 }
215 }
216
217 #define ALU1(op) \
218 fs_inst * \
219 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
220 { \
221 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
222 }
223
224 #define ALU2(op) \
225 fs_inst * \
226 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
227 const fs_reg &src1) \
228 { \
229 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
230 }
231
232 #define ALU2_ACC(op) \
233 fs_inst * \
234 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
235 const fs_reg &src1) \
236 { \
237 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
238 inst->writes_accumulator = true; \
239 return inst; \
240 }
241
242 #define ALU3(op) \
243 fs_inst * \
244 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
245 const fs_reg &src1, const fs_reg &src2) \
246 { \
247 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
248 }
249
250 ALU1(NOT)
251 ALU1(MOV)
252 ALU1(FRC)
253 ALU1(RNDD)
254 ALU1(RNDE)
255 ALU1(RNDZ)
256 ALU2(ADD)
257 ALU2(MUL)
258 ALU2_ACC(MACH)
259 ALU2(AND)
260 ALU2(OR)
261 ALU2(XOR)
262 ALU2(SHL)
263 ALU2(SHR)
264 ALU2(ASR)
265 ALU3(LRP)
266 ALU1(BFREV)
267 ALU3(BFE)
268 ALU2(BFI1)
269 ALU3(BFI2)
270 ALU1(FBH)
271 ALU1(FBL)
272 ALU1(CBIT)
273 ALU3(MAD)
274 ALU2_ACC(ADDC)
275 ALU2_ACC(SUBB)
276 ALU2(SEL)
277 ALU2(MAC)
278
279 /** Gen4 predicated IF. */
280 fs_inst *
281 fs_visitor::IF(enum brw_predicate predicate)
282 {
283 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
284 inst->predicate = predicate;
285 return inst;
286 }
287
288 /** Gen6 IF with embedded comparison. */
289 fs_inst *
290 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
291 enum brw_conditional_mod condition)
292 {
293 assert(devinfo->gen == 6);
294 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
295 reg_null_d, src0, src1);
296 inst->conditional_mod = condition;
297 return inst;
298 }
299
300 /**
301 * CMP: Sets the low bit of the destination channels with the result
302 * of the comparison, while the upper bits are undefined, and updates
303 * the flag register with the packed 16 bits of the result.
304 */
305 fs_inst *
306 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
307 enum brw_conditional_mod condition)
308 {
309 fs_inst *inst;
310
311 /* Take the instruction:
312 *
313 * CMP null<d> src0<f> src1<f>
314 *
315 * Original gen4 does type conversion to the destination type before
316 * comparison, producing garbage results for floating point comparisons.
317 *
318 * The destination type doesn't matter on newer generations, so we set the
319 * type to match src0 so we can compact the instruction.
320 */
321 dst.type = src0.type;
322 if (dst.file == HW_REG)
323 dst.fixed_hw_reg.type = dst.type;
324
325 resolve_ud_negate(&src0);
326 resolve_ud_negate(&src1);
327
328 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
329 inst->conditional_mod = condition;
330
331 return inst;
332 }
333
334 fs_inst *
335 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
336 int header_size)
337 {
338 assert(dst.width % 8 == 0);
339 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
340 dst, src, sources);
341 inst->header_size = header_size;
342
343 for (int i = 0; i < header_size; i++)
344 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
345 inst->regs_written = header_size;
346
347 for (int i = header_size; i < sources; ++i)
348 assert(src[i].file != GRF || src[i].width == dst.width);
349 inst->regs_written += (sources - header_size) * (dst.width / 8);
350
351 return inst;
352 }
353
354 void
355 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
356 const fs_reg &dst,
357 const fs_reg &surf_index,
358 const fs_reg &varying_offset,
359 uint32_t const_offset)
360 {
361 /* We have our constant surface use a pitch of 4 bytes, so our index can
362 * be any component of a vector, and then we load 4 contiguous
363 * components starting from that.
364 *
365 * We break down the const_offset to a portion added to the variable
366 * offset and a portion done using reg_offset, which means that if you
367 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
368 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
369 * CSE can later notice that those loads are all the same and eliminate
370 * the redundant ones.
371 */
372 fs_reg vec4_offset = vgrf(glsl_type::int_type);
373 bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
374
375 int scale = 1;
376 if (devinfo->gen == 4 && dst.width == 8) {
377 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
378 * u, v, r) as parameters, or we can just use the SIMD16 message
379 * consisting of (header, u). We choose the second, at the cost of a
380 * longer return length.
381 */
382 scale = 2;
383 }
384
385 enum opcode op;
386 if (devinfo->gen >= 7)
387 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
388 else
389 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
390
391 assert(dst.width % 8 == 0);
392 int regs_written = 4 * (dst.width / 8) * scale;
393 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
394 dst.type, dst.width);
395 fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
396 inst->regs_written = regs_written;
397
398 if (devinfo->gen < 7) {
399 inst->base_mrf = 13;
400 inst->header_size = 1;
401 if (devinfo->gen == 4)
402 inst->mlen = 3;
403 else
404 inst->mlen = 1 + dispatch_width / 8;
405 }
406
407 bld.MOV(dst, offset(vec4_result, (const_offset & 3) * scale));
408 }
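/* Worked example of the const_offset split above: with const_offset == 6 and
 * scale == 1, vec4_offset becomes varying_offset + 4, and the final MOV reads
 * component (6 & 3) == 2 of vec4_result, so accesses that differ only in the
 * low two bits of const_offset share one load that CSE can merge.
 */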
409
410 /**
411 * A helper for MOV generation for fixing up broken hardware SEND dependency
412 * handling.
413 */
414 void
415 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
416 {
417 /* The caller always wants uncompressed to emit the minimal extra
418 * dependencies, and to avoid having to deal with aligning its regs to 2.
419 */
420 const fs_builder ubld = bld.annotate("send dependency resolve")
421 .half(0);
422
423 ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
424 }
425
426 bool
427 fs_inst::equals(fs_inst *inst) const
428 {
429 return (opcode == inst->opcode &&
430 dst.equals(inst->dst) &&
431 src[0].equals(inst->src[0]) &&
432 src[1].equals(inst->src[1]) &&
433 src[2].equals(inst->src[2]) &&
434 saturate == inst->saturate &&
435 predicate == inst->predicate &&
436 conditional_mod == inst->conditional_mod &&
437 mlen == inst->mlen &&
438 base_mrf == inst->base_mrf &&
439 target == inst->target &&
440 eot == inst->eot &&
441 header_size == inst->header_size &&
442 shadow_compare == inst->shadow_compare &&
443 exec_size == inst->exec_size &&
444 offset == inst->offset);
445 }
446
447 bool
448 fs_inst::overwrites_reg(const fs_reg &reg) const
449 {
450 return reg.in_range(dst, regs_written);
451 }
452
453 bool
454 fs_inst::is_send_from_grf() const
455 {
456 switch (opcode) {
457 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
458 case SHADER_OPCODE_SHADER_TIME_ADD:
459 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
460 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
461 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
462 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
463 case SHADER_OPCODE_UNTYPED_ATOMIC:
464 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
465 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
466 case SHADER_OPCODE_TYPED_ATOMIC:
467 case SHADER_OPCODE_TYPED_SURFACE_READ:
468 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
469 case SHADER_OPCODE_URB_WRITE_SIMD8:
470 return true;
471 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
472 return src[1].file == GRF;
473 case FS_OPCODE_FB_WRITE:
474 return src[0].file == GRF;
475 default:
476 if (is_tex())
477 return src[0].file == GRF;
478
479 return false;
480 }
481 }
482
483 bool
484 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
485 {
486 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
487 return false;
488
489 fs_reg reg = this->src[0];
490 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
491 return false;
492
493 if (grf_alloc.sizes[reg.reg] != this->regs_written)
494 return false;
495
496 for (int i = 0; i < this->sources; i++) {
497 reg.type = this->src[i].type;
498 reg.width = this->src[i].width;
499 if (!this->src[i].equals(reg))
500 return false;
501 reg = ::offset(reg, 1);
502 }
503
504 return true;
505 }
506
507 bool
508 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
509 {
510 if (devinfo->gen == 6 && is_math())
511 return false;
512
513 if (is_send_from_grf())
514 return false;
515
516 if (!backend_instruction::can_do_source_mods())
517 return false;
518
519 return true;
520 }
521
522 bool
523 fs_inst::has_side_effects() const
524 {
525 return this->eot || backend_instruction::has_side_effects();
526 }
527
528 void
529 fs_reg::init()
530 {
531 memset(this, 0, sizeof(*this));
532 stride = 1;
533 }
534
535 /** Generic unset register constructor. */
536 fs_reg::fs_reg()
537 {
538 init();
539 this->file = BAD_FILE;
540 }
541
542 /** Immediate value constructor. */
543 fs_reg::fs_reg(float f)
544 {
545 init();
546 this->file = IMM;
547 this->type = BRW_REGISTER_TYPE_F;
548 this->fixed_hw_reg.dw1.f = f;
549 this->width = 1;
550 }
551
552 /** Immediate value constructor. */
553 fs_reg::fs_reg(int32_t i)
554 {
555 init();
556 this->file = IMM;
557 this->type = BRW_REGISTER_TYPE_D;
558 this->fixed_hw_reg.dw1.d = i;
559 this->width = 1;
560 }
561
562 /** Immediate value constructor. */
563 fs_reg::fs_reg(uint32_t u)
564 {
565 init();
566 this->file = IMM;
567 this->type = BRW_REGISTER_TYPE_UD;
568 this->fixed_hw_reg.dw1.ud = u;
569 this->width = 1;
570 }
571
572 /** Vector float immediate value constructor. */
573 fs_reg::fs_reg(uint8_t vf[4])
574 {
575 init();
576 this->file = IMM;
577 this->type = BRW_REGISTER_TYPE_VF;
578 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
588 (vf1 << 8) |
589 (vf2 << 16) |
590 (vf3 << 24);
591 }
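/* Example use of the component-wise VF constructor above, assuming the 8-bit
 * restricted-float encoding in which 0x00 is 0.0 and 0x30 is 1.0:
 *
 *    fs_reg unit_x(0x30, 0x00, 0x00, 0x00);   // (1.0, 0.0, 0.0, 0.0)
 */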
592
593 /** Fixed brw_reg. */
594 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
595 {
596 init();
597 this->file = HW_REG;
598 this->fixed_hw_reg = fixed_hw_reg;
599 this->type = fixed_hw_reg.type;
600 this->width = 1 << fixed_hw_reg.width;
601 }
602
603 bool
604 fs_reg::equals(const fs_reg &r) const
605 {
606 return (file == r.file &&
607 reg == r.reg &&
608 reg_offset == r.reg_offset &&
609 subreg_offset == r.subreg_offset &&
610 type == r.type &&
611 negate == r.negate &&
612 abs == r.abs &&
613 !reladdr && !r.reladdr &&
614 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
615 width == r.width &&
616 stride == r.stride);
617 }
618
619 fs_reg &
620 fs_reg::set_smear(unsigned subreg)
621 {
622 assert(file != HW_REG && file != IMM);
623 subreg_offset = subreg * type_sz(type);
624 stride = 0;
625 return *this;
626 }
627
628 bool
629 fs_reg::is_contiguous() const
630 {
631 return stride == 1;
632 }
633
634 int
635 fs_visitor::type_size(const struct glsl_type *type)
636 {
637 unsigned int size, i;
638
639 switch (type->base_type) {
640 case GLSL_TYPE_UINT:
641 case GLSL_TYPE_INT:
642 case GLSL_TYPE_FLOAT:
643 case GLSL_TYPE_BOOL:
644 return type->components();
645 case GLSL_TYPE_ARRAY:
646 return type_size(type->fields.array) * type->length;
647 case GLSL_TYPE_STRUCT:
648 size = 0;
649 for (i = 0; i < type->length; i++) {
650 size += type_size(type->fields.structure[i].type);
651 }
652 return size;
653 case GLSL_TYPE_SAMPLER:
654 /* Samplers take up no register space, since they're baked in at
655 * link time.
656 */
657 return 0;
658 case GLSL_TYPE_ATOMIC_UINT:
659 return 0;
660 case GLSL_TYPE_IMAGE:
661 case GLSL_TYPE_VOID:
662 case GLSL_TYPE_ERROR:
663 case GLSL_TYPE_INTERFACE:
664 case GLSL_TYPE_DOUBLE:
665 unreachable("not reached");
666 }
667
668 return 0;
669 }
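/* For reference, the sizes returned above are in scalar components: a float,
 * int or bool is 1, a vec4 is 4, a mat3 is 9, and "uniform vec4 a[20]" is
 * 20 * 4 = 80, while samplers and atomic counters contribute 0.
 */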
670
671 /**
672 * Create a MOV to read the timestamp register.
673 *
674  * The MOV is emitted through the given builder; the return value is
675  * its destination, with the extra parameters already set.
676 */
677 fs_reg
678 fs_visitor::get_timestamp(const fs_builder &bld)
679 {
680 assert(devinfo->gen >= 7);
681
682 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
683 BRW_ARF_TIMESTAMP,
684 0),
685 BRW_REGISTER_TYPE_UD));
686
687 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
688
689 /* We want to read the 3 fields we care about even if it's not enabled in
690 * the dispatch.
691 */
692 bld.exec_all().MOV(dst, ts);
693
694 /* The caller wants the low 32 bits of the timestamp. Since it's running
695  * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
696 * which is plenty of time for our purposes. It is identical across the
697 * EUs, but since it's tracking GPU core speed it will increment at a
698 * varying rate as render P-states change.
699 *
700 * The caller could also check if render P-states have changed (or anything
701 * else that might disrupt timing) by setting smear to 2 and checking if
702 * that field is != 0.
703 */
704 dst.set_smear(0);
705
706 return dst;
707 }
708
709 void
710 fs_visitor::emit_shader_time_begin()
711 {
712 shader_start_time = get_timestamp(bld.annotate("shader time start"));
713 }
714
715 void
716 fs_visitor::emit_shader_time_end()
717 {
718 enum shader_time_shader_type type, written_type, reset_type;
719 switch (stage) {
720 case MESA_SHADER_VERTEX:
721 type = ST_VS;
722 written_type = ST_VS_WRITTEN;
723 reset_type = ST_VS_RESET;
724 break;
725 case MESA_SHADER_GEOMETRY:
726 type = ST_GS;
727 written_type = ST_GS_WRITTEN;
728 reset_type = ST_GS_RESET;
729 break;
730 case MESA_SHADER_FRAGMENT:
731 if (dispatch_width == 8) {
732 type = ST_FS8;
733 written_type = ST_FS8_WRITTEN;
734 reset_type = ST_FS8_RESET;
735 } else {
736 assert(dispatch_width == 16);
737 type = ST_FS16;
738 written_type = ST_FS16_WRITTEN;
739 reset_type = ST_FS16_RESET;
740 }
741 break;
742 case MESA_SHADER_COMPUTE:
743 type = ST_CS;
744 written_type = ST_CS_WRITTEN;
745 reset_type = ST_CS_RESET;
746 break;
747 default:
748 unreachable("fs_visitor::emit_shader_time_end missing code");
749 }
750
751 /* Insert our code just before the final SEND with EOT. */
752 exec_node *end = this->instructions.get_tail();
753 assert(end && ((fs_inst *) end)->eot);
754 const fs_builder ibld = bld.annotate("shader time end")
755 .exec_all().at(NULL, end);
756
757 fs_reg shader_end_time = get_timestamp(ibld);
758
759 /* Check that there weren't any timestamp reset events (assuming these
760 * were the only two timestamp reads that happened).
761 */
762 fs_reg reset = shader_end_time;
763 reset.set_smear(2);
764 set_condmod(BRW_CONDITIONAL_Z,
765 ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
766 ibld.IF(BRW_PREDICATE_NORMAL);
767
768 fs_reg start = shader_start_time;
769 start.negate = true;
770 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
771 diff.set_smear(0);
772 ibld.ADD(diff, start, shader_end_time);
773
774 /* If there were no instructions between the two timestamp gets, the diff
775 * is 2 cycles. Remove that overhead, so I can forget about that when
776 * trying to determine the time taken for single instructions.
777 */
778 ibld.ADD(diff, diff, fs_reg(-2u));
779 SHADER_TIME_ADD(ibld, type, diff);
780 SHADER_TIME_ADD(ibld, written_type, fs_reg(1u));
781 ibld.emit(BRW_OPCODE_ELSE);
782 SHADER_TIME_ADD(ibld, reset_type, fs_reg(1u));
783 ibld.emit(BRW_OPCODE_ENDIF);
784 }
785
786 void
787 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
788 enum shader_time_shader_type type, fs_reg value)
789 {
790 int shader_time_index =
791 brw_get_shader_time_index(brw, shader_prog, prog, type);
792 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
793
794 fs_reg payload;
795 if (dispatch_width == 8)
796 payload = vgrf(glsl_type::uvec2_type);
797 else
798 payload = vgrf(glsl_type::uint_type);
799
800 bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
801 }
802
803 void
804 fs_visitor::vfail(const char *format, va_list va)
805 {
806 char *msg;
807
808 if (failed)
809 return;
810
811 failed = true;
812
813 msg = ralloc_vasprintf(mem_ctx, format, va);
814 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
815
816 this->fail_msg = msg;
817
818 if (debug_enabled) {
819 fprintf(stderr, "%s", msg);
820 }
821 }
822
823 void
824 fs_visitor::fail(const char *format, ...)
825 {
826 va_list va;
827
828 va_start(va, format);
829 vfail(format, va);
830 va_end(va);
831 }
832
833 /**
834 * Mark this program as impossible to compile in SIMD16 mode.
835 *
836 * During the SIMD8 compile (which happens first), we can detect and flag
837 * things that are unsupported in SIMD16 mode, so the compiler can skip
838 * the SIMD16 compile altogether.
839 *
840 * During a SIMD16 compile (if one happens anyway), this just calls fail().
841 */
842 void
843 fs_visitor::no16(const char *format, ...)
844 {
845 va_list va;
846
847 va_start(va, format);
848
849 if (dispatch_width == 16) {
850 vfail(format, va);
851 } else {
852 simd16_unsupported = true;
853
854 if (brw->perf_debug) {
855 if (no16_msg)
856 ralloc_vasprintf_append(&no16_msg, format, va);
857 else
858 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
859 }
860 }
861
862 va_end(va);
863 }
864
865 fs_inst *
866 fs_visitor::emit(enum opcode opcode)
867 {
868 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
869 }
870
871 fs_inst *
872 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
873 {
874 return emit(new(mem_ctx) fs_inst(opcode, dst));
875 }
876
877 fs_inst *
878 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
879 {
880 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
881 }
882
883 fs_inst *
884 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
885 const fs_reg &src1)
886 {
887 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
888 }
889
890 fs_inst *
891 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
892 const fs_reg &src1, const fs_reg &src2)
893 {
894 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
895 }
896
897 fs_inst *
898 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
899 fs_reg src[], int sources)
900 {
901 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
902 }
903
904 /**
905 * Returns true if the instruction has a flag that means it won't
906 * update an entire destination register.
907 *
908 * For example, dead code elimination and live variable analysis want to know
909 * when a write to a variable screens off any preceding values that were in
910 * it.
911 */
912 bool
913 fs_inst::is_partial_write() const
914 {
915 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
916 (this->dst.width * type_sz(this->dst.type)) < 32 ||
917 !this->dst.is_contiguous());
918 }
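/* For example, a predicated write (other than SEL), an 8-wide write of a
 * 16-bit type (8 * 2 = 16 bytes, less than a full 32-byte register), or a
 * strided (non-contiguous) destination all count as partial writes here.
 */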
919
920 int
921 fs_inst::regs_read(int arg) const
922 {
923 if (is_tex() && arg == 0 && src[0].file == GRF) {
924 return mlen;
925 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
926 return mlen;
927 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
928 return mlen;
929 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
930 return mlen;
931 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
932 return mlen;
933 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
934 return mlen;
935 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
936 return mlen;
937 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
938 return mlen;
939 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
940 return mlen;
941 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
942 return mlen;
943 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
944 return exec_size / 4;
945 }
946
947 switch (src[arg].file) {
948 case BAD_FILE:
949 case UNIFORM:
950 case IMM:
951 return 1;
952 case GRF:
953 case HW_REG:
954 if (src[arg].stride == 0) {
955 return 1;
956 } else {
957 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
958 return (size + 31) / 32;
959 }
960 case MRF:
961 unreachable("MRF registers are not allowed as sources");
962 default:
963 unreachable("Invalid register file");
964 }
965 }
966
967 bool
968 fs_inst::reads_flag() const
969 {
970 return predicate;
971 }
972
973 bool
974 fs_inst::writes_flag() const
975 {
976 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
977 opcode != BRW_OPCODE_IF &&
978 opcode != BRW_OPCODE_WHILE)) ||
979 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
980 }
981
982 /**
983 * Returns how many MRFs an FS opcode will write over.
984 *
985 * Note that this is not the 0 or 1 implied writes in an actual gen
986 * instruction -- the FS opcodes often generate MOVs in addition.
987 */
988 int
989 fs_visitor::implied_mrf_writes(fs_inst *inst)
990 {
991 if (inst->mlen == 0)
992 return 0;
993
994 if (inst->base_mrf == -1)
995 return 0;
996
997 switch (inst->opcode) {
998 case SHADER_OPCODE_RCP:
999 case SHADER_OPCODE_RSQ:
1000 case SHADER_OPCODE_SQRT:
1001 case SHADER_OPCODE_EXP2:
1002 case SHADER_OPCODE_LOG2:
1003 case SHADER_OPCODE_SIN:
1004 case SHADER_OPCODE_COS:
1005 return 1 * dispatch_width / 8;
1006 case SHADER_OPCODE_POW:
1007 case SHADER_OPCODE_INT_QUOTIENT:
1008 case SHADER_OPCODE_INT_REMAINDER:
1009 return 2 * dispatch_width / 8;
1010 case SHADER_OPCODE_TEX:
1011 case FS_OPCODE_TXB:
1012 case SHADER_OPCODE_TXD:
1013 case SHADER_OPCODE_TXF:
1014 case SHADER_OPCODE_TXF_CMS:
1015 case SHADER_OPCODE_TXF_MCS:
1016 case SHADER_OPCODE_TG4:
1017 case SHADER_OPCODE_TG4_OFFSET:
1018 case SHADER_OPCODE_TXL:
1019 case SHADER_OPCODE_TXS:
1020 case SHADER_OPCODE_LOD:
1021 return 1;
1022 case FS_OPCODE_FB_WRITE:
1023 return 2;
1024 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1025 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1026 return 1;
1027 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1028 return inst->mlen;
1029 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1030 return inst->mlen;
1031 case SHADER_OPCODE_UNTYPED_ATOMIC:
1032 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1033 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1034 case SHADER_OPCODE_TYPED_ATOMIC:
1035 case SHADER_OPCODE_TYPED_SURFACE_READ:
1036 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1037 case SHADER_OPCODE_URB_WRITE_SIMD8:
1038 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1039 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1040 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1041 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1042 return 0;
1043 default:
1044 unreachable("not reached");
1045 }
1046 }
1047
1048 fs_reg
1049 fs_visitor::vgrf(const glsl_type *const type)
1050 {
1051 int reg_width = dispatch_width / 8;
1052 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1053 brw_type_for_base_type(type), dispatch_width);
1054 }
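/* e.g. vgrf(glsl_type::vec4_type) at dispatch_width == 16 allocates
 * type_size(vec4) * 2 == 8 consecutive virtual GRFs, typed F and 16 wide.
 */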
1055
1056 fs_reg
1057 fs_visitor::vgrf(int num_components)
1058 {
1059 int reg_width = dispatch_width / 8;
1060 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1061 BRW_REGISTER_TYPE_F, dispatch_width);
1062 }
1063
1064 /** Fixed HW reg constructor. */
1065 fs_reg::fs_reg(enum register_file file, int reg)
1066 {
1067 init();
1068 this->file = file;
1069 this->reg = reg;
1070 this->type = BRW_REGISTER_TYPE_F;
1071
1072 switch (file) {
1073 case UNIFORM:
1074 this->width = 1;
1075 break;
1076 default:
1077 this->width = 8;
1078 }
1079 }
1080
1081 /** Fixed HW reg constructor. */
1082 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1083 {
1084 init();
1085 this->file = file;
1086 this->reg = reg;
1087 this->type = type;
1088
1089 switch (file) {
1090 case UNIFORM:
1091 this->width = 1;
1092 break;
1093 default:
1094 this->width = 8;
1095 }
1096 }
1097
1098 /** Fixed HW reg constructor. */
1099 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1100 uint8_t width)
1101 {
1102 init();
1103 this->file = file;
1104 this->reg = reg;
1105 this->type = type;
1106 this->width = width;
1107 }
1108
1109 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1110  * This brings in those uniform definitions.
1111 */
1112 void
1113 fs_visitor::import_uniforms(fs_visitor *v)
1114 {
1115 this->push_constant_loc = v->push_constant_loc;
1116 this->pull_constant_loc = v->pull_constant_loc;
1117 this->uniforms = v->uniforms;
1118 this->param_size = v->param_size;
1119 }
1120
1121 fs_reg *
1122 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1123 bool origin_upper_left)
1124 {
1125 assert(stage == MESA_SHADER_FRAGMENT);
1126 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1127 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1128 fs_reg wpos = *reg;
1129 bool flip = !origin_upper_left ^ key->render_to_fbo;
1130
1131 /* gl_FragCoord.x */
1132 if (pixel_center_integer) {
1133 bld.MOV(wpos, this->pixel_x);
1134 } else {
1135 bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
1136 }
1137 wpos = offset(wpos, 1);
1138
1139 /* gl_FragCoord.y */
1140 if (!flip && pixel_center_integer) {
1141 bld.MOV(wpos, this->pixel_y);
1142 } else {
1143 fs_reg pixel_y = this->pixel_y;
1144 float offset = (pixel_center_integer ? 0.0 : 0.5);
1145
1146 if (flip) {
1147 pixel_y.negate = true;
1148 offset += key->drawable_height - 1.0;
1149 }
1150
1151 bld.ADD(wpos, pixel_y, fs_reg(offset));
1152 }
1153 wpos = offset(wpos, 1);
1154
1155 /* gl_FragCoord.z */
1156 if (devinfo->gen >= 6) {
1157 bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
1158 } else {
1159 bld.emit(FS_OPCODE_LINTERP, wpos,
1160 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1161 interp_reg(VARYING_SLOT_POS, 2));
1162 }
1163 wpos = offset(wpos, 1);
1164
1165 /* gl_FragCoord.w: Already set up in emit_interpolation */
1166 bld.MOV(wpos, this->wpos_w);
1167
1168 return reg;
1169 }
1170
1171 fs_inst *
1172 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1173 glsl_interp_qualifier interpolation_mode,
1174 bool is_centroid, bool is_sample)
1175 {
1176 brw_wm_barycentric_interp_mode barycoord_mode;
1177 if (devinfo->gen >= 6) {
1178 if (is_centroid) {
1179 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1180 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1181 else
1182 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1183 } else if (is_sample) {
1184 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1185 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1186 else
1187 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1188 } else {
1189 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1190 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1191 else
1192 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1193 }
1194 } else {
1195 /* On Ironlake and below, there is only one interpolation mode.
1196 * Centroid interpolation doesn't mean anything on this hardware --
1197 * there is no multisampling.
1198 */
1199 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1200 }
1201 return bld.emit(FS_OPCODE_LINTERP, attr,
1202 this->delta_xy[barycoord_mode], interp);
1203 }
1204
1205 void
1206 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1207 const glsl_type *type,
1208 glsl_interp_qualifier interpolation_mode,
1209 int location, bool mod_centroid,
1210 bool mod_sample)
1211 {
1212 attr.type = brw_type_for_base_type(type->get_scalar_type());
1213
1214 assert(stage == MESA_SHADER_FRAGMENT);
1215 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1216 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1217
1218 unsigned int array_elements;
1219
1220 if (type->is_array()) {
1221 array_elements = type->length;
1222 if (array_elements == 0) {
1223 fail("dereferenced array '%s' has length 0\n", name);
1224 }
1225 type = type->fields.array;
1226 } else {
1227 array_elements = 1;
1228 }
1229
1230 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1231 bool is_gl_Color =
1232 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1233 if (key->flat_shade && is_gl_Color) {
1234 interpolation_mode = INTERP_QUALIFIER_FLAT;
1235 } else {
1236 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1237 }
1238 }
1239
1240 for (unsigned int i = 0; i < array_elements; i++) {
1241 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1242 if (prog_data->urb_setup[location] == -1) {
1243 /* If there's no incoming setup data for this slot, don't
1244 * emit interpolation for it.
1245 */
1246 attr = offset(attr, type->vector_elements);
1247 location++;
1248 continue;
1249 }
1250
1251 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1252 /* Constant interpolation (flat shading) case. The SF has
1253 * handed us defined values in only the constant offset
1254 * field of the setup reg.
1255 */
1256 for (unsigned int k = 0; k < type->vector_elements; k++) {
1257 struct brw_reg interp = interp_reg(location, k);
1258 interp = suboffset(interp, 3);
1259 interp.type = attr.type;
1260 bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1261 attr = offset(attr, 1);
1262 }
1263 } else {
1264 /* Smooth/noperspective interpolation case. */
1265 for (unsigned int k = 0; k < type->vector_elements; k++) {
1266 struct brw_reg interp = interp_reg(location, k);
1267 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1268 /* Get the pixel/sample mask into f0 so that we know
1269 * which pixels are lit. Then, for each channel that is
1270 * unlit, replace the centroid data with non-centroid
1271 * data.
1272 */
1273 bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1274
1275 fs_inst *inst;
1276 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1277 false, false);
1278 inst->predicate = BRW_PREDICATE_NORMAL;
1279 inst->predicate_inverse = true;
1280 if (devinfo->has_pln)
1281 inst->no_dd_clear = true;
1282
1283 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1284 mod_centroid && !key->persample_shading,
1285 mod_sample || key->persample_shading);
1286 inst->predicate = BRW_PREDICATE_NORMAL;
1287 inst->predicate_inverse = false;
1288 if (devinfo->has_pln)
1289 inst->no_dd_check = true;
1290
1291 } else {
1292 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1293 mod_centroid && !key->persample_shading,
1294 mod_sample || key->persample_shading);
1295 }
1296 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1297 bld.MUL(attr, attr, this->pixel_w);
1298 }
1299 attr = offset(attr, 1);
1300 }
1301
1302 }
1303 location++;
1304 }
1305 }
1306 }
1307
1308 fs_reg *
1309 fs_visitor::emit_frontfacing_interpolation()
1310 {
1311 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1312
1313 if (devinfo->gen >= 6) {
1314 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1315 * a boolean result from this (~0/true or 0/false).
1316 *
1317 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1318 * this task in only one instruction:
1319 * - a negation source modifier will flip the bit; and
1320 * - a W -> D type conversion will sign extend the bit into the high
1321 * word of the destination.
1322 *
1323 * An ASR 15 fills the low word of the destination.
1324 */
1325 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1326 g0.negate = true;
1327
1328 bld.ASR(*reg, g0, fs_reg(15));
1329 } else {
1330 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1331 * a boolean result from this (1/true or 0/false).
1332 *
1333 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1334 * the negation source modifier to flip it. Unfortunately the SHR
1335 * instruction only operates on UD (or D with an abs source modifier)
1336 * sources without negation.
1337 *
1338 * Instead, use ASR (which will give ~0/true or 0/false).
1339 */
1340 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1341 g1_6.negate = true;
1342
1343 bld.ASR(*reg, g1_6, fs_reg(31));
1344 }
1345
1346 return reg;
1347 }
1348
1349 void
1350 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1351 {
1352 assert(stage == MESA_SHADER_FRAGMENT);
1353 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1354 assert(dst.type == BRW_REGISTER_TYPE_F);
1355
1356 if (key->compute_pos_offset) {
1357 /* Convert int_sample_pos to floating point */
1358 bld.MOV(dst, int_sample_pos);
1359 /* Scale to the range [0, 1] */
1360 bld.MUL(dst, dst, fs_reg(1 / 16.0f));
1361 }
1362 else {
1363 /* From ARB_sample_shading specification:
1364 * "When rendering to a non-multisample buffer, or if multisample
1365 * rasterization is disabled, gl_SamplePosition will always be
1366  * (0.5, 0.5)."
1367 */
1368 bld.MOV(dst, fs_reg(0.5f));
1369 }
1370 }
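/* Numerically, the MUL by 1/16 above maps an integer payload position of 8
 * to 0.5, i.e. the pixel centre, and payload values 0..15 into the [0, 1)
 * range expected for gl_SamplePosition.
 */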
1371
1372 fs_reg *
1373 fs_visitor::emit_samplepos_setup()
1374 {
1375 assert(devinfo->gen >= 6);
1376
1377 const fs_builder abld = bld.annotate("compute sample position");
1378 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1379 fs_reg pos = *reg;
1380 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1381 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1382
1383 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1384 * mode will be enabled.
1385 *
1386 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1387 * R31.1:0 Position Offset X/Y for Slot[3:0]
1388 * R31.3:2 Position Offset X/Y for Slot[7:4]
1389 * .....
1390 *
1391 * The X, Y sample positions come in as bytes in thread payload. So, read
1392 * the positions using vstride=16, width=8, hstride=2.
1393 */
1394 struct brw_reg sample_pos_reg =
1395 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1396 BRW_REGISTER_TYPE_B), 16, 8, 2);
1397
1398 if (dispatch_width == 8) {
1399 abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
1400 } else {
1401 abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
1402 abld.half(1).MOV(half(int_sample_x, 1),
1403 fs_reg(suboffset(sample_pos_reg, 16)));
1404 }
1405 /* Compute gl_SamplePosition.x */
1406 compute_sample_position(pos, int_sample_x);
1407 pos = offset(pos, 1);
1408 if (dispatch_width == 8) {
1409 abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
1410 } else {
1411 abld.half(0).MOV(half(int_sample_y, 0),
1412 fs_reg(suboffset(sample_pos_reg, 1)));
1413 abld.half(1).MOV(half(int_sample_y, 1),
1414 fs_reg(suboffset(sample_pos_reg, 17)));
1415 }
1416 /* Compute gl_SamplePosition.y */
1417 compute_sample_position(pos, int_sample_y);
1418 return reg;
1419 }
1420
1421 fs_reg *
1422 fs_visitor::emit_sampleid_setup()
1423 {
1424 assert(stage == MESA_SHADER_FRAGMENT);
1425 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1426 assert(devinfo->gen >= 6);
1427
1428 const fs_builder abld = bld.annotate("compute sample id");
1429 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1430
1431 if (key->compute_sample_id) {
1432 fs_reg t1 = vgrf(glsl_type::int_type);
1433 fs_reg t2 = vgrf(glsl_type::int_type);
1434 t2.type = BRW_REGISTER_TYPE_UW;
1435
1436 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1437 * 8x multisampling, subspan 0 will represent sample N (where N
1438 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1439 * 7. We can find the value of N by looking at R0.0 bits 7:6
1440 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1441 * (since samples are always delivered in pairs). That is, we
1442 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1443 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1444 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1445 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1446 * populating a temporary variable with the sequence (0, 1, 2, 3),
1447 * and then reading from it using vstride=1, width=4, hstride=0.
1448 * These computations hold good for 4x multisampling as well.
1449  * These computations hold for 4x multisampling as well.
1450 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1451 * the first four slots are sample 0 of subspan 0; the next four
1452 * are sample 1 of subspan 0; the third group is sample 0 of
1453 * subspan 1, and finally sample 1 of subspan 1.
1454 */
1455 abld.exec_all()
1456 .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1457 fs_reg(0xc0));
1458 abld.exec_all().SHR(t1, t1, fs_reg(5));
1459
1460 /* This works for both SIMD8 and SIMD16 */
1461 abld.exec_all()
1462 .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
1463
1464 /* This special instruction takes care of setting vstride=1,
1465 * width=4, hstride=0 of t2 during an ADD instruction.
1466 */
1467 abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1468 } else {
1469 /* As per GL_ARB_sample_shading specification:
1470 * "When rendering to a non-multisample buffer, or if multisample
1471 * rasterization is disabled, gl_SampleID will always be zero."
1472 */
1473 abld.MOV(*reg, fs_reg(0));
1474 }
1475
1476 return reg;
1477 }
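/* Worked example of the computation above: if R0.0 bits 7:6 contain 2, then
 * (R0.0 & 0xc0) >> 5 == 4 == 2 * SSPI, and adding the SIMD8 sequence
 * (0, 0, 0, 0, 1, 1, 1, 1) yields sample IDs 4, 4, 4, 4, 5, 5, 5, 5 for the
 * two subspans.
 */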
1478
1479 void
1480 fs_visitor::resolve_source_modifiers(fs_reg *src)
1481 {
1482 if (!src->abs && !src->negate)
1483 return;
1484
1485 fs_reg temp = retype(vgrf(1), src->type);
1486 emit(MOV(temp, *src));
1487 *src = temp;
1488 }
1489
1490 fs_reg
1491 fs_visitor::fix_math_operand(fs_reg src)
1492 {
1493 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1494 * might be able to do better by doing execsize = 1 math and then
1495 * expanding that result out, but we would need to be careful with
1496 * masking.
1497 *
1498 * The hardware ignores source modifiers (negate and abs) on math
1499 * instructions, so we also move to a temp to set those up.
1500 */
1501 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1502 !src.abs && !src.negate)
1503 return src;
1504
1505 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1506  * operands to math instructions.
1507 */
1508 if (devinfo->gen >= 7 && src.file != IMM)
1509 return src;
1510
1511 fs_reg expanded = vgrf(glsl_type::float_type);
1512 expanded.type = src.type;
1513 emit(BRW_OPCODE_MOV, expanded, src);
1514 return expanded;
1515 }
1516
1517 fs_inst *
1518 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1519 {
1520 switch (opcode) {
1521 case SHADER_OPCODE_RCP:
1522 case SHADER_OPCODE_RSQ:
1523 case SHADER_OPCODE_SQRT:
1524 case SHADER_OPCODE_EXP2:
1525 case SHADER_OPCODE_LOG2:
1526 case SHADER_OPCODE_SIN:
1527 case SHADER_OPCODE_COS:
1528 break;
1529 default:
1530 unreachable("not reached: bad math opcode");
1531 }
1532
1533 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1534 * might be able to do better by doing execsize = 1 math and then
1535 * expanding that result out, but we would need to be careful with
1536 * masking.
1537 *
1538 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1539 * instructions, so we also move to a temp to set those up.
1540 */
1541 if (devinfo->gen == 6 || devinfo->gen == 7)
1542 src = fix_math_operand(src);
1543
1544 fs_inst *inst = emit(opcode, dst, src);
1545
1546 if (devinfo->gen < 6) {
1547 inst->base_mrf = 2;
1548 inst->mlen = dispatch_width / 8;
1549 }
1550
1551 return inst;
1552 }
1553
1554 fs_inst *
1555 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1556 {
1557 int base_mrf = 2;
1558 fs_inst *inst;
1559
1560 if (devinfo->gen >= 8) {
1561 inst = emit(opcode, dst, src0, src1);
1562 } else if (devinfo->gen >= 6) {
1563 src0 = fix_math_operand(src0);
1564 src1 = fix_math_operand(src1);
1565
1566 inst = emit(opcode, dst, src0, src1);
1567 } else {
1568 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1569 * "Message Payload":
1570 *
1571 * "Operand0[7]. For the INT DIV functions, this operand is the
1572 * denominator."
1573 * ...
1574 * "Operand1[7]. For the INT DIV functions, this operand is the
1575 * numerator."
1576 */
1577 bool is_int_div = opcode != SHADER_OPCODE_POW;
1578 fs_reg &op0 = is_int_div ? src1 : src0;
1579 fs_reg &op1 = is_int_div ? src0 : src1;
1580
1581 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1582 inst = emit(opcode, dst, op0, reg_null_f);
1583
1584 inst->base_mrf = base_mrf;
1585 inst->mlen = 2 * dispatch_width / 8;
1586 }
1587 return inst;
1588 }
1589
1590 void
1591 fs_visitor::emit_discard_jump()
1592 {
1593 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1594
1595 /* For performance, after a discard, jump to the end of the
1596 * shader if all relevant channels have been discarded.
1597 */
1598 fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
1599 discard_jump->flag_subreg = 1;
1600
1601 discard_jump->predicate = (dispatch_width == 8)
1602 ? BRW_PREDICATE_ALIGN1_ANY8H
1603 : BRW_PREDICATE_ALIGN1_ANY16H;
1604 discard_jump->predicate_inverse = true;
1605 }
1606
1607 void
1608 fs_visitor::assign_curb_setup()
1609 {
1610 if (dispatch_width == 8) {
1611 prog_data->dispatch_grf_start_reg = payload.num_regs;
1612 } else {
1613 if (stage == MESA_SHADER_FRAGMENT) {
1614 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1615 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1616 } else if (stage == MESA_SHADER_COMPUTE) {
1617 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1618 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1619 } else {
1620 unreachable("Unsupported shader type!");
1621 }
1622 }
1623
1624 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1625
1626 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1627 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1628 for (unsigned int i = 0; i < inst->sources; i++) {
1629 if (inst->src[i].file == UNIFORM) {
1630 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1631 int constant_nr;
1632 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1633 constant_nr = push_constant_loc[uniform_nr];
1634 } else {
1635 /* Section 5.11 of the OpenGL 4.1 spec says:
1636 * "Out-of-bounds reads return undefined values, which include
1637 * values from other variables of the active program or zero."
1638 * Just return the first push constant.
1639 */
1640 constant_nr = 0;
1641 }
1642
1643 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1644 constant_nr / 8,
1645 constant_nr % 8);
1646
1647 inst->src[i].file = HW_REG;
1648 inst->src[i].fixed_hw_reg = byte_offset(
1649 retype(brw_reg, inst->src[i].type),
1650 inst->src[i].subreg_offset);
1651 }
1652 }
1653 }
1654 }
1655
1656 void
1657 fs_visitor::calculate_urb_setup()
1658 {
1659 assert(stage == MESA_SHADER_FRAGMENT);
1660 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1661 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1662
1663 memset(prog_data->urb_setup, -1,
1664 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1665
1666 int urb_next = 0;
1667 /* Figure out where each of the incoming setup attributes lands. */
1668 if (devinfo->gen >= 6) {
1669 if (_mesa_bitcount_64(prog->InputsRead &
1670 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1671 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1672 * first 16 varying inputs, so we can put them wherever we want.
1673 * Just put them in order.
1674 *
1675 * This is useful because it means that (a) inputs not used by the
1676 * fragment shader won't take up valuable register space, and (b) we
1677 * won't have to recompile the fragment shader if it gets paired with
1678 * a different vertex (or geometry) shader.
1679 */
1680 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1681 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1682 BITFIELD64_BIT(i)) {
1683 prog_data->urb_setup[i] = urb_next++;
1684 }
1685 }
1686 } else {
1687 /* We have enough input varyings that the SF/SBE pipeline stage can't
1688 * arbitrarily rearrange them to suit our whim; we have to put them
1689 * in an order that matches the output of the previous pipeline stage
1690 * (geometry or vertex shader).
1691 */
1692 struct brw_vue_map prev_stage_vue_map;
1693 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1694 key->input_slots_valid);
1695 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1696 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1697 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1698 slot++) {
1699 int varying = prev_stage_vue_map.slot_to_varying[slot];
1700 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1701 * unused.
1702 */
1703 if (varying != BRW_VARYING_SLOT_COUNT &&
1704 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1705 BITFIELD64_BIT(varying))) {
1706 prog_data->urb_setup[varying] = slot - first_slot;
1707 }
1708 }
1709 urb_next = prev_stage_vue_map.num_slots - first_slot;
1710 }
1711 } else {
1712 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1713 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1714 /* Point size is packed into the header, not as a general attribute */
1715 if (i == VARYING_SLOT_PSIZ)
1716 continue;
1717
1718 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1719 /* The back color slot is skipped when the front color is
1720 * also written to. In addition, some slots can be
1721 * written in the vertex shader and not read in the
1722 * fragment shader. So the register number must always be
1723 * incremented, mapped or not.
1724 */
1725 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1726 prog_data->urb_setup[i] = urb_next;
1727 urb_next++;
1728 }
1729 }
1730
1731 /*
1732  * It's an FS-only attribute, and we did interpolation for this attribute
1733  * in the SF thread. So, count it here, too.
1734 *
1735 * See compile_sf_prog() for more info.
1736 */
1737 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1738 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1739 }
1740
1741 prog_data->num_varying_inputs = urb_next;
1742 }
1743
1744 void
1745 fs_visitor::assign_urb_setup()
1746 {
1747 assert(stage == MESA_SHADER_FRAGMENT);
1748 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1749
1750 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1751
1752  /* Offset all the urb_setup[] indices by the actual position of the
1753 * setup regs, now that the location of the constants has been chosen.
1754 */
1755 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1756 if (inst->opcode == FS_OPCODE_LINTERP) {
1757 assert(inst->src[1].file == HW_REG);
1758 inst->src[1].fixed_hw_reg.nr += urb_start;
1759 }
1760
1761 if (inst->opcode == FS_OPCODE_CINTERP) {
1762 assert(inst->src[0].file == HW_REG);
1763 inst->src[0].fixed_hw_reg.nr += urb_start;
1764 }
1765 }
1766
1767 /* Each attribute is 4 setup channels, each of which is half a reg. */
1768 this->first_non_payload_grf =
1769 urb_start + prog_data->num_varying_inputs * 2;
1770 }
1771
1772 void
1773 fs_visitor::assign_vs_urb_setup()
1774 {
1775 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1776 int grf, count, slot, channel, attr;
1777
1778 assert(stage == MESA_SHADER_VERTEX);
1779 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1780 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1781 count++;
1782
1783 /* Each attribute is 4 regs. */
1784 this->first_non_payload_grf =
1785 payload.num_regs + prog_data->curb_read_length + count * 4;
1786
1787 unsigned vue_entries =
1788 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1789
1790 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1791 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1792
1793 assert(vs_prog_data->base.urb_read_length <= 15);
1794
1795 /* Rewrite all ATTR file references to the hw grf that they land in. */
1796 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1797 for (int i = 0; i < inst->sources; i++) {
1798 if (inst->src[i].file == ATTR) {
1799
1800 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1801 slot = count - 1;
1802 } else {
1803  /* Attributes arrive in a contiguous block, ordered by their
1804 * gl_vert_attrib value. That means we can compute the slot
1805 * number for an attribute by masking out the enabled
1806 * attributes before it and counting the bits.
1807 */
1808 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1809 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1810 BITFIELD64_MASK(attr));
1811 }
1812
1813 channel = inst->src[i].reg_offset & 3;
1814
1815 grf = payload.num_regs +
1816 prog_data->curb_read_length +
1817 slot * 4 + channel;
1818
1819 inst->src[i].file = HW_REG;
1820 inst->src[i].fixed_hw_reg =
1821 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1822 }
1823 }
1824 }
1825 }
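/* Worked example of the slot computation above: with inputs_read == 0x7
 * (attributes 0, 1 and 2 enabled), an ATTR source for attribute 2 gets
 * slot == _mesa_bitcount_64(0x7 & BITFIELD64_MASK(2)) == 2, and so reads
 * from GRF payload.num_regs + curb_read_length + 2 * 4 + channel.
 */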
1826
1827 /**
1828 * Split large virtual GRFs into separate components if we can.
1829 *
1830 * This is mostly duplicated with what brw_fs_vector_splitting does,
1831 * but that's really conservative because it's afraid of doing
1832 * splitting that doesn't result in real progress after the rest of
1833 * the optimization phases, which would cause infinite looping in
1834 * optimization. We can do it once here, safely. This also has the
1835 * opportunity to split interpolated values, or maybe even uniforms,
1836 * which we don't have at the IR level.
1837 *
1838 * We want to split, because virtual GRFs are what we register
1839 * allocate and spill (due to contiguousness requirements for some
1840 * instructions), and they're what we naturally generate in the
1841 * codegen process, but most virtual GRFs don't actually need to be
1842 * contiguous sets of GRFs. If we split, we'll end up with reduced
1843 * live intervals and better dead code elimination and coalescing.
1844 */
1845 void
1846 fs_visitor::split_virtual_grfs()
1847 {
1848 int num_vars = this->alloc.count;
1849
1850 /* Count the total number of registers */
1851 int reg_count = 0;
1852 int vgrf_to_reg[num_vars];
1853 for (int i = 0; i < num_vars; i++) {
1854 vgrf_to_reg[i] = reg_count;
1855 reg_count += alloc.sizes[i];
1856 }
1857
1858 /* An array of "split points". For each register slot, this indicates
1859 * if this slot can be separated from the previous slot. Every time an
1860 * instruction uses multiple elements of a register (as a source or
1861 * destination), we mark the used slots as inseparable. Then we go
1862 * through and split the registers into the smallest pieces we can.
1863 */
1864 bool split_points[reg_count];
1865 memset(split_points, 0, sizeof(split_points));
1866
1867 /* Mark all used registers as fully splittable */
1868 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1869 if (inst->dst.file == GRF) {
1870 int reg = vgrf_to_reg[inst->dst.reg];
1871 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1872 split_points[reg + j] = true;
1873 }
1874
1875 for (int i = 0; i < inst->sources; i++) {
1876 if (inst->src[i].file == GRF) {
1877 int reg = vgrf_to_reg[inst->src[i].reg];
1878 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1879 split_points[reg + j] = true;
1880 }
1881 }
1882 }
1883
1884 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1885 if (inst->dst.file == GRF) {
1886 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1887 for (int j = 1; j < inst->regs_written; j++)
1888 split_points[reg + j] = false;
1889 }
1890 for (int i = 0; i < inst->sources; i++) {
1891 if (inst->src[i].file == GRF) {
1892 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1893 for (int j = 1; j < inst->regs_read(i); j++)
1894 split_points[reg + j] = false;
1895 }
1896 }
1897 }
1898
1899 int new_virtual_grf[reg_count];
1900 int new_reg_offset[reg_count];
1901
1902 int reg = 0;
1903 for (int i = 0; i < num_vars; i++) {
1904 /* The first one should always be 0 as a quick sanity check. */
1905 assert(split_points[reg] == false);
1906
1907 /* j = 0 case */
1908 new_reg_offset[reg] = 0;
1909 reg++;
1910 int offset = 1;
1911
1912 /* j > 0 case */
1913 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1914 /* If this is a split point, reset the offset to 0 and allocate a
1915 * new virtual GRF for the previous offset many registers
1916 */
1917 if (split_points[reg]) {
1918 assert(offset <= MAX_VGRF_SIZE);
1919 int grf = alloc.allocate(offset);
1920 for (int k = reg - offset; k < reg; k++)
1921 new_virtual_grf[k] = grf;
1922 offset = 0;
1923 }
1924 new_reg_offset[reg] = offset;
1925 offset++;
1926 reg++;
1927 }
1928
1929 /* The last one gets the original register number */
1930 assert(offset <= MAX_VGRF_SIZE);
1931 alloc.sizes[i] = offset;
1932 for (int k = reg - offset; k < reg; k++)
1933 new_virtual_grf[k] = i;
1934 }
1935 assert(reg == reg_count);
1936
1937 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1938 if (inst->dst.file == GRF) {
1939 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1940 inst->dst.reg = new_virtual_grf[reg];
1941 inst->dst.reg_offset = new_reg_offset[reg];
1942 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1943 }
1944 for (int i = 0; i < inst->sources; i++) {
1945 if (inst->src[i].file == GRF) {
1946 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1947 inst->src[i].reg = new_virtual_grf[reg];
1948 inst->src[i].reg_offset = new_reg_offset[reg];
1949 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1950 }
1951 }
1952 }
1953 invalidate_live_intervals();
1954 }
1955
1956 /**
1957 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1958 *
1959 * During code generation, we create tons of temporary variables, many of
1960 * which get immediately killed and are never used again. Yet, in later
1961 * optimization and analysis passes, such as compute_live_intervals, we need
1962 * to loop over all the virtual GRFs. Compacting them can save a lot of
1963 * overhead.
1964 */
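/* Worked example (made-up numbers): with alloc.count == 4 and VGRF 2 never
 * referenced, the loop below builds remap_table == { 0, 1, -1, 2 }, copies
 * alloc.sizes[3] down into slot 2, and shrinks alloc.count to 3.
 */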
1965 bool
1966 fs_visitor::compact_virtual_grfs()
1967 {
1968 bool progress = false;
1969 int remap_table[this->alloc.count];
1970 memset(remap_table, -1, sizeof(remap_table));
1971
1972 /* Mark which virtual GRFs are used. */
1973 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1974 if (inst->dst.file == GRF)
1975 remap_table[inst->dst.reg] = 0;
1976
1977 for (int i = 0; i < inst->sources; i++) {
1978 if (inst->src[i].file == GRF)
1979 remap_table[inst->src[i].reg] = 0;
1980 }
1981 }
1982
1983 /* Compact the GRF arrays. */
1984 int new_index = 0;
1985 for (unsigned i = 0; i < this->alloc.count; i++) {
1986 if (remap_table[i] == -1) {
1987 /* We just found an unused register. This means that we are
1988 * actually going to compact something.
1989 */
1990 progress = true;
1991 } else {
1992 remap_table[i] = new_index;
1993 alloc.sizes[new_index] = alloc.sizes[i];
1994 invalidate_live_intervals();
1995 ++new_index;
1996 }
1997 }
1998
1999 this->alloc.count = new_index;
2000
2001 /* Patch all the instructions to use the newly renumbered registers */
2002 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2003 if (inst->dst.file == GRF)
2004 inst->dst.reg = remap_table[inst->dst.reg];
2005
2006 for (int i = 0; i < inst->sources; i++) {
2007 if (inst->src[i].file == GRF)
2008 inst->src[i].reg = remap_table[inst->src[i].reg];
2009 }
2010 }
2011
2012 /* Patch all the references to delta_xy, since they're used in register
2013 * allocation. If they're unused, switch them to BAD_FILE so we don't
2014 * think some random VGRF is delta_xy.
2015 */
2016 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2017 if (delta_xy[i].file == GRF) {
2018 if (remap_table[delta_xy[i].reg] != -1) {
2019 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2020 } else {
2021 delta_xy[i].file = BAD_FILE;
2022 }
2023 }
2024 }
2025
2026 return progress;
2027 }
2028
2029 /*
2030 * Implements array access of uniforms by inserting a
2031 * PULL_CONSTANT_LOAD instruction.
2032 *
2033  * Unlike temporary GRF array access (which we don't support, due to
2034 * the difficulty of doing relative addressing on instruction
2035 * destinations), we could potentially do array access of uniforms
2036 * that were loaded in GRF space as push constants. In real-world
2037 * usage we've seen, though, the arrays being used are always larger
2038 * than we could load as push constants, so just always move all
2039 * uniform array access out to a pull constant buffer.
2040 */
2041 void
2042 fs_visitor::move_uniform_array_access_to_pull_constants()
2043 {
2044 if (dispatch_width != 8)
2045 return;
2046
2047 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2048 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2049
2050 /* Walk through and find array access of uniforms. Put a copy of that
2051 * uniform in the pull constant buffer.
2052 *
2053 * Note that we don't move constant-indexed accesses to arrays. No
2054 * testing has been done of the performance impact of this choice.
2055 */
2056 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2057 for (int i = 0 ; i < inst->sources; i++) {
2058 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2059 continue;
2060
2061 int uniform = inst->src[i].reg;
2062
2063 /* If this array isn't already present in the pull constant buffer,
2064 * add it.
2065 */
2066 if (pull_constant_loc[uniform] == -1) {
2067 const gl_constant_value **values = &stage_prog_data->param[uniform];
2068
2069 assert(param_size[uniform]);
2070
2071 for (int j = 0; j < param_size[uniform]; j++) {
2072 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2073
2074 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2075 values[j];
2076 }
2077 }
2078 }
2079 }
2080 }
2081
2082 /**
2083 * Assign UNIFORM file registers to either push constants or pull constants.
2084 *
2085 * We allow a fragment shader to have more than the specified minimum
2086 * maximum number of fragment shader uniform components (64). If
2087  * there are too many, they'd fill up all of the register space.
2088 * So, this will push some of them out to the pull constant buffer and
2089 * update the program to load them.
2090 */
2091 void
2092 fs_visitor::assign_constant_locations()
2093 {
2094 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2095 if (dispatch_width != 8)
2096 return;
2097
2098 /* Find which UNIFORM registers are still in use. */
2099 bool is_live[uniforms];
2100 for (unsigned int i = 0; i < uniforms; i++) {
2101 is_live[i] = false;
2102 }
2103
2104 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2105 for (int i = 0; i < inst->sources; i++) {
2106 if (inst->src[i].file != UNIFORM)
2107 continue;
2108
2109 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2110 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2111 is_live[constant_nr] = true;
2112 }
2113 }
2114
2115 /* Only allow 16 registers (128 uniform components) as push constants.
2116 *
2117 * Just demote the end of the list. We could probably do better
2118 * here, demoting things that are rarely used in the program first.
2119 *
2120 * If changing this value, note the limitation about total_regs in
2121 * brw_curbe.c.
2122 */
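   /* (A GRF is 32 bytes, i.e. eight 32-bit components, hence the
    * 16 * 8 == 128 below.)
    */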
2123 unsigned int max_push_components = 16 * 8;
2124 unsigned int num_push_constants = 0;
2125
2126 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2127
2128 for (unsigned int i = 0; i < uniforms; i++) {
2129 if (!is_live[i] || pull_constant_loc[i] != -1) {
2130 /* This UNIFORM register is either dead, or has already been demoted
2131 * to a pull const. Mark it as no longer living in the param[] array.
2132 */
2133 push_constant_loc[i] = -1;
2134 continue;
2135 }
2136
2137 if (num_push_constants < max_push_components) {
2138 /* Retain as a push constant. Record the location in the params[]
2139 * array.
2140 */
2141 push_constant_loc[i] = num_push_constants++;
2142 } else {
2143 /* Demote to a pull constant. */
2144 push_constant_loc[i] = -1;
2145
2146 int pull_index = stage_prog_data->nr_pull_params++;
2147 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2148 pull_constant_loc[i] = pull_index;
2149 }
2150 }
2151
2152 stage_prog_data->nr_params = num_push_constants;
2153
2154 /* Up until now, the param[] array has been indexed by reg + reg_offset
2155 * of UNIFORM registers. Condense it to only contain the uniforms we
2156 * chose to upload as push constants.
2157 */
2158 for (unsigned int i = 0; i < uniforms; i++) {
2159 int remapped = push_constant_loc[i];
2160
2161 if (remapped == -1)
2162 continue;
2163
2164 assert(remapped <= (int)i);
2165 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2166 }
2167 }
2168
2169 /**
2170 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2171 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2172 */
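/* Worked example (hypothetical pull_index): for pull_index == 7, the
 * non-reladdr path below loads the 16-byte-aligned block at byte offset
 * (7 * 4) & ~15 == 16 and then smears component 7 & 3 == 3 of the loaded
 * vec4 into the source.
 */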
2173 void
2174 fs_visitor::demote_pull_constants()
2175 {
2176 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2177 for (int i = 0; i < inst->sources; i++) {
2178 if (inst->src[i].file != UNIFORM)
2179 continue;
2180
2181 int pull_index;
2182 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2183 if (location >= uniforms) /* Out of bounds access */
2184 pull_index = -1;
2185 else
2186 pull_index = pull_constant_loc[location];
2187
2188 if (pull_index == -1)
2189 continue;
2190
2191          /* Set up the annotation tracking for newly generated instructions. */
2192 const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
2193 .at(block, inst);
2194 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2195 fs_reg dst = vgrf(glsl_type::float_type);
2196
2197 /* Generate a pull load into dst. */
2198 if (inst->src[i].reladdr) {
2199 VARYING_PULL_CONSTANT_LOAD(ibld, dst,
2200 surf_index,
2201 *inst->src[i].reladdr,
2202 pull_index);
2203 inst->src[i].reladdr = NULL;
2204 } else {
2205 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2206 ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
2207 dst, surf_index, offset);
2208 inst->src[i].set_smear(pull_index & 3);
2209 }
2210
2211 /* Rewrite the instruction to use the temporary VGRF. */
2212 inst->src[i].file = GRF;
2213 inst->src[i].reg = dst.reg;
2214 inst->src[i].reg_offset = 0;
2215 inst->src[i].width = dispatch_width;
2216 }
2217 }
2218 invalidate_live_intervals();
2219 }
2220
2221 bool
2222 fs_visitor::opt_algebraic()
2223 {
2224 bool progress = false;
2225
2226 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2227 switch (inst->opcode) {
2228 case BRW_OPCODE_MOV:
2229 if (inst->src[0].file != IMM)
2230 break;
2231
2232 if (inst->saturate) {
2233 if (inst->dst.type != inst->src[0].type)
2234 assert(!"unimplemented: saturate mixed types");
2235
2236 if (brw_saturate_immediate(inst->dst.type,
2237 &inst->src[0].fixed_hw_reg)) {
2238 inst->saturate = false;
2239 progress = true;
2240 }
2241 }
2242 break;
2243
2244 case BRW_OPCODE_MUL:
2245 if (inst->src[1].file != IMM)
2246 continue;
2247
2248 /* a * 1.0 = a */
2249 if (inst->src[1].is_one()) {
2250 inst->opcode = BRW_OPCODE_MOV;
2251 inst->src[1] = reg_undef;
2252 progress = true;
2253 break;
2254 }
2255
2256 /* a * -1.0 = -a */
2257 if (inst->src[1].is_negative_one()) {
2258 inst->opcode = BRW_OPCODE_MOV;
2259 inst->src[0].negate = !inst->src[0].negate;
2260 inst->src[1] = reg_undef;
2261 progress = true;
2262 break;
2263 }
2264
2265 /* a * 0.0 = 0.0 */
2266 if (inst->src[1].is_zero()) {
2267 inst->opcode = BRW_OPCODE_MOV;
2268 inst->src[0] = inst->src[1];
2269 inst->src[1] = reg_undef;
2270 progress = true;
2271 break;
2272 }
2273
2274 if (inst->src[0].file == IMM) {
2275 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2276 inst->opcode = BRW_OPCODE_MOV;
2277 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2278 inst->src[1] = reg_undef;
2279 progress = true;
2280 break;
2281 }
2282 break;
2283 case BRW_OPCODE_ADD:
2284 if (inst->src[1].file != IMM)
2285 continue;
2286
2287 /* a + 0.0 = a */
2288 if (inst->src[1].is_zero()) {
2289 inst->opcode = BRW_OPCODE_MOV;
2290 inst->src[1] = reg_undef;
2291 progress = true;
2292 break;
2293 }
2294
2295 if (inst->src[0].file == IMM) {
2296 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2297 inst->opcode = BRW_OPCODE_MOV;
2298 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2299 inst->src[1] = reg_undef;
2300 progress = true;
2301 break;
2302 }
2303 break;
2304 case BRW_OPCODE_OR:
2305 if (inst->src[0].equals(inst->src[1])) {
2306 inst->opcode = BRW_OPCODE_MOV;
2307 inst->src[1] = reg_undef;
2308 progress = true;
2309 break;
2310 }
2311 break;
2312 case BRW_OPCODE_LRP:
2313 if (inst->src[1].equals(inst->src[2])) {
2314 inst->opcode = BRW_OPCODE_MOV;
2315 inst->src[0] = inst->src[1];
2316 inst->src[1] = reg_undef;
2317 inst->src[2] = reg_undef;
2318 progress = true;
2319 break;
2320 }
2321 break;
2322 case BRW_OPCODE_CMP:
2323 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2324 inst->src[0].abs &&
2325 inst->src[0].negate &&
2326 inst->src[1].is_zero()) {
2327 inst->src[0].abs = false;
2328 inst->src[0].negate = false;
2329 inst->conditional_mod = BRW_CONDITIONAL_Z;
2330 progress = true;
2331 break;
2332 }
2333 break;
2334 case BRW_OPCODE_SEL:
2335 if (inst->src[0].equals(inst->src[1])) {
2336 inst->opcode = BRW_OPCODE_MOV;
2337 inst->src[1] = reg_undef;
2338 inst->predicate = BRW_PREDICATE_NONE;
2339 inst->predicate_inverse = false;
2340 progress = true;
2341 } else if (inst->saturate && inst->src[1].file == IMM) {
2342 switch (inst->conditional_mod) {
2343 case BRW_CONDITIONAL_LE:
2344 case BRW_CONDITIONAL_L:
2345 switch (inst->src[1].type) {
2346 case BRW_REGISTER_TYPE_F:
2347 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2348 inst->opcode = BRW_OPCODE_MOV;
2349 inst->src[1] = reg_undef;
2350 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2351 progress = true;
2352 }
2353 break;
2354 default:
2355 break;
2356 }
2357 break;
2358 case BRW_CONDITIONAL_GE:
2359 case BRW_CONDITIONAL_G:
2360 switch (inst->src[1].type) {
2361 case BRW_REGISTER_TYPE_F:
2362 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2363 inst->opcode = BRW_OPCODE_MOV;
2364 inst->src[1] = reg_undef;
2365 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2366 progress = true;
2367 }
2368 break;
2369 default:
2370 break;
2371 }
2372 default:
2373 break;
2374 }
2375 }
2376 break;
2377 case BRW_OPCODE_MAD:
2378 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2379 inst->opcode = BRW_OPCODE_MOV;
2380 inst->src[1] = reg_undef;
2381 inst->src[2] = reg_undef;
2382 progress = true;
2383 } else if (inst->src[0].is_zero()) {
2384 inst->opcode = BRW_OPCODE_MUL;
2385 inst->src[0] = inst->src[2];
2386 inst->src[2] = reg_undef;
2387 progress = true;
2388 } else if (inst->src[1].is_one()) {
2389 inst->opcode = BRW_OPCODE_ADD;
2390 inst->src[1] = inst->src[2];
2391 inst->src[2] = reg_undef;
2392 progress = true;
2393 } else if (inst->src[2].is_one()) {
2394 inst->opcode = BRW_OPCODE_ADD;
2395 inst->src[2] = reg_undef;
2396 progress = true;
2397 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2398 inst->opcode = BRW_OPCODE_ADD;
2399 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2400 inst->src[2] = reg_undef;
2401 progress = true;
2402 }
2403 break;
2404 case SHADER_OPCODE_RCP: {
2405 fs_inst *prev = (fs_inst *)inst->prev;
2406 if (prev->opcode == SHADER_OPCODE_SQRT) {
2407 if (inst->src[0].equals(prev->dst)) {
2408 inst->opcode = SHADER_OPCODE_RSQ;
2409 inst->src[0] = prev->src[0];
2410 progress = true;
2411 }
2412 }
2413 break;
2414 }
2415 case SHADER_OPCODE_BROADCAST:
2416 if (is_uniform(inst->src[0])) {
2417 inst->opcode = BRW_OPCODE_MOV;
2418 inst->sources = 1;
2419 inst->force_writemask_all = true;
2420 progress = true;
2421 } else if (inst->src[1].file == IMM) {
2422 inst->opcode = BRW_OPCODE_MOV;
2423 inst->src[0] = component(inst->src[0],
2424 inst->src[1].fixed_hw_reg.dw1.ud);
2425 inst->sources = 1;
2426 inst->force_writemask_all = true;
2427 progress = true;
2428 }
2429 break;
2430
2431 default:
2432 break;
2433 }
2434
2435 /* Swap if src[0] is immediate. */
2436 if (progress && inst->is_commutative()) {
2437 if (inst->src[0].file == IMM) {
2438 fs_reg tmp = inst->src[1];
2439 inst->src[1] = inst->src[0];
2440 inst->src[0] = tmp;
2441 }
2442 }
2443 }
2444 return progress;
2445 }
2446
2447 /**
2448 * Optimize sample messages that have constant zero values for the trailing
2449 * texture coordinates. We can just reduce the message length for these
2450 * instructions instead of reserving a register for it. Trailing parameters
2451 * that aren't sent default to zero anyway. This will cause the dead code
2452 * eliminator to remove the MOV instruction that would otherwise be emitted to
2453 * set up the zero value.
2454 */
2455 bool
2456 fs_visitor::opt_zero_samples()
2457 {
2458 /* Gen4 infers the texturing opcode based on the message length so we can't
2459 * change it.
2460 */
2461 if (devinfo->gen < 5)
2462 return false;
2463
2464 bool progress = false;
2465
2466 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2467 if (!inst->is_tex())
2468 continue;
2469
2470 fs_inst *load_payload = (fs_inst *) inst->prev;
2471
2472 if (load_payload->is_head_sentinel() ||
2473 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2474 continue;
2475
2476 /* We don't want to remove the message header or the first parameter.
2477 * Removing the first parameter is not allowed, see the Haswell PRM
2478 * volume 7, page 149:
2479 *
2480 * "Parameter 0 is required except for the sampleinfo message, which
2481 * has no parameter 0"
2482 */
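      /* Worked example (illustration only): in SIMD8 with a one-register
       * header and mlen == 5, LOAD_PAYLOAD source (5 - 1) / 1 + 1 - 1 == 4
       * is the last parameter; if it is a zero immediate, mlen drops to 4,
       * and the loop repeats until a non-zero trailing parameter is found or
       * only the header plus parameter 0 remain.
       */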
2483 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2484 load_payload->src[(inst->mlen - inst->header_size) /
2485 (dispatch_width / 8) +
2486 inst->header_size - 1].is_zero()) {
2487 inst->mlen -= dispatch_width / 8;
2488 progress = true;
2489 }
2490 }
2491
2492 if (progress)
2493 invalidate_live_intervals();
2494
2495 return progress;
2496 }
2497
2498 /**
2499 * Optimize sample messages which are followed by the final RT write.
2500 *
2501  * CHV and Gen9+ can mark a texturing SEND instruction with EOT to have its
2502 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2503 * final texturing results copied to the framebuffer write payload and modify
2504 * them to write to the framebuffer directly.
2505 */
2506 bool
2507 fs_visitor::opt_sampler_eot()
2508 {
2509 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2510
2511 if (stage != MESA_SHADER_FRAGMENT)
2512 return false;
2513
2514 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2515 return false;
2516
2517 /* FINISHME: It should be possible to implement this optimization when there
2518 * are multiple drawbuffers.
2519 */
2520 if (key->nr_color_regions != 1)
2521 return false;
2522
2523 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2524 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2525 assert(fb_write->eot);
2526 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2527
2528 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2529
2530 /* There wasn't one; nothing to do. */
2531 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2532 return false;
2533
2534    /* This optimization doesn't seem to work for textureGather for some
2535     * reason. I can't find any documentation or known workaround indicating
2536     * that this is expected, but since a shader is unlikely to write
2537     * textureGather results directly to the framebuffer, we might as well
2538     * just disable it.
2539     */
2540 if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2541 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2542 return false;
2543
2544 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2545 * It's very likely to be the previous instruction.
2546 */
2547 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2548 if (load_payload->is_head_sentinel() ||
2549 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2550 return false;
2551
2552 assert(!tex_inst->eot); /* We can't get here twice */
2553 assert((tex_inst->offset & (0xff << 24)) == 0);
2554
2555 tex_inst->offset |= fb_write->target << 24;
2556 tex_inst->eot = true;
2557 tex_inst->dst = bld.null_reg_ud();
2558 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2559
2560 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2561 * to create a new LOAD_PAYLOAD command with the same sources and a space
2562 * saved for the header. Using a new destination register not only makes sure
2563 * we have enough space, but it will make sure the dead code eliminator kills
2564 * the instruction that this will replace.
2565 */
2566 if (tex_inst->header_size != 0)
2567 return true;
2568
2569 fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
2570 load_payload->sources + 1);
2571 fs_reg *new_sources =
2572 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2573
2574 new_sources[0] = fs_reg();
2575 for (int i = 0; i < load_payload->sources; i++)
2576 new_sources[i+1] = load_payload->src[i];
2577
2578 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2579    * requires a lot of information about the sources to appropriately figure
2580    * out the number of registers needed. At this stage of optimization (after
2581    * copy propagation), the sources may no longer be laid out in the GRFs that
2582    * LOAD_PAYLOAD expects. Therefore, we need to
2583 * manually emit the instruction.
2584 */
2585 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2586 load_payload->exec_size,
2587 send_header,
2588 new_sources,
2589 load_payload->sources + 1);
2590
2591 new_load_payload->regs_written = load_payload->regs_written + 1;
2592 new_load_payload->header_size = 1;
2593 tex_inst->mlen++;
2594 tex_inst->header_size = 1;
2595 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2596 tex_inst->src[0] = send_header;
2597
2598 return true;
2599 }
2600
2601 bool
2602 fs_visitor::opt_register_renaming()
2603 {
2604 bool progress = false;
2605 int depth = 0;
2606
2607 int remap[alloc.count];
2608 memset(remap, -1, sizeof(int) * alloc.count);
2609
2610 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2611 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2612 depth++;
2613 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2614 inst->opcode == BRW_OPCODE_WHILE) {
2615 depth--;
2616 }
2617
2618 /* Rewrite instruction sources. */
2619 for (int i = 0; i < inst->sources; i++) {
2620 if (inst->src[i].file == GRF &&
2621 remap[inst->src[i].reg] != -1 &&
2622 remap[inst->src[i].reg] != inst->src[i].reg) {
2623 inst->src[i].reg = remap[inst->src[i].reg];
2624 progress = true;
2625 }
2626 }
2627
2628 const int dst = inst->dst.reg;
2629
2630 if (depth == 0 &&
2631 inst->dst.file == GRF &&
2632 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2633 !inst->is_partial_write()) {
2634 if (remap[dst] == -1) {
2635 remap[dst] = dst;
2636 } else {
2637 remap[dst] = alloc.allocate(inst->dst.width / 8);
2638 inst->dst.reg = remap[dst];
2639 progress = true;
2640 }
2641 } else if (inst->dst.file == GRF &&
2642 remap[dst] != -1 &&
2643 remap[dst] != dst) {
2644 inst->dst.reg = remap[dst];
2645 progress = true;
2646 }
2647 }
2648
2649 if (progress) {
2650 invalidate_live_intervals();
2651
2652 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2653 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2654 delta_xy[i].reg = remap[delta_xy[i].reg];
2655 }
2656 }
2657 }
2658
2659 return progress;
2660 }
2661
2662 /**
2663 * Remove redundant or useless discard jumps.
2664 *
2665 * For example, we can eliminate jumps in the following sequence:
2666 *
2667 * discard-jump (redundant with the next jump)
2668 * discard-jump (useless; jumps to the next instruction)
2669 * placeholder-halt
2670 */
2671 bool
2672 fs_visitor::opt_redundant_discard_jumps()
2673 {
2674 bool progress = false;
2675
2676 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2677
2678 fs_inst *placeholder_halt = NULL;
2679 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2680 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2681 placeholder_halt = inst;
2682 break;
2683 }
2684 }
2685
2686 if (!placeholder_halt)
2687 return false;
2688
2689 /* Delete any HALTs immediately before the placeholder halt. */
2690 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2691 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2692 prev = (fs_inst *) placeholder_halt->prev) {
2693 prev->remove(last_bblock);
2694 progress = true;
2695 }
2696
2697 if (progress)
2698 invalidate_live_intervals();
2699
2700 return progress;
2701 }
2702
2703 bool
2704 fs_visitor::compute_to_mrf()
2705 {
2706 bool progress = false;
2707 int next_ip = 0;
2708
2709 /* No MRFs on Gen >= 7. */
2710 if (devinfo->gen >= 7)
2711 return false;
2712
2713 calculate_live_intervals();
2714
2715 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2716 int ip = next_ip;
2717 next_ip++;
2718
2719 if (inst->opcode != BRW_OPCODE_MOV ||
2720 inst->is_partial_write() ||
2721 inst->dst.file != MRF || inst->src[0].file != GRF ||
2722 inst->dst.type != inst->src[0].type ||
2723 inst->src[0].abs || inst->src[0].negate ||
2724 !inst->src[0].is_contiguous() ||
2725 inst->src[0].subreg_offset)
2726 continue;
2727
2728 /* Work out which hardware MRF registers are written by this
2729 * instruction.
2730 */
2731 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2732 int mrf_high;
2733 if (inst->dst.reg & BRW_MRF_COMPR4) {
2734 mrf_high = mrf_low + 4;
2735 } else if (inst->exec_size == 16) {
2736 mrf_high = mrf_low + 1;
2737 } else {
2738 mrf_high = mrf_low;
2739 }
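      /* For example (hypothetical register numbers): a SIMD16 COMPR4 write
       * to m2 lands in m2 and m6, so mrf_low == 2 and mrf_high == 6; the
       * same write without COMPR4 lands in m2 and m3, giving mrf_high == 3.
       */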
2740
2741 /* Can't compute-to-MRF this GRF if someone else was going to
2742 * read it later.
2743 */
2744 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2745 continue;
2746
2747 /* Found a move of a GRF to a MRF. Let's see if we can go
2748 * rewrite the thing that made this GRF to write into the MRF.
2749 */
2750 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2751 if (scan_inst->dst.file == GRF &&
2752 scan_inst->dst.reg == inst->src[0].reg) {
2753 /* Found the last thing to write our reg we want to turn
2754 * into a compute-to-MRF.
2755 */
2756
2757 /* If this one instruction didn't populate all the
2758 * channels, bail. We might be able to rewrite everything
2759 * that writes that reg, but it would require smarter
2760 * tracking to delay the rewriting until complete success.
2761 */
2762 if (scan_inst->is_partial_write())
2763 break;
2764
2765 /* Things returning more than one register would need us to
2766 * understand coalescing out more than one MOV at a time.
2767 */
2768 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2769 break;
2770
2771 /* SEND instructions can't have MRF as a destination. */
2772 if (scan_inst->mlen)
2773 break;
2774
2775 if (devinfo->gen == 6) {
2776 /* gen6 math instructions must have the destination be
2777 * GRF, so no compute-to-MRF for them.
2778 */
2779 if (scan_inst->is_math()) {
2780 break;
2781 }
2782 }
2783
2784 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2785 /* Found the creator of our MRF's source value. */
2786 scan_inst->dst.file = MRF;
2787 scan_inst->dst.reg = inst->dst.reg;
2788 scan_inst->saturate |= inst->saturate;
2789 inst->remove(block);
2790 progress = true;
2791 }
2792 break;
2793 }
2794
2795 /* We don't handle control flow here. Most computation of
2796 * values that end up in MRFs are shortly before the MRF
2797 * write anyway.
2798 */
2799 if (block->start() == scan_inst)
2800 break;
2801
2802 /* You can't read from an MRF, so if someone else reads our
2803 * MRF's source GRF that we wanted to rewrite, that stops us.
2804 */
2805 bool interfered = false;
2806 for (int i = 0; i < scan_inst->sources; i++) {
2807 if (scan_inst->src[i].file == GRF &&
2808 scan_inst->src[i].reg == inst->src[0].reg &&
2809 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2810 interfered = true;
2811 }
2812 }
2813 if (interfered)
2814 break;
2815
2816 if (scan_inst->dst.file == MRF) {
2817 /* If somebody else writes our MRF here, we can't
2818 * compute-to-MRF before that.
2819 */
2820 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2821 int scan_mrf_high;
2822
2823 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2824 scan_mrf_high = scan_mrf_low + 4;
2825 } else if (scan_inst->exec_size == 16) {
2826 scan_mrf_high = scan_mrf_low + 1;
2827 } else {
2828 scan_mrf_high = scan_mrf_low;
2829 }
2830
2831 if (mrf_low == scan_mrf_low ||
2832 mrf_low == scan_mrf_high ||
2833 mrf_high == scan_mrf_low ||
2834 mrf_high == scan_mrf_high) {
2835 break;
2836 }
2837 }
2838
2839 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2840 /* Found a SEND instruction, which means that there are
2841 * live values in MRFs from base_mrf to base_mrf +
2842 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2843 * above it.
2844 */
2845 if (mrf_low >= scan_inst->base_mrf &&
2846 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2847 break;
2848 }
2849 if (mrf_high >= scan_inst->base_mrf &&
2850 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2851 break;
2852 }
2853 }
2854 }
2855 }
2856
2857 if (progress)
2858 invalidate_live_intervals();
2859
2860 return progress;
2861 }
2862
2863 /**
2864 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2865 * flow. We could probably do better here with some form of divergence
2866 * analysis.
2867 */
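/* Illustrative transformation (hypothetical register number):
 *
 *    find_live_channel(8) vgrf7:UD    ->    mov(8) vgrf7:UD 0u
 *
 * with force_writemask_all set on the MOV, on the assumption that channel 0
 * is enabled whenever execution is uniform.
 */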
2868 bool
2869 fs_visitor::eliminate_find_live_channel()
2870 {
2871 bool progress = false;
2872 unsigned depth = 0;
2873
2874 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2875 switch (inst->opcode) {
2876 case BRW_OPCODE_IF:
2877 case BRW_OPCODE_DO:
2878 depth++;
2879 break;
2880
2881 case BRW_OPCODE_ENDIF:
2882 case BRW_OPCODE_WHILE:
2883 depth--;
2884 break;
2885
2886 case FS_OPCODE_DISCARD_JUMP:
2887 /* This can potentially make control flow non-uniform until the end
2888 * of the program.
2889 */
2890 return progress;
2891
2892 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2893 if (depth == 0) {
2894 inst->opcode = BRW_OPCODE_MOV;
2895 inst->src[0] = fs_reg(0);
2896 inst->sources = 1;
2897 inst->force_writemask_all = true;
2898 progress = true;
2899 }
2900 break;
2901
2902 default:
2903 break;
2904 }
2905 }
2906
2907 return progress;
2908 }
2909
2910 /**
2911 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2912 * instructions to FS_OPCODE_REP_FB_WRITE.
2913 */
2914 void
2915 fs_visitor::emit_repclear_shader()
2916 {
2917 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2918 int base_mrf = 1;
2919 int color_mrf = base_mrf + 2;
2920
2921 fs_inst *mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
2922 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
2923
2924 fs_inst *write;
2925 if (key->nr_color_regions == 1) {
2926 write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2927 write->saturate = key->clamp_fragment_color;
2928 write->base_mrf = color_mrf;
2929 write->target = 0;
2930 write->header_size = 0;
2931 write->mlen = 1;
2932 } else {
2933 assume(key->nr_color_regions > 0);
2934 for (int i = 0; i < key->nr_color_regions; ++i) {
2935 write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2936 write->saturate = key->clamp_fragment_color;
2937 write->base_mrf = base_mrf;
2938 write->target = i;
2939 write->header_size = 2;
2940 write->mlen = 3;
2941 }
2942 }
2943 write->eot = true;
2944
2945 calculate_cfg();
2946
2947 assign_constant_locations();
2948 assign_curb_setup();
2949
2950 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2951 assert(mov->src[0].file == HW_REG);
2952 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2953 }
2954
2955 /**
2956 * Walks through basic blocks, looking for repeated MRF writes and
2957 * removing the later ones.
2958 */
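/* Illustrative example (made-up registers): given
 *
 *    mov(8) m3 vgrf5
 *    mov(8) m3 vgrf5
 *
 * the second MOV is removed, provided nothing wrote vgrf5 or m3 in between
 * and no control flow separates the two.
 */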
2959 bool
2960 fs_visitor::remove_duplicate_mrf_writes()
2961 {
2962 fs_inst *last_mrf_move[16];
2963 bool progress = false;
2964
2965 /* Need to update the MRF tracking for compressed instructions. */
2966 if (dispatch_width == 16)
2967 return false;
2968
2969 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2970
2971 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2972 if (inst->is_control_flow()) {
2973 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2974 }
2975
2976 if (inst->opcode == BRW_OPCODE_MOV &&
2977 inst->dst.file == MRF) {
2978 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2979 if (prev_inst && inst->equals(prev_inst)) {
2980 inst->remove(block);
2981 progress = true;
2982 continue;
2983 }
2984 }
2985
2986 /* Clear out the last-write records for MRFs that were overwritten. */
2987 if (inst->dst.file == MRF) {
2988 last_mrf_move[inst->dst.reg] = NULL;
2989 }
2990
2991 if (inst->mlen > 0 && inst->base_mrf != -1) {
2992 /* Found a SEND instruction, which will include two or fewer
2993 * implied MRF writes. We could do better here.
2994 */
2995 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2996 last_mrf_move[inst->base_mrf + i] = NULL;
2997 }
2998 }
2999
3000 /* Clear out any MRF move records whose sources got overwritten. */
3001 if (inst->dst.file == GRF) {
3002 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3003 if (last_mrf_move[i] &&
3004 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3005 last_mrf_move[i] = NULL;
3006 }
3007 }
3008 }
3009
3010 if (inst->opcode == BRW_OPCODE_MOV &&
3011 inst->dst.file == MRF &&
3012 inst->src[0].file == GRF &&
3013 !inst->is_partial_write()) {
3014 last_mrf_move[inst->dst.reg] = inst;
3015 }
3016 }
3017
3018 if (progress)
3019 invalidate_live_intervals();
3020
3021 return progress;
3022 }
3023
3024 static void
3025 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3026 {
3027 /* Clear the flag for registers that actually got read (as expected). */
3028 for (int i = 0; i < inst->sources; i++) {
3029 int grf;
3030 if (inst->src[i].file == GRF) {
3031 grf = inst->src[i].reg;
3032 } else if (inst->src[i].file == HW_REG &&
3033 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3034 grf = inst->src[i].fixed_hw_reg.nr;
3035 } else {
3036 continue;
3037 }
3038
3039 if (grf >= first_grf &&
3040 grf < first_grf + grf_len) {
3041 deps[grf - first_grf] = false;
3042 if (inst->exec_size == 16)
3043 deps[grf - first_grf + 1] = false;
3044 }
3045 }
3046 }
3047
3048 /**
3049 * Implements this workaround for the original 965:
3050 *
3051 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3052 * check for post destination dependencies on this instruction, software
3053 * must ensure that there is no destination hazard for the case of ‘write
3054 * followed by a posted write’ shown in the following example.
3055 *
3056 * 1. mov r3 0
3057 * 2. send r3.xy <rest of send instruction>
3058 * 3. mov r2 r3
3059 *
3060 * Due to no post-destination dependency check on the ‘send’, the above
3061 * code sequence could have two instructions (1 and 2) in flight at the
3062 * same time that both consider ‘r3’ as the target of their final writes.
3063 */
3064 void
3065 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3066 fs_inst *inst)
3067 {
3068 int write_len = inst->regs_written;
3069 int first_write_grf = inst->dst.reg;
3070 bool needs_dep[BRW_MAX_MRF];
3071 assert(write_len < (int)sizeof(needs_dep) - 1);
3072
3073 memset(needs_dep, false, sizeof(needs_dep));
3074 memset(needs_dep, true, write_len);
3075
3076 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3077
3078 /* Walk backwards looking for writes to registers we're writing which
3079 * aren't read since being written. If we hit the start of the program,
3080 * we assume that there are no outstanding dependencies on entry to the
3081 * program.
3082 */
3083 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3084 /* If we hit control flow, assume that there *are* outstanding
3085 * dependencies, and force their cleanup before our instruction.
3086 */
3087 if (block->start() == scan_inst) {
3088 for (int i = 0; i < write_len; i++) {
3089 if (needs_dep[i])
3090 DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
3091 }
3092 return;
3093 }
3094
3095 /* We insert our reads as late as possible on the assumption that any
3096 * instruction but a MOV that might have left us an outstanding
3097 * dependency has more latency than a MOV.
3098 */
3099 if (scan_inst->dst.file == GRF) {
3100 for (int i = 0; i < scan_inst->regs_written; i++) {
3101 int reg = scan_inst->dst.reg + i;
3102
3103 if (reg >= first_write_grf &&
3104 reg < first_write_grf + write_len &&
3105 needs_dep[reg - first_write_grf]) {
3106 DEP_RESOLVE_MOV(bld.at(block, inst), reg);
3107 needs_dep[reg - first_write_grf] = false;
3108 if (scan_inst->exec_size == 16)
3109 needs_dep[reg - first_write_grf + 1] = false;
3110 }
3111 }
3112 }
3113
3114 /* Clear the flag for registers that actually got read (as expected). */
3115 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3116
3117 /* Continue the loop only if we haven't resolved all the dependencies */
3118 int i;
3119 for (i = 0; i < write_len; i++) {
3120 if (needs_dep[i])
3121 break;
3122 }
3123 if (i == write_len)
3124 return;
3125 }
3126 }
3127
3128 /**
3129 * Implements this workaround for the original 965:
3130 *
3131 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3132 * used as a destination register until after it has been sourced by an
3133 * instruction with a different destination register.
3134 */
3135 void
3136 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3137 {
3138 int write_len = inst->regs_written;
3139 int first_write_grf = inst->dst.reg;
3140 bool needs_dep[BRW_MAX_MRF];
3141 assert(write_len < (int)sizeof(needs_dep) - 1);
3142
3143 memset(needs_dep, false, sizeof(needs_dep));
3144 memset(needs_dep, true, write_len);
3145 /* Walk forwards looking for writes to registers we're writing which aren't
3146 * read before being written.
3147 */
3148 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3149 /* If we hit control flow, force resolve all remaining dependencies. */
3150 if (block->end() == scan_inst) {
3151 for (int i = 0; i < write_len; i++) {
3152 if (needs_dep[i])
3153 DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
3154 }
3155 return;
3156 }
3157
3158 /* Clear the flag for registers that actually got read (as expected). */
3159 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3160
3161 /* We insert our reads as late as possible since they're reading the
3162 * result of a SEND, which has massive latency.
3163 */
3164 if (scan_inst->dst.file == GRF &&
3165 scan_inst->dst.reg >= first_write_grf &&
3166 scan_inst->dst.reg < first_write_grf + write_len &&
3167 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3168 DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
3169 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3170 }
3171
3172 /* Continue the loop only if we haven't resolved all the dependencies */
3173 int i;
3174 for (i = 0; i < write_len; i++) {
3175 if (needs_dep[i])
3176 break;
3177 }
3178 if (i == write_len)
3179 return;
3180 }
3181 }
3182
3183 void
3184 fs_visitor::insert_gen4_send_dependency_workarounds()
3185 {
3186 if (devinfo->gen != 4 || devinfo->is_g4x)
3187 return;
3188
3189 bool progress = false;
3190
3191 /* Note that we're done with register allocation, so GRF fs_regs always
3192 * have a .reg_offset of 0.
3193 */
3194
3195 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3196 if (inst->mlen != 0 && inst->dst.file == GRF) {
3197 insert_gen4_pre_send_dependency_workarounds(block, inst);
3198 insert_gen4_post_send_dependency_workarounds(block, inst);
3199 progress = true;
3200 }
3201 }
3202
3203 if (progress)
3204 invalidate_live_intervals();
3205 }
3206
3207 /**
3208 * Turns the generic expression-style uniform pull constant load instruction
3209 * into a hardware-specific series of instructions for loading a pull
3210 * constant.
3211 *
3212 * The expression style allows the CSE pass before this to optimize out
3213 * repeated loads from the same offset, and gives the pre-register-allocation
3214 * scheduling full flexibility, while the conversion to native instructions
3215 * allows the post-register-allocation scheduler the best information
3216 * possible.
3217 *
3218 * Note that execution masking for setting up pull constant loads is special:
3219 * the channels that need to be written are unrelated to the current execution
3220 * mask, since a later instruction will use one of the result channels as a
3221 * source operand for all 8 or 16 of its channels.
3222 */
3223 void
3224 fs_visitor::lower_uniform_pull_constant_loads()
3225 {
3226 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3227 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3228 continue;
3229
3230 if (devinfo->gen >= 7) {
3231          /* Up to this point, the offset arg has been a vec4-aligned byte
3232           * offset. We need to turn it into a dword offset.
3233 */
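         /* Worked example: a vec4-aligned byte offset of 48 becomes dword
          * offset 48 / 4 == 12.
          */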
3234 fs_reg const_offset_reg = inst->src[1];
3235 assert(const_offset_reg.file == IMM &&
3236 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3237 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3238 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3239
3240 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3241 * Reserve space for the register.
3242 */
3243 if (devinfo->gen >= 9) {
3244 payload.reg_offset++;
3245 alloc.sizes[payload.reg] = 2;
3246 }
3247
3248 /* This is actually going to be a MOV, but since only the first dword
3249 * is accessed, we have a special opcode to do just that one. Note
3250 * that this needs to be an operation that will be considered a def
3251 * by live variable analysis, or register allocation will explode.
3252 */
3253 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3254 8, payload, const_offset_reg);
3255 setup->force_writemask_all = true;
3256
3257 setup->ir = inst->ir;
3258 setup->annotation = inst->annotation;
3259 inst->insert_before(block, setup);
3260
3261 /* Similarly, this will only populate the first 4 channels of the
3262 * result register (since we only use smear values from 0-3), but we
3263 * don't tell the optimizer.
3264 */
3265 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3266 inst->src[1] = payload;
3267
3268 invalidate_live_intervals();
3269 } else {
3270 /* Before register allocation, we didn't tell the scheduler about the
3271 * MRF we use. We know it's safe to use this MRF because nothing
3272 * else does except for register spill/unspill, which generates and
3273 * uses its MRF within a single IR instruction.
3274 */
3275 inst->base_mrf = 14;
3276 inst->mlen = 1;
3277 }
3278 }
3279 }
3280
3281 bool
3282 fs_visitor::lower_load_payload()
3283 {
3284 bool progress = false;
3285
3286 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3287 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3288 continue;
3289
3290 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3291 assert(inst->saturate == false);
3292
3293 const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
3294 .exec_all(inst->force_writemask_all)
3295 .at(block, inst);
3296 fs_reg dst = inst->dst;
3297
3298 /* Get rid of COMPR4. We'll add it back in if we need it */
3299 if (dst.file == MRF)
3300 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3301
3302 dst.width = 8;
3303 for (uint8_t i = 0; i < inst->header_size; i++) {
3304 if (inst->src[i].file != BAD_FILE) {
3305 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3306 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3307 mov_src.width = 8;
3308 ibld.exec_all().MOV(mov_dst, mov_src);
3309 }
3310 dst = offset(dst, 1);
3311 }
3312
3313 dst.width = inst->exec_size;
3314 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3315 inst->exec_size > 8) {
3316 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3317 * a straightforward copy. Instead, the result of the
3318 * LOAD_PAYLOAD is treated as interleaved and the first four
3319 * non-header sources are unpacked as:
3320 *
3321 * m + 0: r0
3322 * m + 1: g0
3323 * m + 2: b0
3324 * m + 3: a0
3325 * m + 4: r1
3326 * m + 5: g1
3327 * m + 6: b1
3328 * m + 7: a1
3329 *
3330 * This is used for gen <= 5 fb writes.
3331 */
3332 assert(inst->exec_size == 16);
3333 assert(inst->header_size + 4 <= inst->sources);
3334 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3335 if (inst->src[i].file != BAD_FILE) {
3336 if (devinfo->has_compr4) {
3337 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3338 compr4_dst.reg |= BRW_MRF_COMPR4;
3339 ibld.MOV(compr4_dst, inst->src[i]);
3340 } else {
3341 /* Platform doesn't have COMPR4. We have to fake it */
3342 fs_reg mov_dst = retype(dst, inst->src[i].type);
3343 mov_dst.width = 8;
3344 ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
3345 ibld.half(1).MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3346 }
3347 }
3348
3349 dst.reg++;
3350 }
3351
3352 /* The loop above only ever incremented us through the first set
3353 * of 4 registers. However, thanks to the magic of COMPR4, we
3354 * actually wrote to the first 8 registers, so we need to take
3355 * that into account now.
3356 */
3357 dst.reg += 4;
3358
3359 /* The COMPR4 code took care of the first 4 sources. We'll let
3360 * the regular path handle any remaining sources. Yes, we are
3361 * modifying the instruction but we're about to delete it so
3362 * this really doesn't hurt anything.
3363 */
3364 inst->header_size += 4;
3365 }
3366
3367 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3368 if (inst->src[i].file != BAD_FILE)
3369 ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
3370 dst = offset(dst, 1);
3371 }
3372
3373 inst->remove(block);
3374 progress = true;
3375 }
3376
3377 if (progress)
3378 invalidate_live_intervals();
3379
3380 return progress;
3381 }
3382
3383 bool
3384 fs_visitor::lower_integer_multiplication()
3385 {
3386 bool progress = false;
3387
3388 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3389 * directly, but Cherryview cannot.
3390 */
3391 if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3392 return false;
3393
3394 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3395 if (inst->opcode != BRW_OPCODE_MUL ||
3396 inst->dst.is_accumulator() ||
3397 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3398 inst->dst.type != BRW_REGISTER_TYPE_UD))
3399 continue;
3400
3401 const fs_builder ibld = bld.at(block, inst);
3402
3403 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3404 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3405 * src1 are used.
3406 *
3407 * If multiplying by an immediate value that fits in 16-bits, do a
3408 * single MUL instruction with that value in the proper location.
3409 */
3410 if (inst->src[1].file == IMM &&
3411 inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3412 if (devinfo->gen < 7) {
3413 fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3414 inst->dst.type, dispatch_width);
3415 ibld.MOV(imm, inst->src[1]);
3416 ibld.MUL(inst->dst, imm, inst->src[0]);
3417 } else {
3418 ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
3419 }
3420 } else {
3421 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3422 * do 32-bit integer multiplication in one instruction, but instead
3423 * must do a sequence (which actually calculates a 64-bit result):
3424 *
3425 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3426 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3427 * mov(8) g2<1>D acc0<8,8,1>D
3428 *
3429        * But on Gen > 6, the ability to use the second accumulator register
3430 * (acc1) for non-float data types was removed, preventing a simple
3431 * implementation in SIMD16. A 16-channel result can be calculated by
3432 * executing the three instructions twice in SIMD8, once with quarter
3433 * control of 1Q for the first eight channels and again with 2Q for
3434 * the second eight channels.
3435 *
3436 * Which accumulator register is implicitly accessed (by AccWrEnable
3437 * for instance) is determined by the quarter control. Unfortunately
3438 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3439 * implicit accumulator access by an instruction with 2Q will access
3440 * acc1 regardless of whether the data type is usable in acc1.
3441 *
3442 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3443 * integer data types.
3444 *
3445 * Since we only want the low 32-bits of the result, we can do two
3446 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3447 * adjust the high result and add them (like the mach is doing):
3448 *
3449 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3450 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3451 * shl(8) g9<1>D g8<8,8,1>D 16D
3452 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3453 *
3454 * We avoid the shl instruction by realizing that we only want to add
3455 * the low 16-bits of the "high" result to the high 16-bits of the
3456 * "low" result and using proper regioning on the add:
3457 *
3458 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3459 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3460 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3461 *
3462 * Since it does not use the (single) accumulator register, we can
3463 * schedule multi-component multiplications much better.
3464 */
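         /* A rough numeric check (not from the PRM): writing one operand as
          * hi * 0x10000 + lo, the low 32 bits of the product equal
          * other * lo + ((other * hi) << 16) mod 2^32, and only the low 16
          * bits of other * hi survive the shift, which is what the two
          * word-wise MULs and the strided UW ADD below compute.
          */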
3465
3466 if (inst->conditional_mod && inst->dst.is_null()) {
3467 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3468 inst->dst.type, dispatch_width);
3469 }
3470 fs_reg low = inst->dst;
3471 fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3472 inst->dst.type, dispatch_width);
3473
3474          if (devinfo->gen >= 7) {
3475 fs_reg src1_0_w = inst->src[1];
3476 fs_reg src1_1_w = inst->src[1];
3477
3478 if (inst->src[1].file == IMM) {
3479 src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3480 src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3481 } else {
3482 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3483 src1_0_w.stride = 2;
3484
3485 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3486 src1_1_w.stride = 2;
3487 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3488 }
3489 ibld.MUL(low, inst->src[0], src1_0_w);
3490 ibld.MUL(high, inst->src[0], src1_1_w);
3491 } else {
3492 fs_reg src0_0_w = inst->src[0];
3493 fs_reg src0_1_w = inst->src[0];
3494
3495 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3496 src0_0_w.stride = 2;
3497
3498 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3499 src0_1_w.stride = 2;
3500 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3501
3502 ibld.MUL(low, src0_0_w, inst->src[1]);
3503 ibld.MUL(high, src0_1_w, inst->src[1]);
3504 }
3505
3506 fs_reg dst = inst->dst;
3507 dst.type = BRW_REGISTER_TYPE_UW;
3508 dst.subreg_offset = 2;
3509 dst.stride = 2;
3510
3511 high.type = BRW_REGISTER_TYPE_UW;
3512 high.stride = 2;
3513
3514 low.type = BRW_REGISTER_TYPE_UW;
3515 low.subreg_offset = 2;
3516 low.stride = 2;
3517
3518 ibld.ADD(dst, low, high);
3519
3520 if (inst->conditional_mod) {
3521 fs_reg null(retype(brw_null_reg(), inst->dst.type));
3522 set_condmod(inst->conditional_mod,
3523 ibld.MOV(null, inst->dst));
3524 }
3525 }
3526
3527 inst->remove(block);
3528 progress = true;
3529 }
3530
3531 if (progress)
3532 invalidate_live_intervals();
3533
3534 return progress;
3535 }
3536
3537 void
3538 fs_visitor::dump_instructions()
3539 {
3540 dump_instructions(NULL);
3541 }
3542
3543 void
3544 fs_visitor::dump_instructions(const char *name)
3545 {
3546 FILE *file = stderr;
3547 if (name && geteuid() != 0) {
3548 file = fopen(name, "w");
3549 if (!file)
3550 file = stderr;
3551 }
3552
3553 if (cfg) {
3554 calculate_register_pressure();
3555 int ip = 0, max_pressure = 0;
3556 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3557 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3558 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3559 dump_instruction(inst, file);
3560 ip++;
3561 }
3562 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3563 } else {
3564 int ip = 0;
3565 foreach_in_list(backend_instruction, inst, &instructions) {
3566 fprintf(file, "%4d: ", ip++);
3567 dump_instruction(inst, file);
3568 }
3569 }
3570
3571 if (file != stderr) {
3572 fclose(file);
3573 }
3574 }
3575
3576 void
3577 fs_visitor::dump_instruction(backend_instruction *be_inst)
3578 {
3579 dump_instruction(be_inst, stderr);
3580 }
3581
3582 void
3583 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3584 {
3585 fs_inst *inst = (fs_inst *)be_inst;
3586
3587 if (inst->predicate) {
3588 fprintf(file, "(%cf0.%d) ",
3589 inst->predicate_inverse ? '-' : '+',
3590 inst->flag_subreg);
3591 }
3592
3593 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3594 if (inst->saturate)
3595 fprintf(file, ".sat");
3596 if (inst->conditional_mod) {
3597 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3598 if (!inst->predicate &&
3599 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3600 inst->opcode != BRW_OPCODE_IF &&
3601 inst->opcode != BRW_OPCODE_WHILE))) {
3602 fprintf(file, ".f0.%d", inst->flag_subreg);
3603 }
3604 }
3605 fprintf(file, "(%d) ", inst->exec_size);
3606
3607 if (inst->mlen) {
3608 fprintf(file, "(mlen: %d) ", inst->mlen);
3609 }
3610
3611 switch (inst->dst.file) {
3612 case GRF:
3613 fprintf(file, "vgrf%d", inst->dst.reg);
3614 if (inst->dst.width != dispatch_width)
3615 fprintf(file, "@%d", inst->dst.width);
3616 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3617 inst->dst.subreg_offset)
3618 fprintf(file, "+%d.%d",
3619 inst->dst.reg_offset, inst->dst.subreg_offset);
3620 break;
3621 case MRF:
3622 fprintf(file, "m%d", inst->dst.reg);
3623 break;
3624 case BAD_FILE:
3625 fprintf(file, "(null)");
3626 break;
3627 case UNIFORM:
3628 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3629 break;
3630 case ATTR:
3631 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3632 break;
3633 case HW_REG:
3634 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3635 switch (inst->dst.fixed_hw_reg.nr) {
3636 case BRW_ARF_NULL:
3637 fprintf(file, "null");
3638 break;
3639 case BRW_ARF_ADDRESS:
3640 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3641 break;
3642 case BRW_ARF_ACCUMULATOR:
3643 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3644 break;
3645 case BRW_ARF_FLAG:
3646 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3647 inst->dst.fixed_hw_reg.subnr);
3648 break;
3649 default:
3650 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3651 inst->dst.fixed_hw_reg.subnr);
3652 break;
3653 }
3654 } else {
3655 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3656 }
3657 if (inst->dst.fixed_hw_reg.subnr)
3658 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3659 break;
3660 default:
3661 fprintf(file, "???");
3662 break;
3663 }
3664 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3665
3666 for (int i = 0; i < inst->sources; i++) {
3667 if (inst->src[i].negate)
3668 fprintf(file, "-");
3669 if (inst->src[i].abs)
3670 fprintf(file, "|");
3671 switch (inst->src[i].file) {
3672 case GRF:
3673 fprintf(file, "vgrf%d", inst->src[i].reg);
3674 if (inst->src[i].width != dispatch_width)
3675 fprintf(file, "@%d", inst->src[i].width);
3676 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3677 inst->src[i].subreg_offset)
3678 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3679 inst->src[i].subreg_offset);
3680 break;
3681 case MRF:
3682 fprintf(file, "***m%d***", inst->src[i].reg);
3683 break;
3684 case ATTR:
3685 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3686 break;
3687 case UNIFORM:
3688 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3689 if (inst->src[i].reladdr) {
3690 fprintf(file, "+reladdr");
3691 } else if (inst->src[i].subreg_offset) {
3692 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3693 inst->src[i].subreg_offset);
3694 }
3695 break;
3696 case BAD_FILE:
3697 fprintf(file, "(null)");
3698 break;
3699 case IMM:
3700 switch (inst->src[i].type) {
3701 case BRW_REGISTER_TYPE_F:
3702 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3703 break;
3704 case BRW_REGISTER_TYPE_W:
3705 case BRW_REGISTER_TYPE_D:
3706 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3707 break;
3708 case BRW_REGISTER_TYPE_UW:
3709 case BRW_REGISTER_TYPE_UD:
3710 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3711 break;
3712 case BRW_REGISTER_TYPE_VF:
3713 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3714 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3715 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3716 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3717 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3718 break;
3719 default:
3720 fprintf(file, "???");
3721 break;
3722 }
3723 break;
3724 case HW_REG:
3725 if (inst->src[i].fixed_hw_reg.negate)
3726 fprintf(file, "-");
3727 if (inst->src[i].fixed_hw_reg.abs)
3728 fprintf(file, "|");
3729 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3730 switch (inst->src[i].fixed_hw_reg.nr) {
3731 case BRW_ARF_NULL:
3732 fprintf(file, "null");
3733 break;
3734 case BRW_ARF_ADDRESS:
3735 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3736 break;
3737 case BRW_ARF_ACCUMULATOR:
3738 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3739 break;
3740 case BRW_ARF_FLAG:
3741 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3742 inst->src[i].fixed_hw_reg.subnr);
3743 break;
3744 default:
3745 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3746 inst->src[i].fixed_hw_reg.subnr);
3747 break;
3748 }
3749 } else {
3750 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3751 }
3752 if (inst->src[i].fixed_hw_reg.subnr)
3753 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3754 if (inst->src[i].fixed_hw_reg.abs)
3755 fprintf(file, "|");
3756 break;
3757 default:
3758 fprintf(file, "???");
3759 break;
3760 }
3761 if (inst->src[i].abs)
3762 fprintf(file, "|");
3763
3764 if (inst->src[i].file != IMM) {
3765 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3766 }
3767
3768 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3769 fprintf(file, ", ");
3770 }
3771
3772 fprintf(file, " ");
3773
3774 if (dispatch_width == 16 && inst->exec_size == 8) {
3775 if (inst->force_sechalf)
3776 fprintf(file, "2ndhalf ");
3777 else
3778 fprintf(file, "1sthalf ");
3779 }
3780
3781 fprintf(file, "\n");
3782 }
3783
3784 /**
3785 * Possibly returns an instruction that set up @param reg.
3786 *
3787 * Sometimes we want to take the result of some expression/variable
3788 * dereference tree and rewrite the instruction generating the result
3789 * of the tree. When processing the tree, we know that the
3790 * instructions generated are all writing temporaries that are dead
3791 * outside of this tree. So, if we have some instructions that write
3792 * a temporary, we're free to point that temp write somewhere else.
3793 *
3794  * Note that this doesn't guarantee that the returned instruction wrote
3795  * only reg -- it might be the size=4 destination of a texture instruction.
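 *
 * A hypothetical caller (names invented for illustration) would use it as:
 *
 *    fs_inst *mod = get_instruction_generating_reg(start, end, result);
 *    if (mod)
 *       mod->dst = final_dst;        /* retarget the temporary write */
 *    else
 *       bld.MOV(final_dst, result);  /* otherwise emit an extra MOV */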
3796 */
3797 fs_inst *
3798 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3799 fs_inst *end,
3800 const fs_reg &reg)
3801 {
3802 if (end == start ||
3803 end->is_partial_write() ||
3804 reg.reladdr ||
3805 !reg.equals(end->dst)) {
3806 return NULL;
3807 } else {
3808 return end;
3809 }
3810 }
3811
3812 void
3813 fs_visitor::setup_payload_gen6()
3814 {
3815 bool uses_depth =
3816 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3817 unsigned barycentric_interp_modes =
3818 (stage == MESA_SHADER_FRAGMENT) ?
3819 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3820
3821 assert(devinfo->gen >= 6);
3822
3823 /* R0-1: masks, pixel X/Y coordinates. */
3824 payload.num_regs = 2;
3825    /* R2: only for 32-pixel dispatch. */
3826
3827 /* R3-26: barycentric interpolation coordinates. These appear in the
3828 * same order that they appear in the brw_wm_barycentric_interp_mode
3829 * enum. Each set of coordinates occupies 2 registers if dispatch width
3830 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3831 * appear if they were enabled using the "Barycentric Interpolation
3832 * Mode" bits in WM_STATE.
3833 */
3834 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3835 if (barycentric_interp_modes & (1 << i)) {
3836 payload.barycentric_coord_reg[i] = payload.num_regs;
3837 payload.num_regs += 2;
3838 if (dispatch_width == 16) {
3839 payload.num_regs += 2;
3840 }
3841 }
3842 }
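   /* For example (illustrative): a SIMD16 shader that only uses the
    * perspective pixel barycentrics ends up with
    * payload.barycentric_coord_reg[0] == 2 and payload.num_regs == 6 here.
    */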
3843
3844    /* R27: interpolated depth if the shader uses source depth. */
3845 if (uses_depth) {
3846 payload.source_depth_reg = payload.num_regs;
3847 payload.num_regs++;
3848 if (dispatch_width == 16) {
3849 /* R28: interpolated depth if not SIMD8. */
3850 payload.num_regs++;
3851 }
3852 }
3853 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3854 if (uses_depth) {
3855 payload.source_w_reg = payload.num_regs;
3856 payload.num_regs++;
3857 if (dispatch_width == 16) {
3858 /* R30: interpolated W if not SIMD8. */
3859 payload.num_regs++;
3860 }
3861 }
3862
3863 if (stage == MESA_SHADER_FRAGMENT) {
3864 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3865 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3866 prog_data->uses_pos_offset = key->compute_pos_offset;
3867 /* R31: MSAA position offsets. */
3868 if (prog_data->uses_pos_offset) {
3869 payload.sample_pos_reg = payload.num_regs;
3870 payload.num_regs++;
3871 }
3872 }
3873
3874 /* R32: MSAA input coverage mask */
3875 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3876 assert(devinfo->gen >= 7);
3877 payload.sample_mask_in_reg = payload.num_regs;
3878 payload.num_regs++;
3879 if (dispatch_width == 16) {
3880 /* R33: input coverage mask if not SIMD8. */
3881 payload.num_regs++;
3882 }
3883 }
3884
3885 /* R34-: bary for 32-pixel. */
3886 /* R58-59: interp W for 32-pixel. */
3887
3888 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3889 source_depth_to_render_target = true;
3890 }
3891 }
3892
3893 void
3894 fs_visitor::setup_vs_payload()
3895 {
3896 /* R0: thread header, R1: urb handles */
3897 payload.num_regs = 2;
3898 }
3899
3900 void
3901 fs_visitor::setup_cs_payload()
3902 {
3903 assert(brw->gen >= 7);
3904
3905 payload.num_regs = 1;
3906 }
3907
3908 void
3909 fs_visitor::assign_binding_table_offsets()
3910 {
3911 assert(stage == MESA_SHADER_FRAGMENT);
3912 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3913 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3914 uint32_t next_binding_table_offset = 0;
3915
3916 /* If there are no color regions, we still perform an FB write to a null
3917 * renderbuffer, which we place at surface index 0.
3918 */
3919 prog_data->binding_table.render_target_start = next_binding_table_offset;
3920 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3921
3922 assign_common_binding_table_offsets(next_binding_table_offset);
3923 }
3924
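/**
 * Compute an estimate of register pressure at each instruction: for every
 * virtual GRF, its allocated size is added to regs_live_at_ip[] for all
 * instruction positions covered by its live interval.
 */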
3925 void
3926 fs_visitor::calculate_register_pressure()
3927 {
3928 invalidate_live_intervals();
3929 calculate_live_intervals();
3930
3931 unsigned num_instructions = 0;
3932 foreach_block(block, cfg)
3933 num_instructions += block->instructions.length();
3934
3935 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3936
3937 for (unsigned reg = 0; reg < alloc.count; reg++) {
3938 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3939 regs_live_at_ip[ip] += alloc.sizes[reg];
3940 }
3941 }
3942
3943 void
3944 fs_visitor::optimize()
3945 {
3946    /* bld is the common builder object we used while translating the program
3947     * into i965 IR; it still points at the end of the program.  For the
3948     * optimization and lowering passes coming next, any code added after the
3949     * end of the program without explicitly calling fs_builder::at() clearly
3950     * points at a mistake.  Ideally optimization passes wouldn't be part of
3951     * the visitor so they wouldn't have access to bld at all, but they do, so
3952     * just in case some pass forgets to ask for a location explicitly, set it
3953     * to NULL here to make it trip.
3954 */
3955 bld = bld.at(NULL, NULL);
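   /* A pass that does need to emit code is expected to re-point the builder
    * itself first, e.g. (sketch only):
    *
    *    const fs_builder ibld = bld.at(block, inst);
    *    ibld.MOV(tmp, src);
    */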
3956
3957 split_virtual_grfs();
3958
3959 move_uniform_array_access_to_pull_constants();
3960 assign_constant_locations();
3961 demote_pull_constants();
3962
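/* Run one optimization pass.  When the DEBUG_OPTIMIZER flag is set and the
 * pass reported progress, dump the instruction stream to a file named after
 * the stage, dispatch width, program name, iteration and pass.  Evaluates to
 * whether the pass made progress.
 */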
3963 #define OPT(pass, args...) ({ \
3964 pass_num++; \
3965 bool this_progress = pass(args); \
3966 \
3967 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3968 char filename[64]; \
3969 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3970 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3971 \
3972 backend_shader::dump_instructions(filename); \
3973 } \
3974 \
3975 progress = progress || this_progress; \
3976 this_progress; \
3977 })
3978
3979 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3980 char filename[64];
3981 snprintf(filename, 64, "%s%d-%04d-00-start",
3982 stage_abbrev, dispatch_width,
3983 shader_prog ? shader_prog->Name : 0);
3984
3985 backend_shader::dump_instructions(filename);
3986 }
3987
3988 bool progress;
3989 int iteration = 0;
3990 int pass_num = 0;
3991 do {
3992 progress = false;
3993 pass_num = 0;
3994 iteration++;
3995
3996 OPT(remove_duplicate_mrf_writes);
3997
3998 OPT(opt_algebraic);
3999 OPT(opt_cse);
4000 OPT(opt_copy_propagate);
4001 OPT(opt_peephole_predicated_break);
4002 OPT(opt_cmod_propagation);
4003 OPT(dead_code_eliminate);
4004 OPT(opt_peephole_sel);
4005 OPT(dead_control_flow_eliminate, this);
4006 OPT(opt_register_renaming);
4007 OPT(opt_redundant_discard_jumps);
4008 OPT(opt_saturate_propagation);
4009 OPT(opt_zero_samples);
4010 OPT(register_coalesce);
4011 OPT(compute_to_mrf);
4012 OPT(eliminate_find_live_channel);
4013
4014 OPT(compact_virtual_grfs);
4015 } while (progress);
4016
4017 pass_num = 0;
4018
4019 OPT(opt_sampler_eot);
4020
4021 if (OPT(lower_load_payload)) {
4022 split_virtual_grfs();
4023 OPT(register_coalesce);
4024 OPT(compute_to_mrf);
4025 OPT(dead_code_eliminate);
4026 }
4027
4028 OPT(opt_combine_constants);
4029 OPT(lower_integer_multiplication);
4030
4031 lower_uniform_pull_constant_loads();
4032 }
4033
4034 /**
4035  * Three-source instructions must have a GRF/MRF destination register;
4036  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
4037 */
4038 void
4039 fs_visitor::fixup_3src_null_dest()
4040 {
4041 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4042 if (inst->is_3src() && inst->dst.is_null()) {
4043 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4044 inst->dst.type);
4045 }
4046 }
4047 }
4048
4049 void
4050 fs_visitor::allocate_registers()
4051 {
4052 bool allocated_without_spills;
4053
4054 static const enum instruction_scheduler_mode pre_modes[] = {
4055 SCHEDULE_PRE,
4056 SCHEDULE_PRE_NON_LIFO,
4057 SCHEDULE_PRE_LIFO,
4058 };
4059
4060 /* Try each scheduling heuristic to see if it can successfully register
4061 * allocate without spilling. They should be ordered by decreasing
4062 * performance but increasing likelihood of allocating.
4063 */
4064 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4065 schedule_instructions(pre_modes[i]);
4066
4067 if (0) {
4068 assign_regs_trivial();
4069 allocated_without_spills = true;
4070 } else {
4071 allocated_without_spills = assign_regs(false);
4072 }
4073 if (allocated_without_spills)
4074 break;
4075 }
4076
4077 if (!allocated_without_spills) {
4078 /* We assume that any spilling is worse than just dropping back to
4079 * SIMD8. There's probably actually some intermediate point where
4080 * SIMD16 with a couple of spills is still better.
4081 */
4082 if (dispatch_width == 16) {
4083 fail("Failure to register allocate. Reduce number of "
4084 "live scalar values to avoid this.");
4085 } else {
4086 perf_debug("%s shader triggered register spilling. "
4087 "Try reducing the number of live scalar values to "
4088 "improve performance.\n", stage_name);
4089 }
4090
4091 /* Since we're out of heuristics, just go spill registers until we
4092 * get an allocation.
4093 */
4094 while (!assign_regs(true)) {
4095 if (failed)
4096 break;
4097 }
4098 }
4099
4100 /* This must come after all optimization and register allocation, since
4101 * it inserts dead code that happens to have side effects, and it does
4102 * so based on the actual physical registers in use.
4103 */
4104 insert_gen4_send_dependency_workarounds();
4105
4106 if (failed)
4107 return;
4108
4109 if (!allocated_without_spills)
4110 schedule_instructions(SCHEDULE_POST);
4111
4112 if (last_scratch > 0)
4113 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4114 }
4115
4116 bool
4117 fs_visitor::run_vs()
4118 {
4119 assert(stage == MESA_SHADER_VERTEX);
4120
4121 assign_common_binding_table_offsets(0);
4122 setup_vs_payload();
4123
4124 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4125 emit_shader_time_begin();
4126
4127 emit_nir_code();
4128
4129 if (failed)
4130 return false;
4131
4132 emit_urb_writes();
4133
4134 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4135 emit_shader_time_end();
4136
4137 calculate_cfg();
4138
4139 optimize();
4140
4141 assign_curb_setup();
4142 assign_vs_urb_setup();
4143
4144 fixup_3src_null_dest();
4145 allocate_registers();
4146
4147 return !failed;
4148 }
4149
4150 bool
4151 fs_visitor::run_fs()
4152 {
4153 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4154 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4155
4156 assert(stage == MESA_SHADER_FRAGMENT);
4157
4158 sanity_param_count = prog->Parameters->NumParameters;
4159
4160 assign_binding_table_offsets();
4161
4162 if (devinfo->gen >= 6)
4163 setup_payload_gen6();
4164 else
4165 setup_payload_gen4();
4166
4167 if (0) {
4168 emit_dummy_fs();
4169 } else if (brw->use_rep_send && dispatch_width == 16) {
4170 emit_repclear_shader();
4171 } else {
4172 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4173 emit_shader_time_begin();
4174
4175 calculate_urb_setup();
4176 if (prog->InputsRead > 0) {
4177 if (devinfo->gen < 6)
4178 emit_interpolation_setup_gen4();
4179 else
4180 emit_interpolation_setup_gen6();
4181 }
4182
4183 /* We handle discards by keeping track of the still-live pixels in f0.1.
4184 * Initialize it with the dispatched pixels.
4185 */
4186 if (wm_prog_data->uses_kill) {
4187 fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4188 discard_init->flag_subreg = 1;
4189 }
4190
4191 /* Generate FS IR for main(). (the visitor only descends into
4192 * functions called "main").
4193 */
4194 emit_nir_code();
4195
4196 if (failed)
4197 return false;
4198
4199 if (wm_prog_data->uses_kill)
4200 bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
4201
4202 if (wm_key->alpha_test_func)
4203 emit_alpha_test();
4204
4205 emit_fb_writes();
4206
4207 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4208 emit_shader_time_end();
4209
4210 calculate_cfg();
4211
4212 optimize();
4213
4214 assign_curb_setup();
4215 assign_urb_setup();
4216
4217 fixup_3src_null_dest();
4218 allocate_registers();
4219
4220 if (failed)
4221 return false;
4222 }
4223
4224 if (dispatch_width == 8)
4225 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4226 else
4227 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4228
4229 /* If any state parameters were appended, then ParameterValues could have
4230 * been realloced, in which case the driver uniform storage set up by
4231 * _mesa_associate_uniform_storage() would point to freed memory. Make
4232 * sure that didn't happen.
4233 */
4234 assert(sanity_param_count == prog->Parameters->NumParameters);
4235
4236 return !failed;
4237 }
4238
4239 bool
4240 fs_visitor::run_cs()
4241 {
4242 assert(stage == MESA_SHADER_COMPUTE);
4243 assert(shader);
4244
4245 sanity_param_count = prog->Parameters->NumParameters;
4246
4247 assign_common_binding_table_offsets(0);
4248
4249 setup_cs_payload();
4250
4251 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4252 emit_shader_time_begin();
4253
4254 emit_nir_code();
4255
4256 if (failed)
4257 return false;
4258
4259 emit_cs_terminate();
4260
4261 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4262 emit_shader_time_end();
4263
4264 calculate_cfg();
4265
4266 optimize();
4267
4268 assign_curb_setup();
4269
4270 fixup_3src_null_dest();
4271 allocate_registers();
4272
4273 if (failed)
4274 return false;
4275
4276 /* If any state parameters were appended, then ParameterValues could have
4277 * been realloced, in which case the driver uniform storage set up by
4278 * _mesa_associate_uniform_storage() would point to freed memory. Make
4279 * sure that didn't happen.
4280 */
4281 assert(sanity_param_count == prog->Parameters->NumParameters);
4282
4283 return !failed;
4284 }
4285
4286 const unsigned *
4287 brw_wm_fs_emit(struct brw_context *brw,
4288 void *mem_ctx,
4289 const struct brw_wm_prog_key *key,
4290 struct brw_wm_prog_data *prog_data,
4291 struct gl_fragment_program *fp,
4292 struct gl_shader_program *prog,
4293 unsigned *final_assembly_size)
4294 {
4295 bool start_busy = false;
4296 double start_time = 0;
4297
4298 if (unlikely(brw->perf_debug)) {
4299 start_busy = (brw->batch.last_bo &&
4300 drm_intel_bo_busy(brw->batch.last_bo));
4301 start_time = get_time();
4302 }
4303
4304 struct brw_shader *shader = NULL;
4305 if (prog)
4306 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4307
4308 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4309 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4310
4311 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4312 */
4313 fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4314 prog, &fp->Base, 8);
4315 if (!v.run_fs()) {
4316 if (prog) {
4317 prog->LinkStatus = false;
4318 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4319 }
4320
4321 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4322 v.fail_msg);
4323
4324 return NULL;
4325 }
4326
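   /* Also try a SIMD16 compile of the same shader, unless SIMD16 has been
    * disabled via debug flags or the SIMD8 visitor already flagged it as
    * unsupported; on failure we simply keep the SIMD8 program.
    */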
4327 cfg_t *simd16_cfg = NULL;
4328 fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4329 prog, &fp->Base, 16);
4330 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4331 if (!v.simd16_unsupported) {
4332 /* Try a SIMD16 compile */
4333 v2.import_uniforms(&v);
4334 if (!v2.run_fs()) {
4335 perf_debug("SIMD16 shader failed to compile, falling back to "
4336 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4337 } else {
4338 simd16_cfg = v2.cfg;
4339 }
4340 } else {
4341 perf_debug("SIMD16 shader unsupported, falling back to "
4342 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4343 }
4344 }
4345
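   /* Decide whether to ship the SIMD8 program at all: when the check below
    * says SIMD8 isn't wanted and a SIMD16 program is available, drop it and
    * record that in prog_data->no_8.
    */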
4346 cfg_t *simd8_cfg;
4347 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4348 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4349 simd8_cfg = NULL;
4350 prog_data->no_8 = true;
4351 } else {
4352 simd8_cfg = v.cfg;
4353 prog_data->no_8 = false;
4354 }
4355
4356 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4357 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4358
4359 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4360 char *name;
4361 if (prog)
4362 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4363 prog->Label ? prog->Label : "unnamed",
4364 prog->Name);
4365 else
4366 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4367
4368 g.enable_debug(name);
4369 }
4370
4371 if (simd8_cfg)
4372 g.generate_code(simd8_cfg, 8);
4373 if (simd16_cfg)
4374 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4375
4376 if (unlikely(brw->perf_debug) && shader) {
4377 if (shader->compiled_once)
4378 brw_wm_debug_recompile(brw, prog, key);
4379 shader->compiled_once = true;
4380
4381 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4382 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4383 (get_time() - start_time) * 1000);
4384 }
4385 }
4386
4387 return g.get_assembly(final_assembly_size);
4388 }
4389
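/**
 * Precompile the fragment program with a guessed, most-likely program key so
 * a compiled variant is ready before the first draw.  The previously bound
 * WM program state is restored afterwards.
 */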
4390 extern "C" bool
4391 brw_fs_precompile(struct gl_context *ctx,
4392 struct gl_shader_program *shader_prog,
4393 struct gl_program *prog)
4394 {
4395 struct brw_context *brw = brw_context(ctx);
4396 struct brw_wm_prog_key key;
4397
4398 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4399 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4400 bool program_uses_dfdy = fp->UsesDFdy;
4401
4402 memset(&key, 0, sizeof(key));
4403
4404 if (brw->gen < 6) {
4405 if (fp->UsesKill)
4406 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4407
4408 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4409 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4410
4411 /* Just assume depth testing. */
4412 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4413 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4414 }
4415
4416 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4417 BRW_FS_VARYING_INPUT_MASK) > 16)
4418 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4419
4420 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4421
4422 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4423 key.drawable_height = ctx->DrawBuffer->Height;
4424 }
4425
4426 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4427 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4428 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4429
4430 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4431 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4432 key.nr_color_regions > 1;
4433 }
4434
4435 key.program_string_id = bfp->id;
4436
4437 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4438 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4439
4440 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4441
4442 brw->wm.base.prog_offset = old_prog_offset;
4443 brw->wm.prog_data = old_prog_data;
4444
4445 return success;
4446 }
4447
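/**
 * Guess the texture-swizzle portion of a precompile key: shadow samplers get
 * an XXX1 swizzle on hardware without shader channel select (pre-Haswell),
 * everything else is assumed to be unswizzled RGBA.
 */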
4448 void
4449 brw_setup_tex_for_precompile(struct brw_context *brw,
4450 struct brw_sampler_prog_key_data *tex,
4451 struct gl_program *prog)
4452 {
4453 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4454 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4455 for (unsigned i = 0; i < sampler_count; i++) {
4456 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4457 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4458 tex->swizzles[i] =
4459 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4460 } else {
4461 /* Color sampler: assume no swizzling. */
4462 tex->swizzles[i] = SWIZZLE_XYZW;
4463 }
4464 }
4465 }