src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 #include <sys/types.h>
  32
  33 #include "util/hash_table.h"
  34 #include "main/macros.h"
  35 #include "main/shaderobj.h"
  36 #include "main/fbobject.h"
  37 #include "program/prog_parameter.h"
  38 #include "program/prog_print.h"
  39 #include "util/register_allocate.h"
  40 #include "program/hash_table.h"
  41 #include "brw_context.h"
  42 #include "brw_eu.h"
  43 #include "brw_wm.h"
  44 #include "brw_fs.h"
  45 #include "brw_cfg.h"
  46 #include "brw_dead_control_flow.h"
  47 #include "main/uniforms.h"
  48 #include "brw_fs_live_variables.h"
  49 #include "glsl/glsl_types.h"
  50 #include "program/sampler.h"
  51
  52 using namespace brw;
  53
  54 void
  55 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
  56               const fs_reg *src, unsigned sources)
  57 {
  58    memset(this, 0, sizeof(*this));
  59
  60    this->src = new fs_reg[MAX2(sources, 3)];
  61    for (unsigned i = 0; i < sources; i++)
  62       this->src[i] = src[i];
  63
  64    this->opcode = opcode;
  65    this->dst = dst;
  66    this->sources = sources;
  67    this->exec_size = exec_size;
  68
  69    assert(dst.file != IMM && dst.file != UNIFORM);
  70
  71    assert(this->exec_size != 0);
  72
  73    this->conditional_mod = BRW_CONDITIONAL_NONE;
  74
  75    /* This will be the case for almost all instructions. */
  76    switch (dst.file) {
  77    case GRF:
  78    case HW_REG:
  79    case MRF:
  80    case ATTR:
  81       this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size),
  82                                         REG_SIZE);
  83       break;
  84    case BAD_FILE:
  85       this->regs_written = 0;
  86       break;
  87    case IMM:
  88    case UNIFORM:
  89       unreachable("Invalid destination register file");
  90    default:
  91       unreachable("Invalid register file");
  92    }
  93
  94    this->writes_accumulator = false;
  95 }
  96
  97 fs_inst::fs_inst()
  98 {
  99    init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
 100 }
 101
 102 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
 103 {
 104    init(opcode, exec_size, reg_undef, NULL, 0);
 105 }
 106
 107 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
 108 {
 109    init(opcode, exec_size, dst, NULL, 0);
 110 }
 111
 112 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 113                  const fs_reg &src0)
 114 {
 115    const fs_reg src[1] = { src0 };
 116    init(opcode, exec_size, dst, src, 1);
 117 }
 118
 119 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 120                  const fs_reg &src0, const fs_reg &src1)
 121 {
 122    const fs_reg src[2] = { src0, src1 };
 123    init(opcode, exec_size, dst, src, 2);
 124 }
 125
 126 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 127                  const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
 128 {
 129    const fs_reg src[3] = { src0, src1, src2 };
 130    init(opcode, exec_size, dst, src, 3);
 131 }
 132
 133 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
 134                  const fs_reg src[], unsigned sources)
 135 {
 136    init(opcode, exec_width, dst, src, sources);
 137 }
 138
 139 fs_inst::fs_inst(const fs_inst &that)
 140 {
 141    memcpy(this, &that, sizeof(that));
 142
 143    this->src = new fs_reg[MAX2(that.sources, 3)];
 144
 145    for (unsigned i = 0; i < that.sources; i++)
 146       this->src[i] = that.src[i];
 147 }
 148
 149 fs_inst::~fs_inst()
 150 {
 151    delete[] this->src;
 152 }
 153
 154 void
 155 fs_inst::resize_sources(uint8_t num_sources)
 156 {
 157    if (this->sources != num_sources) {
 158       fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
 159
 160       for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
 161          src[i] = this->src[i];
 162
 163       delete[] this->src;
 164       this->src = src;
 165       this->sources = num_sources;
 166    }
 167 }
 168
 169 void
 170 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
 171                                        const fs_reg &dst,
 172                                        const fs_reg &surf_index,
 173                                        const fs_reg &varying_offset,
 174                                        uint32_t const_offset)
 175 {
 176    /* We have our constant surface use a pitch of 4 bytes, so our index can
 177     * be any component of a vector, and then we load 4 contiguous
 178     * components starting from that.
 179     *
 180     * We break down the const_offset to a portion added to the variable
 181     * offset and a portion done using reg_offset, which means that if you
 182     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 183     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 184     * CSE can later notice that those loads are all the same and eliminate
 185     * the redundant ones.
 186     */
 187    fs_reg vec4_offset = vgrf(glsl_type::int_type);
 188    bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
 189
 190    int scale = 1;
 191    if (devinfo->gen == 4 && bld.dispatch_width() == 8) {
 192       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 193        * u, v, r) as parameters, or we can just use the SIMD16 message
 194        * consisting of (header, u).  We choose the second, at the cost of a
 195        * longer return length.
 196        */
 197       scale = 2;
 198    }
 199
 200    enum opcode op;
 201    if (devinfo->gen >= 7)
 202       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 203    else
 204       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 205
 206    int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
 207    fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written), dst.type);
 208    fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
 209    inst->regs_written = regs_written;
 210
 211    if (devinfo->gen < 7) {
 212       inst->base_mrf = 13;
 213       inst->header_size = 1;
 214       if (devinfo->gen == 4)
 215          inst->mlen = 3;
 216       else
 217          inst->mlen = 1 + bld.dispatch_width() / 8;
 218    }
 219
 220    bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale));
 221 }
 222
 223 /**
 224  * A helper for MOV generation for fixing up broken hardware SEND dependency
 225  * handling.
 226  */
 227 void
 228 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
 229 {
 230    /* The caller always wants uncompressed to emit the minimal extra
 231     * dependencies, and to avoid having to deal with aligning its regs to 2.
 232     */
 233    const fs_builder ubld = bld.annotate("send dependency resolve")
 234                               .half(0);
 235
 236    ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 237 }
 238
 239 bool
 240 fs_inst::equals(fs_inst *inst) const
 241 {
 242    return (opcode == inst->opcode &&
 243            dst.equals(inst->dst) &&
 244            src[0].equals(inst->src[0]) &&
 245            src[1].equals(inst->src[1]) &&
 246            src[2].equals(inst->src[2]) &&
 247            saturate == inst->saturate &&
 248            predicate == inst->predicate &&
 249            conditional_mod == inst->conditional_mod &&
 250            mlen == inst->mlen &&
 251            base_mrf == inst->base_mrf &&
 252            target == inst->target &&
 253            eot == inst->eot &&
 254            header_size == inst->header_size &&
 255            shadow_compare == inst->shadow_compare &&
 256            exec_size == inst->exec_size &&
 257            offset == inst->offset);
 258 }
 259
 260 bool
 261 fs_inst::overwrites_reg(const fs_reg &reg) const
 262 {
 263    return reg.in_range(dst, regs_written);
 264 }
 265
 266 bool
 267 fs_inst::is_send_from_grf() const
 268 {
 269    switch (opcode) {
 270    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
 271    case SHADER_OPCODE_SHADER_TIME_ADD:
 272    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
 273    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
 274    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
 275    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 276    case SHADER_OPCODE_UNTYPED_ATOMIC:
 277    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 278    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
 279    case SHADER_OPCODE_TYPED_ATOMIC:
 280    case SHADER_OPCODE_TYPED_SURFACE_READ:
 281    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
 282    case SHADER_OPCODE_URB_WRITE_SIMD8:
 283       return true;
 284    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 285       return src[1].file == GRF;
 286    case FS_OPCODE_FB_WRITE:
 287       return src[0].file == GRF;
 288    default:
 289       if (is_tex())
 290          return src[0].file == GRF;
 291
 292       return false;
 293    }
 294 }
 295
 296 bool
 297 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
 298 {
 299    if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
 300       return false;
 301
 302    fs_reg reg = this->src[0];
 303    if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
 304       return false;
 305
 306    if (grf_alloc.sizes[reg.reg] != this->regs_written)
 307       return false;
 308
 309    for (int i = 0; i < this->sources; i++) {
 310       reg.type = this->src[i].type;
 311       if (!this->src[i].equals(reg))
 312          return false;
 313
 314       if (i < this->header_size) {
 315          reg.reg_offset += 1;
 316       } else {
 317          reg.reg_offset += this->exec_size / 8;
 318       }
 319    }
 320
 321    return true;
 322 }
 323
 324 bool
 325 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
 326 {
 327    if (devinfo->gen == 6 && is_math())
 328       return false;
 329
 330    if (is_send_from_grf())
 331       return false;
 332
 333    if (!backend_instruction::can_do_source_mods())
 334       return false;
 335
 336    return true;
 337 }
 338
 339 bool
 340 fs_inst::has_side_effects() const
 341 {
 342    return this->eot || backend_instruction::has_side_effects();
 343 }
 344
 345 void
 346 fs_reg::init()
 347 {
 348    memset(this, 0, sizeof(*this));
 349    stride = 1;
 350 }
 351
 352 /** Generic unset register constructor. */
 353 fs_reg::fs_reg()
 354 {
 355    init();
 356    this->file = BAD_FILE;
 357 }
 358
 359 /** Immediate value constructor. */
 360 fs_reg::fs_reg(float f)
 361 {
 362    init();
 363    this->file = IMM;
 364    this->type = BRW_REGISTER_TYPE_F;
 365    this->stride = 0;
 366    this->fixed_hw_reg.dw1.f = f;
 367 }
 368
 369 /** Immediate value constructor. */
 370 fs_reg::fs_reg(int32_t i)
 371 {
 372    init();
 373    this->file = IMM;
 374    this->type = BRW_REGISTER_TYPE_D;
 375    this->stride = 0;
 376    this->fixed_hw_reg.dw1.d = i;
 377 }
 378
 379 /** Immediate value constructor. */
 380 fs_reg::fs_reg(uint32_t u)
 381 {
 382    init();
 383    this->file = IMM;
 384    this->type = BRW_REGISTER_TYPE_UD;
 385    this->stride = 0;
 386    this->fixed_hw_reg.dw1.ud = u;
 387 }
 388
 389 /** Vector float immediate value constructor. */
 390 fs_reg::fs_reg(uint8_t vf[4])
 391 {
 392    init();
 393    this->file = IMM;
 394    this->type = BRW_REGISTER_TYPE_VF;
 395    memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
 396 }
 397
 398 /** Vector float immediate value constructor. */
 399 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
 400 {
 401    init();
 402    this->file = IMM;
 403    this->type = BRW_REGISTER_TYPE_VF;
 404    this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
 405                                (vf1 <<  8) |
 406                                (vf2 << 16) |
 407                                (vf3 << 24);
 408 }
 409
 410 /** Fixed brw_reg. */
 411 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 412 {
 413    init();
 414    this->file = HW_REG;
 415    this->fixed_hw_reg = fixed_hw_reg;
 416    this->type = fixed_hw_reg.type;
 417 }
 418
 419 bool
 420 fs_reg::equals(const fs_reg &r) const
 421 {
 422    return (file == r.file &&
 423            reg == r.reg &&
 424            reg_offset == r.reg_offset &&
 425            subreg_offset == r.subreg_offset &&
 426            type == r.type &&
 427            negate == r.negate &&
 428            abs == r.abs &&
 429            !reladdr && !r.reladdr &&
 430            memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
 431            stride == r.stride);
 432 }
 433
 434 fs_reg &
 435 fs_reg::set_smear(unsigned subreg)
 436 {
 437    assert(file != HW_REG && file != IMM);
 438    subreg_offset = subreg * type_sz(type);
 439    stride = 0;
 440    return *this;
 441 }
 442
 443 bool
 444 fs_reg::is_contiguous() const
 445 {
 446    return stride == 1;
 447 }
 448
 449 unsigned
 450 fs_reg::component_size(unsigned width) const
 451 {
 452    const unsigned stride = (file != HW_REG ? this->stride :
 453                             fixed_hw_reg.hstride == 0 ? 0 :
 454                             1 << (fixed_hw_reg.hstride - 1));
 455    return MAX2(width * stride, 1) * type_sz(type);
 456 }
 457
 458 int
 459 fs_visitor::type_size(const struct glsl_type *type)
 460 {
 461    unsigned int size, i;
 462
 463    switch (type->base_type) {
 464    case GLSL_TYPE_UINT:
 465    case GLSL_TYPE_INT:
 466    case GLSL_TYPE_FLOAT:
 467    case GLSL_TYPE_BOOL:
 468       return type->components();
 469    case GLSL_TYPE_ARRAY:
 470       return type_size(type->fields.array) * type->length;
 471    case GLSL_TYPE_STRUCT:
 472       size = 0;
 473       for (i = 0; i < type->length; i++) {
 474          size += type_size(type->fields.structure[i].type);
 475       }
 476       return size;
 477    case GLSL_TYPE_SAMPLER:
 478       /* Samplers take up no register space, since they're baked in at
 479        * link time.
 480        */
 481       return 0;
 482    case GLSL_TYPE_ATOMIC_UINT:
 483       return 0;
 484    case GLSL_TYPE_IMAGE:
 485    case GLSL_TYPE_VOID:
 486    case GLSL_TYPE_ERROR:
 487    case GLSL_TYPE_INTERFACE:
 488    case GLSL_TYPE_DOUBLE:
 489       unreachable("not reached");
 490    }
 491
 492    return 0;
 493 }
 494
 495 /**
 496  * Create a MOV to read the timestamp register.
 497  *
 498  * The caller is responsible for emitting the MOV.  The return value is
 499  * the destination of the MOV, with extra parameters set.
 500  */
 501 fs_reg
 502 fs_visitor::get_timestamp(const fs_builder &bld)
 503 {
 504    assert(devinfo->gen >= 7);
 505
 506    fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 507                                           BRW_ARF_TIMESTAMP,
 508                                           0),
 509                              BRW_REGISTER_TYPE_UD));
 510
 511    fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
 512
 513    /* We want to read the 3 fields we care about even if it's not enabled in
 514     * the dispatch.
 515     */
 516    bld.group(4, 0).exec_all().MOV(dst, ts);
 517
 518    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 519     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 520     * which is plenty of time for our purposes.  It is identical across the
 521     * EUs, but since it's tracking GPU core speed it will increment at a
 522     * varying rate as render P-states change.
 523     *
 524     * The caller could also check if render P-states have changed (or anything
 525     * else that might disrupt timing) by setting smear to 2 and checking if
 526     * that field is != 0.
 527     */
 528    dst.set_smear(0);
 529
 530    return dst;
 531 }
 532
 533 void
 534 fs_visitor::emit_shader_time_begin()
 535 {
 536    shader_start_time = get_timestamp(bld.annotate("shader time start"));
 537 }
 538
 539 void
 540 fs_visitor::emit_shader_time_end()
 541 {
 542    /* Insert our code just before the final SEND with EOT. */
 543    exec_node *end = this->instructions.get_tail();
 544    assert(end && ((fs_inst *) end)->eot);
 545    const fs_builder ibld = bld.annotate("shader time end")
 546                               .exec_all().at(NULL, end);
 547
 548    fs_reg shader_end_time = get_timestamp(ibld);
 549
 550    /* Check that there weren't any timestamp reset events (assuming these
 551     * were the only two timestamp reads that happened).
 552     */
 553    fs_reg reset = shader_end_time;
 554    reset.set_smear(2);
 555    set_condmod(BRW_CONDITIONAL_Z,
 556                ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
 557    ibld.IF(BRW_PREDICATE_NORMAL);
 558
 559    fs_reg start = shader_start_time;
 560    start.negate = true;
 561    fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
 562    diff.set_smear(0);
 563
 564    const fs_builder cbld = ibld.group(1, 0);
 565    cbld.group(1, 0).ADD(diff, start, shader_end_time);
 566
 567    /* If there were no instructions between the two timestamp gets, the diff
 568     * is 2 cycles.  Remove that overhead, so I can forget about that when
 569     * trying to determine the time taken for single instructions.
 570     */
 571    cbld.ADD(diff, diff, fs_reg(-2u));
 572    SHADER_TIME_ADD(cbld, 0, diff);
 573    SHADER_TIME_ADD(cbld, 1, fs_reg(1u));
 574    ibld.emit(BRW_OPCODE_ELSE);
 575    SHADER_TIME_ADD(cbld, 2, fs_reg(1u));
 576    ibld.emit(BRW_OPCODE_ENDIF);
 577 }
 578
 579 void
 580 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
 581                             int shader_time_subindex,
 582                             fs_reg value)
 583 {
 584    int index = shader_time_index * 3 + shader_time_subindex;
 585    fs_reg offset = fs_reg(index * SHADER_TIME_STRIDE);
 586
 587    fs_reg payload;
 588    if (dispatch_width == 8)
 589       payload = vgrf(glsl_type::uvec2_type);
 590    else
 591       payload = vgrf(glsl_type::uint_type);
 592
 593    bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
 594 }
 595
 596 void
 597 fs_visitor::vfail(const char *format, va_list va)
 598 {
 599    char *msg;
 600
 601    if (failed)
 602       return;
 603
 604    failed = true;
 605
 606    msg = ralloc_vasprintf(mem_ctx, format, va);
 607    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
 608
 609    this->fail_msg = msg;
 610
 611    if (debug_enabled) {
 612       fprintf(stderr, "%s",  msg);
 613    }
 614 }
 615
 616 void
 617 fs_visitor::fail(const char *format, ...)
 618 {
 619    va_list va;
 620
 621    va_start(va, format);
 622    vfail(format, va);
 623    va_end(va);
 624 }
 625
 626 /**
 627  * Mark this program as impossible to compile in SIMD16 mode.
 628  *
 629  * During the SIMD8 compile (which happens first), we can detect and flag
 630  * things that are unsupported in SIMD16 mode, so the compiler can skip
 631  * the SIMD16 compile altogether.
 632  *
 633  * During a SIMD16 compile (if one happens anyway), this just calls fail().
 634  */
 635 void
 636 fs_visitor::no16(const char *msg)
 637 {
 638    if (dispatch_width == 16) {
 639       fail("%s", msg);
 640    } else {
 641       simd16_unsupported = true;
 642
 643       compiler->shader_perf_log(log_data,
 644                                 "SIMD16 shader failed to compile: %s", msg);
 645    }
 646 }
 647
 648 /**
 649  * Returns true if the instruction has a flag that means it won't
 650  * update an entire destination register.
 651  *
 652  * For example, dead code elimination and live variable analysis want to know
 653  * when a write to a variable screens off any preceding values that were in
 654  * it.
 655  */
 656 bool
 657 fs_inst::is_partial_write() const
 658 {
 659    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 660            (this->exec_size * type_sz(this->dst.type)) < 32 ||
 661            !this->dst.is_contiguous());
 662 }
 663
 664 int
 665 fs_inst::regs_read(int arg) const
 666 {
 667    unsigned components = 1;
 668    switch (opcode) {
 669    case FS_OPCODE_FB_WRITE:
 670    case SHADER_OPCODE_URB_WRITE_SIMD8:
 671    case SHADER_OPCODE_UNTYPED_ATOMIC:
 672    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 673    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
 674    case SHADER_OPCODE_TYPED_ATOMIC:
 675    case SHADER_OPCODE_TYPED_SURFACE_READ:
 676    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
 677    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 678       if (arg == 0)
 679          return mlen;
 680       break;
 681
 682    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
 683       /* The payload is actually stored in src1 */
 684       if (arg == 1)
 685          return mlen;
 686       break;
 687
 688    case FS_OPCODE_LINTERP:
 689       if (arg == 0)
 690          return exec_size / 4;
 691       else
 692          return 1;
 693
 694    case FS_OPCODE_PIXEL_X:
 695    case FS_OPCODE_PIXEL_Y:
 696       if (arg == 0)
 697          components = 2;
 698       break;
 699
 700    case SHADER_OPCODE_LOAD_PAYLOAD:
 701       if (arg < this->header_size)
 702          return 1;
 703       break;
 704
 705    case CS_OPCODE_CS_TERMINATE:
 706       return 1;
 707
 708    default:
 709       if (is_tex() && arg == 0 && src[0].file == GRF)
 710          return mlen;
 711       break;
 712    }
 713
 714    switch (src[arg].file) {
 715    case BAD_FILE:
 716    case UNIFORM:
 717    case IMM:
 718       return 1;
 719    case GRF:
 720    case HW_REG:
 721       return DIV_ROUND_UP(components * src[arg].component_size(exec_size),
 722                           REG_SIZE);
 723    case MRF:
 724       unreachable("MRF registers are not allowed as sources");
 725    default:
 726       unreachable("Invalid register file");
 727    }
 728 }
 729
 730 bool
 731 fs_inst::reads_flag() const
 732 {
 733    return predicate;
 734 }
 735
 736 bool
 737 fs_inst::writes_flag() const
 738 {
 739    return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
 740                                opcode != BRW_OPCODE_IF &&
 741                                opcode != BRW_OPCODE_WHILE)) ||
 742           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 743 }
 744
 745 /**
 746  * Returns how many MRFs an FS opcode will write over.
 747  *
 748  * Note that this is not the 0 or 1 implied writes in an actual gen
 749  * instruction -- the FS opcodes often generate MOVs in addition.
 750  */
 751 int
 752 fs_visitor::implied_mrf_writes(fs_inst *inst)
 753 {
 754    if (inst->mlen == 0)
 755       return 0;
 756
 757    if (inst->base_mrf == -1)
 758       return 0;
 759
 760    switch (inst->opcode) {
 761    case SHADER_OPCODE_RCP:
 762    case SHADER_OPCODE_RSQ:
 763    case SHADER_OPCODE_SQRT:
 764    case SHADER_OPCODE_EXP2:
 765    case SHADER_OPCODE_LOG2:
 766    case SHADER_OPCODE_SIN:
 767    case SHADER_OPCODE_COS:
 768       return 1 * dispatch_width / 8;
 769    case SHADER_OPCODE_POW:
 770    case SHADER_OPCODE_INT_QUOTIENT:
 771    case SHADER_OPCODE_INT_REMAINDER:
 772       return 2 * dispatch_width / 8;
 773    case SHADER_OPCODE_TEX:
 774    case FS_OPCODE_TXB:
 775    case SHADER_OPCODE_TXD:
 776    case SHADER_OPCODE_TXF:
 777    case SHADER_OPCODE_TXF_CMS:
 778    case SHADER_OPCODE_TXF_MCS:
 779    case SHADER_OPCODE_TG4:
 780    case SHADER_OPCODE_TG4_OFFSET:
 781    case SHADER_OPCODE_TXL:
 782    case SHADER_OPCODE_TXS:
 783    case SHADER_OPCODE_LOD:
 784       return 1;
 785    case FS_OPCODE_FB_WRITE:
 786       return 2;
 787    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 788    case SHADER_OPCODE_GEN4_SCRATCH_READ:
 789       return 1;
 790    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 791       return inst->mlen;
 792    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
 793       return inst->mlen;
 794    case SHADER_OPCODE_UNTYPED_ATOMIC:
 795    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 796    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
 797    case SHADER_OPCODE_TYPED_ATOMIC:
 798    case SHADER_OPCODE_TYPED_SURFACE_READ:
 799    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
 800    case SHADER_OPCODE_URB_WRITE_SIMD8:
 801    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
 802    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
 803    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
 804    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 805       return 0;
 806    default:
 807       unreachable("not reached");
 808    }
 809 }
 810
 811 fs_reg
 812 fs_visitor::vgrf(const glsl_type *const type)
 813 {
 814    int reg_width = dispatch_width / 8;
 815    return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
 816                  brw_type_for_base_type(type));
 817 }
 818
 819 /** Fixed HW reg constructor. */
 820 fs_reg::fs_reg(enum register_file file, int reg)
 821 {
 822    init();
 823    this->file = file;
 824    this->reg = reg;
 825    this->type = BRW_REGISTER_TYPE_F;
 826 }
 827
 828 /** Fixed HW reg constructor. */
 829 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
 830 {
 831    init();
 832    this->file = file;
 833    this->reg = reg;
 834    this->type = type;
 835 }
 836
 837 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 838  * This brings in those uniform definitions
 839  */
 840 void
 841 fs_visitor::import_uniforms(fs_visitor *v)
 842 {
 843    this->push_constant_loc = v->push_constant_loc;
 844    this->pull_constant_loc = v->pull_constant_loc;
 845    this->uniforms = v->uniforms;
 846    this->param_size = v->param_size;
 847 }
 848
 849 fs_reg *
 850 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
 851                                          bool origin_upper_left)
 852 {
 853    assert(stage == MESA_SHADER_FRAGMENT);
 854    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 855    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
 856    fs_reg wpos = *reg;
 857    bool flip = !origin_upper_left ^ key->render_to_fbo;
 858
 859    /* gl_FragCoord.x */
 860    if (pixel_center_integer) {
 861       bld.MOV(wpos, this->pixel_x);
 862    } else {
 863       bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
 864    }
 865    wpos = offset(wpos, bld, 1);
 866
 867    /* gl_FragCoord.y */
 868    if (!flip && pixel_center_integer) {
 869       bld.MOV(wpos, this->pixel_y);
 870    } else {
 871       fs_reg pixel_y = this->pixel_y;
 872       float offset = (pixel_center_integer ? 0.0 : 0.5);
 873
 874       if (flip) {
 875          pixel_y.negate = true;
 876          offset += key->drawable_height - 1.0;
 877       }
 878
 879       bld.ADD(wpos, pixel_y, fs_reg(offset));
 880    }
 881    wpos = offset(wpos, bld, 1);
 882
 883    /* gl_FragCoord.z */
 884    if (devinfo->gen >= 6) {
 885       bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
 886    } else {
 887       bld.emit(FS_OPCODE_LINTERP, wpos,
 888            this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 889            interp_reg(VARYING_SLOT_POS, 2));
 890    }
 891    wpos = offset(wpos, bld, 1);
 892
 893    /* gl_FragCoord.w: Already set up in emit_interpolation */
 894    bld.MOV(wpos, this->wpos_w);
 895
 896    return reg;
 897 }
 898
 899 fs_inst *
 900 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
 901                          glsl_interp_qualifier interpolation_mode,
 902                          bool is_centroid, bool is_sample)
 903 {
 904    brw_wm_barycentric_interp_mode barycoord_mode;
 905    if (devinfo->gen >= 6) {
 906       if (is_centroid) {
 907          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 908             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
 909          else
 910             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
 911       } else if (is_sample) {
 912           if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 913             barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
 914          else
 915             barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
 916       } else {
 917          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 918             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
 919          else
 920             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
 921       }
 922    } else {
 923       /* On Ironlake and below, there is only one interpolation mode.
 924        * Centroid interpolation doesn't mean anything on this hardware --
 925        * there is no multisampling.
 926        */
 927       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
 928    }
 929    return bld.emit(FS_OPCODE_LINTERP, attr,
 930                    this->delta_xy[barycoord_mode], interp);
 931 }
 932
 933 void
 934 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
 935                                        const glsl_type *type,
 936                                        glsl_interp_qualifier interpolation_mode,
 937                                        int location, bool mod_centroid,
 938                                        bool mod_sample)
 939 {
 940    attr.type = brw_type_for_base_type(type->get_scalar_type());
 941
 942    assert(stage == MESA_SHADER_FRAGMENT);
 943    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
 944    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 945
 946    unsigned int array_elements;
 947
 948    if (type->is_array()) {
 949       array_elements = type->length;
 950       if (array_elements == 0) {
 951          fail("dereferenced array '%s' has length 0\n", name);
 952       }
 953       type = type->fields.array;
 954    } else {
 955       array_elements = 1;
 956    }
 957
 958    if (interpolation_mode == INTERP_QUALIFIER_NONE) {
 959       bool is_gl_Color =
 960          location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
 961       if (key->flat_shade && is_gl_Color) {
 962          interpolation_mode = INTERP_QUALIFIER_FLAT;
 963       } else {
 964          interpolation_mode = INTERP_QUALIFIER_SMOOTH;
 965       }
 966    }
 967
 968    for (unsigned int i = 0; i < array_elements; i++) {
 969       for (unsigned int j = 0; j < type->matrix_columns; j++) {
 970          if (prog_data->urb_setup[location] == -1) {
 971             /* If there's no incoming setup data for this slot, don't
 972              * emit interpolation for it.
 973              */
 974             attr = offset(attr, bld, type->vector_elements);
 975             location++;
 976             continue;
 977          }
 978
 979          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
 980             /* Constant interpolation (flat shading) case. The SF has
 981              * handed us defined values in only the constant offset
 982              * field of the setup reg.
 983              */
 984             for (unsigned int k = 0; k < type->vector_elements; k++) {
 985                struct brw_reg interp = interp_reg(location, k);
 986                interp = suboffset(interp, 3);
 987                interp.type = attr.type;
 988                bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
 989                attr = offset(attr, bld, 1);
 990             }
 991          } else {
 992             /* Smooth/noperspective interpolation case. */
 993             for (unsigned int k = 0; k < type->vector_elements; k++) {
 994                struct brw_reg interp = interp_reg(location, k);
 995                if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
 996                   /* Get the pixel/sample mask into f0 so that we know
 997                    * which pixels are lit.  Then, for each channel that is
 998                    * unlit, replace the centroid data with non-centroid
 999                    * data.
1000                    */
1001                   bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1002
1003                   fs_inst *inst;
1004                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1005                                       false, false);
1006                   inst->predicate = BRW_PREDICATE_NORMAL;
1007                   inst->predicate_inverse = true;
1008                   if (devinfo->has_pln)
1009                      inst->no_dd_clear = true;
1010
1011                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1012                                       mod_centroid && !key->persample_shading,
1013                                       mod_sample || key->persample_shading);
1014                   inst->predicate = BRW_PREDICATE_NORMAL;
1015                   inst->predicate_inverse = false;
1016                   if (devinfo->has_pln)
1017                      inst->no_dd_check = true;
1018
1019                } else {
1020                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
1021                                mod_centroid && !key->persample_shading,
1022                                mod_sample || key->persample_shading);
1023                }
1024                if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1025                   bld.MUL(attr, attr, this->pixel_w);
1026                }
1027                attr = offset(attr, bld, 1);
1028             }
1029
1030          }
1031          location++;
1032       }
1033    }
1034 }
1035
1036 fs_reg *
1037 fs_visitor::emit_frontfacing_interpolation()
1038 {
1039    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1040
1041    if (devinfo->gen >= 6) {
1042       /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1043        * a boolean result from this (~0/true or 0/false).
1044        *
1045        * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1046        * this task in only one instruction:
1047        *    - a negation source modifier will flip the bit; and
1048        *    - a W -> D type conversion will sign extend the bit into the high
1049        *      word of the destination.
1050        *
1051        * An ASR 15 fills the low word of the destination.
1052        */
1053       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1054       g0.negate = true;
1055
1056       bld.ASR(*reg, g0, fs_reg(15));
1057    } else {
1058       /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1059        * a boolean result from this (1/true or 0/false).
1060        *
1061        * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1062        * the negation source modifier to flip it. Unfortunately the SHR
1063        * instruction only operates on UD (or D with an abs source modifier)
1064        * sources without negation.
1065        *
1066        * Instead, use ASR (which will give ~0/true or 0/false).
1067        */
1068       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1069       g1_6.negate = true;
1070
1071       bld.ASR(*reg, g1_6, fs_reg(31));
1072    }
1073
1074    return reg;
1075 }
1076
1077 void
1078 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1079 {
1080    assert(stage == MESA_SHADER_FRAGMENT);
1081    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1082    assert(dst.type == BRW_REGISTER_TYPE_F);
1083
1084    if (key->compute_pos_offset) {
1085       /* Convert int_sample_pos to floating point */
1086       bld.MOV(dst, int_sample_pos);
1087       /* Scale to the range [0, 1] */
1088       bld.MUL(dst, dst, fs_reg(1 / 16.0f));
1089    }
1090    else {
1091       /* From ARB_sample_shading specification:
1092        * "When rendering to a non-multisample buffer, or if multisample
1093        *  rasterization is disabled, gl_SamplePosition will always be
1094        *  (0.5, 0.5).
1095        */
1096       bld.MOV(dst, fs_reg(0.5f));
1097    }
1098 }
1099
1100 fs_reg *
1101 fs_visitor::emit_samplepos_setup()
1102 {
1103    assert(devinfo->gen >= 6);
1104
1105    const fs_builder abld = bld.annotate("compute sample position");
1106    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1107    fs_reg pos = *reg;
1108    fs_reg int_sample_x = vgrf(glsl_type::int_type);
1109    fs_reg int_sample_y = vgrf(glsl_type::int_type);
1110
1111    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1112     * mode will be enabled.
1113     *
1114     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1115     * R31.1:0         Position Offset X/Y for Slot[3:0]
1116     * R31.3:2         Position Offset X/Y for Slot[7:4]
1117     * .....
1118     *
1119     * The X, Y sample positions come in as bytes in  thread payload. So, read
1120     * the positions using vstride=16, width=8, hstride=2.
1121     */
1122    struct brw_reg sample_pos_reg =
1123       stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1124                     BRW_REGISTER_TYPE_B), 16, 8, 2);
1125
1126    if (dispatch_width == 8) {
1127       abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
1128    } else {
1129       abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
1130       abld.half(1).MOV(half(int_sample_x, 1),
1131                        fs_reg(suboffset(sample_pos_reg, 16)));
1132    }
1133    /* Compute gl_SamplePosition.x */
1134    compute_sample_position(pos, int_sample_x);
1135    pos = offset(pos, abld, 1);
1136    if (dispatch_width == 8) {
1137       abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
1138    } else {
1139       abld.half(0).MOV(half(int_sample_y, 0),
1140                        fs_reg(suboffset(sample_pos_reg, 1)));
1141       abld.half(1).MOV(half(int_sample_y, 1),
1142                        fs_reg(suboffset(sample_pos_reg, 17)));
1143    }
1144    /* Compute gl_SamplePosition.y */
1145    compute_sample_position(pos, int_sample_y);
1146    return reg;
1147 }
1148
1149 fs_reg *
1150 fs_visitor::emit_sampleid_setup()
1151 {
1152    assert(stage == MESA_SHADER_FRAGMENT);
1153    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1154    assert(devinfo->gen >= 6);
1155
1156    const fs_builder abld = bld.annotate("compute sample id");
1157    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1158
1159    if (key->compute_sample_id) {
1160       fs_reg t1 = vgrf(glsl_type::int_type);
1161       fs_reg t2 = vgrf(glsl_type::int_type);
1162       t2.type = BRW_REGISTER_TYPE_UW;
1163
1164       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1165        * 8x multisampling, subspan 0 will represent sample N (where N
1166        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1167        * 7. We can find the value of N by looking at R0.0 bits 7:6
1168        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1169        * (since samples are always delivered in pairs). That is, we
1170        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1171        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1172        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1173        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1174        * populating a temporary variable with the sequence (0, 1, 2, 3),
1175        * and then reading from it using vstride=1, width=4, hstride=0.
1176        * These computations hold good for 4x multisampling as well.
1177        *
1178        * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1179        * the first four slots are sample 0 of subspan 0; the next four
1180        * are sample 1 of subspan 0; the third group is sample 0 of
1181        * subspan 1, and finally sample 1 of subspan 1.
1182        */
1183       abld.exec_all()
1184           .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1185                fs_reg(0xc0));
1186       abld.exec_all().SHR(t1, t1, fs_reg(5));
1187
1188       /* This works for both SIMD8 and SIMD16 */
1189       abld.exec_all()
1190           .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
1191
1192       /* This special instruction takes care of setting vstride=1,
1193        * width=4, hstride=0 of t2 during an ADD instruction.
1194        */
1195       abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1196    } else {
1197       /* As per GL_ARB_sample_shading specification:
1198        * "When rendering to a non-multisample buffer, or if multisample
1199        *  rasterization is disabled, gl_SampleID will always be zero."
1200        */
1201       abld.MOV(*reg, fs_reg(0));
1202    }
1203
1204    return reg;
1205 }
1206
1207 void
1208 fs_visitor::resolve_source_modifiers(fs_reg *src)
1209 {
1210    if (!src->abs && !src->negate)
1211       return;
1212
1213    fs_reg temp = bld.vgrf(src->type);
1214    bld.MOV(temp, *src);
1215    *src = temp;
1216 }
1217
1218 void
1219 fs_visitor::emit_discard_jump()
1220 {
1221    assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1222
1223    /* For performance, after a discard, jump to the end of the
1224     * shader if all relevant channels have been discarded.
1225     */
1226    fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
1227    discard_jump->flag_subreg = 1;
1228
1229    discard_jump->predicate = (dispatch_width == 8)
1230                              ? BRW_PREDICATE_ALIGN1_ANY8H
1231                              : BRW_PREDICATE_ALIGN1_ANY16H;
1232    discard_jump->predicate_inverse = true;
1233 }
1234
1235 void
1236 fs_visitor::assign_curb_setup()
1237 {
1238    if (dispatch_width == 8) {
1239       prog_data->dispatch_grf_start_reg = payload.num_regs;
1240    } else {
1241       if (stage == MESA_SHADER_FRAGMENT) {
1242          brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1243          prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1244       } else if (stage == MESA_SHADER_COMPUTE) {
1245          brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1246          prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1247       } else {
1248          unreachable("Unsupported shader type!");
1249       }
1250    }
1251
1252    prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1253
1254    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1255    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1256       for (unsigned int i = 0; i < inst->sources; i++) {
1257          if (inst->src[i].file == UNIFORM) {
1258             int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1259             int constant_nr;
1260             if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1261                constant_nr = push_constant_loc[uniform_nr];
1262             } else {
1263                /* Section 5.11 of the OpenGL 4.1 spec says:
1264                 * "Out-of-bounds reads return undefined values, which include
1265                 *  values from other variables of the active program or zero."
1266                 * Just return the first push constant.
1267                 */
1268                constant_nr = 0;
1269             }
1270
1271             struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1272                                                   constant_nr / 8,
1273                                                   constant_nr % 8);
1274
1275             inst->src[i].file = HW_REG;
1276             inst->src[i].fixed_hw_reg = byte_offset(
1277                retype(brw_reg, inst->src[i].type),
1278                inst->src[i].subreg_offset);
1279          }
1280       }
1281    }
1282 }
1283
1284 void
1285 fs_visitor::calculate_urb_setup()
1286 {
1287    assert(stage == MESA_SHADER_FRAGMENT);
1288    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1289    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1290
1291    memset(prog_data->urb_setup, -1,
1292           sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1293
1294    int urb_next = 0;
1295    /* Figure out where each of the incoming setup attributes lands. */
1296    if (devinfo->gen >= 6) {
1297       if (_mesa_bitcount_64(prog->InputsRead &
1298                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1299          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1300           * first 16 varying inputs, so we can put them wherever we want.
1301           * Just put them in order.
1302           *
1303           * This is useful because it means that (a) inputs not used by the
1304           * fragment shader won't take up valuable register space, and (b) we
1305           * won't have to recompile the fragment shader if it gets paired with
1306           * a different vertex (or geometry) shader.
1307           */
1308          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1309             if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1310                 BITFIELD64_BIT(i)) {
1311                prog_data->urb_setup[i] = urb_next++;
1312             }
1313          }
1314       } else {
1315          /* We have enough input varyings that the SF/SBE pipeline stage can't
1316           * arbitrarily rearrange them to suit our whim; we have to put them
1317           * in an order that matches the output of the previous pipeline stage
1318           * (geometry or vertex shader).
1319           */
1320          struct brw_vue_map prev_stage_vue_map;
1321          brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1322                              key->input_slots_valid);
1323          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1324          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1325          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1326               slot++) {
1327             int varying = prev_stage_vue_map.slot_to_varying[slot];
1328             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1329              * unused.
1330              */
1331             if (varying != BRW_VARYING_SLOT_COUNT &&
1332                 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1333                  BITFIELD64_BIT(varying))) {
1334                prog_data->urb_setup[varying] = slot - first_slot;
1335             }
1336          }
1337          urb_next = prev_stage_vue_map.num_slots - first_slot;
1338       }
1339    } else {
1340       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1341       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1342          /* Point size is packed into the header, not as a general attribute */
1343          if (i == VARYING_SLOT_PSIZ)
1344             continue;
1345
1346          if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1347             /* The back color slot is skipped when the front color is
1348              * also written to.  In addition, some slots can be
1349              * written in the vertex shader and not read in the
1350              * fragment shader.  So the register number must always be
1351              * incremented, mapped or not.
1352              */
1353             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1354                prog_data->urb_setup[i] = urb_next;
1355             urb_next++;
1356          }
1357       }
1358
1359       /*
1360        * It's a FS only attribute, and we did interpolation for this attribute
1361        * in SF thread. So, count it here, too.
1362        *
1363        * See compile_sf_prog() for more info.
1364        */
1365       if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1366          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1367    }
1368
1369    prog_data->num_varying_inputs = urb_next;
1370 }
1371
1372 void
1373 fs_visitor::assign_urb_setup()
1374 {
1375    assert(stage == MESA_SHADER_FRAGMENT);
1376    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1377
1378    int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1379
1380    /* Offset all the urb_setup[] index by the actual position of the
1381     * setup regs, now that the location of the constants has been chosen.
1382     */
1383    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1384       if (inst->opcode == FS_OPCODE_LINTERP) {
1385          assert(inst->src[1].file == HW_REG);
1386          inst->src[1].fixed_hw_reg.nr += urb_start;
1387       }
1388
1389       if (inst->opcode == FS_OPCODE_CINTERP) {
1390          assert(inst->src[0].file == HW_REG);
1391          inst->src[0].fixed_hw_reg.nr += urb_start;
1392       }
1393    }
1394
1395    /* Each attribute is 4 setup channels, each of which is half a reg. */
1396    this->first_non_payload_grf =
1397       urb_start + prog_data->num_varying_inputs * 2;
1398 }
1399
1400 void
1401 fs_visitor::assign_vs_urb_setup()
1402 {
1403    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1404    int grf, count, slot, channel, attr;
1405
1406    assert(stage == MESA_SHADER_VERTEX);
1407    count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1408    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1409       count++;
1410
1411    /* Each attribute is 4 regs. */
1412    this->first_non_payload_grf =
1413       payload.num_regs + prog_data->curb_read_length + count * 4;
1414
1415    unsigned vue_entries =
1416       MAX2(count, vs_prog_data->base.vue_map.num_slots);
1417
1418    vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1419    vs_prog_data->base.urb_read_length = (count + 1) / 2;
1420
1421    assert(vs_prog_data->base.urb_read_length <= 15);
1422
1423    /* Rewrite all ATTR file references to the hw grf that they land in. */
1424    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1425       for (int i = 0; i < inst->sources; i++) {
1426          if (inst->src[i].file == ATTR) {
1427
1428             if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1429                slot = count - 1;
1430             } else {
1431                /* Attributes come in in a contiguous block, ordered by their
1432                 * gl_vert_attrib value.  That means we can compute the slot
1433                 * number for an attribute by masking out the enabled
1434                 * attributes before it and counting the bits.
1435                 */
1436                attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1437                slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1438                                         BITFIELD64_MASK(attr));
1439             }
1440
1441             channel = inst->src[i].reg_offset & 3;
1442
1443             grf = payload.num_regs +
1444                prog_data->curb_read_length +
1445                slot * 4 + channel;
1446
1447             inst->src[i].file = HW_REG;
1448             inst->src[i].fixed_hw_reg =
1449                retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1450          }
1451       }
1452    }
1453 }
1454
1455 /**
1456  * Split large virtual GRFs into separate components if we can.
1457  *
1458  * This is mostly duplicated with what brw_fs_vector_splitting does,
1459  * but that's really conservative because it's afraid of doing
1460  * splitting that doesn't result in real progress after the rest of
1461  * the optimization phases, which would cause infinite looping in
1462  * optimization.  We can do it once here, safely.  This also has the
1463  * opportunity to split interpolated values, or maybe even uniforms,
1464  * which we don't have at the IR level.
1465  *
1466  * We want to split, because virtual GRFs are what we register
1467  * allocate and spill (due to contiguousness requirements for some
1468  * instructions), and they're what we naturally generate in the
1469  * codegen process, but most virtual GRFs don't actually need to be
1470  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1471  * live intervals and better dead code elimination and coalescing.
1472  */
1473 void
1474 fs_visitor::split_virtual_grfs()
1475 {
1476    int num_vars = this->alloc.count;
1477
1478    /* Count the total number of registers */
1479    int reg_count = 0;
1480    int vgrf_to_reg[num_vars];
1481    for (int i = 0; i < num_vars; i++) {
1482       vgrf_to_reg[i] = reg_count;
1483       reg_count += alloc.sizes[i];
1484    }
1485
1486    /* An array of "split points".  For each register slot, this indicates
1487     * if this slot can be separated from the previous slot.  Every time an
1488     * instruction uses multiple elements of a register (as a source or
1489     * destination), we mark the used slots as inseparable.  Then we go
1490     * through and split the registers into the smallest pieces we can.
1491     */
1492    bool split_points[reg_count];
1493    memset(split_points, 0, sizeof(split_points));
1494
1495    /* Mark all used registers as fully splittable */
1496    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1497       if (inst->dst.file == GRF) {
1498          int reg = vgrf_to_reg[inst->dst.reg];
1499          for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1500             split_points[reg + j] = true;
1501       }
1502
1503       for (int i = 0; i < inst->sources; i++) {
1504          if (inst->src[i].file == GRF) {
1505             int reg = vgrf_to_reg[inst->src[i].reg];
1506             for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1507                split_points[reg + j] = true;
1508          }
1509       }
1510    }
1511
1512    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1513       if (inst->dst.file == GRF) {
1514          int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1515          for (int j = 1; j < inst->regs_written; j++)
1516             split_points[reg + j] = false;
1517       }
1518       for (int i = 0; i < inst->sources; i++) {
1519          if (inst->src[i].file == GRF) {
1520             int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1521             for (int j = 1; j < inst->regs_read(i); j++)
1522                split_points[reg + j] = false;
1523          }
1524       }
1525    }
1526
1527    int new_virtual_grf[reg_count];
1528    int new_reg_offset[reg_count];
1529
1530    int reg = 0;
1531    for (int i = 0; i < num_vars; i++) {
1532       /* The first one should always be 0 as a quick sanity check. */
1533       assert(split_points[reg] == false);
1534
1535       /* j = 0 case */
1536       new_reg_offset[reg] = 0;
1537       reg++;
1538       int offset = 1;
1539
1540       /* j > 0 case */
1541       for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1542          /* If this is a split point, reset the offset to 0 and allocate a
1543           * new virtual GRF for the previous offset many registers
1544           */
1545          if (split_points[reg]) {
1546             assert(offset <= MAX_VGRF_SIZE);
1547             int grf = alloc.allocate(offset);
1548             for (int k = reg - offset; k < reg; k++)
1549                new_virtual_grf[k] = grf;
1550             offset = 0;
1551          }
1552          new_reg_offset[reg] = offset;
1553          offset++;
1554          reg++;
1555       }
1556
1557       /* The last one gets the original register number */
1558       assert(offset <= MAX_VGRF_SIZE);
1559       alloc.sizes[i] = offset;
1560       for (int k = reg - offset; k < reg; k++)
1561          new_virtual_grf[k] = i;
1562    }
1563    assert(reg == reg_count);
1564
1565    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1566       if (inst->dst.file == GRF) {
1567          reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1568          inst->dst.reg = new_virtual_grf[reg];
1569          inst->dst.reg_offset = new_reg_offset[reg];
1570          assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1571       }
1572       for (int i = 0; i < inst->sources; i++) {
1573          if (inst->src[i].file == GRF) {
1574             reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1575             inst->src[i].reg = new_virtual_grf[reg];
1576             inst->src[i].reg_offset = new_reg_offset[reg];
1577             assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1578          }
1579       }
1580    }
1581    invalidate_live_intervals();
1582 }
1583
1584 /**
1585  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1586  *
1587  * During code generation, we create tons of temporary variables, many of
1588  * which get immediately killed and are never used again.  Yet, in later
1589  * optimization and analysis passes, such as compute_live_intervals, we need
1590  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1591  * overhead.
1592  */
1593 bool
1594 fs_visitor::compact_virtual_grfs()
1595 {
1596    bool progress = false;
1597    int remap_table[this->alloc.count];
1598    memset(remap_table, -1, sizeof(remap_table));
1599
1600    /* Mark which virtual GRFs are used. */
1601    foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1602       if (inst->dst.file == GRF)
1603          remap_table[inst->dst.reg] = 0;
1604
1605       for (int i = 0; i < inst->sources; i++) {
1606          if (inst->src[i].file == GRF)
1607             remap_table[inst->src[i].reg] = 0;
1608       }
1609    }
1610
1611    /* Compact the GRF arrays. */
1612    int new_index = 0;
1613    for (unsigned i = 0; i < this->alloc.count; i++) {
1614       if (remap_table[i] == -1) {
1615          /* We just found an unused register.  This means that we are
1616           * actually going to compact something.
1617           */
1618          progress = true;
1619       } else {
1620          remap_table[i] = new_index;
1621          alloc.sizes[new_index] = alloc.sizes[i];
1622          invalidate_live_intervals();
1623          ++new_index;
1624       }
1625    }
1626
1627    this->alloc.count = new_index;
1628
1629    /* Patch all the instructions to use the newly renumbered registers */
1630    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1631       if (inst->dst.file == GRF)
1632          inst->dst.reg = remap_table[inst->dst.reg];
1633
1634       for (int i = 0; i < inst->sources; i++) {
1635          if (inst->src[i].file == GRF)
1636             inst->src[i].reg = remap_table[inst->src[i].reg];
1637       }
1638    }
1639
1640    /* Patch all the references to delta_xy, since they're used in register
1641     * allocation.  If they're unused, switch them to BAD_FILE so we don't
1642     * think some random VGRF is delta_xy.
1643     */
1644    for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
1645       if (delta_xy[i].file == GRF) {
1646          if (remap_table[delta_xy[i].reg] != -1) {
1647             delta_xy[i].reg = remap_table[delta_xy[i].reg];
1648          } else {
1649             delta_xy[i].file = BAD_FILE;
1650          }
1651       }
1652    }
1653
1654    return progress;
1655 }
1656
1657 /*
1658  * Implements array access of uniforms by inserting a
1659  * PULL_CONSTANT_LOAD instruction.
1660  *
1661  * Unlike temporary GRF array access (where we don't support it due to
1662  * the difficulty of doing relative addressing on instruction
1663  * destinations), we could potentially do array access of uniforms
1664  * that were loaded in GRF space as push constants.  In real-world
1665  * usage we've seen, though, the arrays being used are always larger
1666  * than we could load as push constants, so just always move all
1667  * uniform array access out to a pull constant buffer.
1668  */
1669 void
1670 fs_visitor::move_uniform_array_access_to_pull_constants()
1671 {
1672    if (dispatch_width != 8)
1673       return;
1674
1675    pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1676    memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
1677
1678    /* Walk through and find array access of uniforms.  Put a copy of that
1679     * uniform in the pull constant buffer.
1680     *
1681     * Note that we don't move constant-indexed accesses to arrays.  No
1682     * testing has been done of the performance impact of this choice.
1683     */
1684    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
1685       for (int i = 0 ; i < inst->sources; i++) {
1686          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1687             continue;
1688
1689          int uniform = inst->src[i].reg;
1690
1691          /* If this array isn't already present in the pull constant buffer,
1692           * add it.
1693           */
1694          if (pull_constant_loc[uniform] == -1) {
1695             const gl_constant_value **values = &stage_prog_data->param[uniform];
1696
1697             assert(param_size[uniform]);
1698
1699             for (int j = 0; j < param_size[uniform]; j++) {
1700                pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1701
1702                stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1703                   values[j];
1704             }
1705          }
1706       }
1707    }
1708 }
1709
1710 /**
1711  * Assign UNIFORM file registers to either push constants or pull constants.
1712  *
1713  * We allow a fragment shader to have more than the specified minimum
1714  * maximum number of fragment shader uniform components (64).  If
1715  * there are too many of these, they'd fill up all of register space.
1716  * So, this will push some of them out to the pull constant buffer and
1717  * update the program to load them.
1718  */
1719 void
1720 fs_visitor::assign_constant_locations()
1721 {
1722    /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1723    if (dispatch_width != 8)
1724       return;
1725
1726    /* Find which UNIFORM registers are still in use. */
1727    bool is_live[uniforms];
1728    for (unsigned int i = 0; i < uniforms; i++) {
1729       is_live[i] = false;
1730    }
1731
1732    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1733       for (int i = 0; i < inst->sources; i++) {
1734          if (inst->src[i].file != UNIFORM)
1735             continue;
1736
1737          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1738          if (constant_nr >= 0 && constant_nr < (int) uniforms)
1739             is_live[constant_nr] = true;
1740       }
1741    }
1742
1743    /* Only allow 16 registers (128 uniform components) as push constants.
1744     *
1745     * Just demote the end of the list.  We could probably do better
1746     * here, demoting things that are rarely used in the program first.
1747     *
1748     * If changing this value, note the limitation about total_regs in
1749     * brw_curbe.c.
1750     */
1751    unsigned int max_push_components = 16 * 8;
1752    unsigned int num_push_constants = 0;
1753
1754    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1755
1756    for (unsigned int i = 0; i < uniforms; i++) {
1757       if (!is_live[i] || pull_constant_loc[i] != -1) {
1758          /* This UNIFORM register is either dead, or has already been demoted
1759           * to a pull const.  Mark it as no longer living in the param[] array.
1760           */
1761          push_constant_loc[i] = -1;
1762          continue;
1763       }
1764
1765       if (num_push_constants < max_push_components) {
1766          /* Retain as a push constant.  Record the location in the params[]
1767           * array.
1768           */
1769          push_constant_loc[i] = num_push_constants++;
1770       } else {
1771          /* Demote to a pull constant. */
1772          push_constant_loc[i] = -1;
1773
1774          int pull_index = stage_prog_data->nr_pull_params++;
1775          stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1776          pull_constant_loc[i] = pull_index;
1777       }
1778    }
1779
1780    stage_prog_data->nr_params = num_push_constants;
1781
1782    /* Up until now, the param[] array has been indexed by reg + reg_offset
1783     * of UNIFORM registers.  Condense it to only contain the uniforms we
1784     * chose to upload as push constants.
1785     */
1786    for (unsigned int i = 0; i < uniforms; i++) {
1787       int remapped = push_constant_loc[i];
1788
1789       if (remapped == -1)
1790          continue;
1791
1792       assert(remapped <= (int)i);
1793       stage_prog_data->param[remapped] = stage_prog_data->param[i];
1794    }
1795 }
1796
1797 /**
1798  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1799  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1800  */
1801 void
1802 fs_visitor::demote_pull_constants()
1803 {
1804    foreach_block_and_inst (block, fs_inst, inst, cfg) {
1805       for (int i = 0; i < inst->sources; i++) {
1806          if (inst->src[i].file != UNIFORM)
1807             continue;
1808
1809          int pull_index;
1810          unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
1811          if (location >= uniforms) /* Out of bounds access */
1812             pull_index = -1;
1813          else
1814             pull_index = pull_constant_loc[location];
1815
1816          if (pull_index == -1)
1817             continue;
1818
1819          /* Set up the annotation tracking for new generated instructions. */
1820          const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
1821                                     .at(block, inst);
1822          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1823          fs_reg dst = vgrf(glsl_type::float_type);
1824
1825          /* Generate a pull load into dst. */
1826          if (inst->src[i].reladdr) {
1827             VARYING_PULL_CONSTANT_LOAD(ibld, dst,
1828                                        surf_index,
1829                                        *inst->src[i].reladdr,
1830                                        pull_index);
1831             inst->src[i].reladdr = NULL;
1832          } else {
1833             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1834             ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1835                       dst, surf_index, offset);
1836             inst->src[i].set_smear(pull_index & 3);
1837          }
1838
1839          /* Rewrite the instruction to use the temporary VGRF. */
1840          inst->src[i].file = GRF;
1841          inst->src[i].reg = dst.reg;
1842          inst->src[i].reg_offset = 0;
1843       }
1844    }
1845    invalidate_live_intervals();
1846 }
1847
1848 bool
1849 fs_visitor::opt_algebraic()
1850 {
1851    bool progress = false;
1852
1853    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1854       switch (inst->opcode) {
1855       case BRW_OPCODE_MOV:
1856          if (inst->src[0].file != IMM)
1857             break;
1858
1859          if (inst->saturate) {
1860             if (inst->dst.type != inst->src[0].type)
1861                assert(!"unimplemented: saturate mixed types");
1862
1863             if (brw_saturate_immediate(inst->dst.type,
1864                                        &inst->src[0].fixed_hw_reg)) {
1865                inst->saturate = false;
1866                progress = true;
1867             }
1868          }
1869          break;
1870
1871       case BRW_OPCODE_MUL:
1872          if (inst->src[1].file != IMM)
1873             continue;
1874
1875          /* a * 1.0 = a */
1876          if (inst->src[1].is_one()) {
1877             inst->opcode = BRW_OPCODE_MOV;
1878             inst->src[1] = reg_undef;
1879             progress = true;
1880             break;
1881          }
1882
1883          /* a * -1.0 = -a */
1884          if (inst->src[1].is_negative_one()) {
1885             inst->opcode = BRW_OPCODE_MOV;
1886             inst->src[0].negate = !inst->src[0].negate;
1887             inst->src[1] = reg_undef;
1888             progress = true;
1889             break;
1890          }
1891
1892          /* a * 0.0 = 0.0 */
1893          if (inst->src[1].is_zero()) {
1894             inst->opcode = BRW_OPCODE_MOV;
1895             inst->src[0] = inst->src[1];
1896             inst->src[1] = reg_undef;
1897             progress = true;
1898             break;
1899          }
1900
1901          if (inst->src[0].file == IMM) {
1902             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
1903             inst->opcode = BRW_OPCODE_MOV;
1904             inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
1905             inst->src[1] = reg_undef;
1906             progress = true;
1907             break;
1908          }
1909          break;
1910       case BRW_OPCODE_ADD:
1911          if (inst->src[1].file != IMM)
1912             continue;
1913
1914          /* a + 0.0 = a */
1915          if (inst->src[1].is_zero()) {
1916             inst->opcode = BRW_OPCODE_MOV;
1917             inst->src[1] = reg_undef;
1918             progress = true;
1919             break;
1920          }
1921
1922          if (inst->src[0].file == IMM) {
1923             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
1924             inst->opcode = BRW_OPCODE_MOV;
1925             inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
1926             inst->src[1] = reg_undef;
1927             progress = true;
1928             break;
1929          }
1930          break;
1931       case BRW_OPCODE_OR:
1932          if (inst->src[0].equals(inst->src[1])) {
1933             inst->opcode = BRW_OPCODE_MOV;
1934             inst->src[1] = reg_undef;
1935             progress = true;
1936             break;
1937          }
1938          break;
1939       case BRW_OPCODE_LRP:
1940          if (inst->src[1].equals(inst->src[2])) {
1941             inst->opcode = BRW_OPCODE_MOV;
1942             inst->src[0] = inst->src[1];
1943             inst->src[1] = reg_undef;
1944             inst->src[2] = reg_undef;
1945             progress = true;
1946             break;
1947          }
1948          break;
1949       case BRW_OPCODE_CMP:
1950          if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
1951              inst->src[0].abs &&
1952              inst->src[0].negate &&
1953              inst->src[1].is_zero()) {
1954             inst->src[0].abs = false;
1955             inst->src[0].negate = false;
1956             inst->conditional_mod = BRW_CONDITIONAL_Z;
1957             progress = true;
1958             break;
1959          }
1960          break;
1961       case BRW_OPCODE_SEL:
1962          if (inst->src[0].equals(inst->src[1])) {
1963             inst->opcode = BRW_OPCODE_MOV;
1964             inst->src[1] = reg_undef;
1965             inst->predicate = BRW_PREDICATE_NONE;
1966             inst->predicate_inverse = false;
1967             progress = true;
1968          } else if (inst->saturate && inst->src[1].file == IMM) {
1969             switch (inst->conditional_mod) {
1970             case BRW_CONDITIONAL_LE:
1971             case BRW_CONDITIONAL_L:
1972                switch (inst->src[1].type) {
1973                case BRW_REGISTER_TYPE_F:
1974                   if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
1975                      inst->opcode = BRW_OPCODE_MOV;
1976                      inst->src[1] = reg_undef;
1977                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
1978                      progress = true;
1979                   }
1980                   break;
1981                default:
1982                   break;
1983                }
1984                break;
1985             case BRW_CONDITIONAL_GE:
1986             case BRW_CONDITIONAL_G:
1987                switch (inst->src[1].type) {
1988                case BRW_REGISTER_TYPE_F:
1989                   if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
1990                      inst->opcode = BRW_OPCODE_MOV;
1991                      inst->src[1] = reg_undef;
1992                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
1993                      progress = true;
1994                   }
1995                   break;
1996                default:
1997                   break;
1998                }
1999             default:
2000                break;
2001             }
2002          }
2003          break;
2004       case BRW_OPCODE_MAD:
2005          if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2006             inst->opcode = BRW_OPCODE_MOV;
2007             inst->src[1] = reg_undef;
2008             inst->src[2] = reg_undef;
2009             progress = true;
2010          } else if (inst->src[0].is_zero()) {
2011             inst->opcode = BRW_OPCODE_MUL;
2012             inst->src[0] = inst->src[2];
2013             inst->src[2] = reg_undef;
2014             progress = true;
2015          } else if (inst->src[1].is_one()) {
2016             inst->opcode = BRW_OPCODE_ADD;
2017             inst->src[1] = inst->src[2];
2018             inst->src[2] = reg_undef;
2019             progress = true;
2020          } else if (inst->src[2].is_one()) {
2021             inst->opcode = BRW_OPCODE_ADD;
2022             inst->src[2] = reg_undef;
2023             progress = true;
2024          } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2025             inst->opcode = BRW_OPCODE_ADD;
2026             inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2027             inst->src[2] = reg_undef;
2028             progress = true;
2029          }
2030          break;
2031       case SHADER_OPCODE_RCP: {
2032          fs_inst *prev = (fs_inst *)inst->prev;
2033          if (prev->opcode == SHADER_OPCODE_SQRT) {
2034             if (inst->src[0].equals(prev->dst)) {
2035                inst->opcode = SHADER_OPCODE_RSQ;
2036                inst->src[0] = prev->src[0];
2037                progress = true;
2038             }
2039          }
2040          break;
2041       }
2042       case SHADER_OPCODE_BROADCAST:
2043          if (is_uniform(inst->src[0])) {
2044             inst->opcode = BRW_OPCODE_MOV;
2045             inst->sources = 1;
2046             inst->force_writemask_all = true;
2047             progress = true;
2048          } else if (inst->src[1].file == IMM) {
2049             inst->opcode = BRW_OPCODE_MOV;
2050             inst->src[0] = component(inst->src[0],
2051                                      inst->src[1].fixed_hw_reg.dw1.ud);
2052             inst->sources = 1;
2053             inst->force_writemask_all = true;
2054             progress = true;
2055          }
2056          break;
2057
2058       default:
2059          break;
2060       }
2061
2062       /* Swap if src[0] is immediate. */
2063       if (progress && inst->is_commutative()) {
2064          if (inst->src[0].file == IMM) {
2065             fs_reg tmp = inst->src[1];
2066             inst->src[1] = inst->src[0];
2067             inst->src[0] = tmp;
2068          }
2069       }
2070    }
2071    return progress;
2072 }
2073
2074 /**
2075  * Optimize sample messages that have constant zero values for the trailing
2076  * texture coordinates. We can just reduce the message length for these
2077  * instructions instead of reserving a register for it. Trailing parameters
2078  * that aren't sent default to zero anyway. This will cause the dead code
2079  * eliminator to remove the MOV instruction that would otherwise be emitted to
2080  * set up the zero value.
2081  */
2082 bool
2083 fs_visitor::opt_zero_samples()
2084 {
2085    /* Gen4 infers the texturing opcode based on the message length so we can't
2086     * change it.
2087     */
2088    if (devinfo->gen < 5)
2089       return false;
2090
2091    bool progress = false;
2092
2093    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2094       if (!inst->is_tex())
2095          continue;
2096
2097       fs_inst *load_payload = (fs_inst *) inst->prev;
2098
2099       if (load_payload->is_head_sentinel() ||
2100           load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2101          continue;
2102
2103       /* We don't want to remove the message header or the first parameter.
2104        * Removing the first parameter is not allowed, see the Haswell PRM
2105        * volume 7, page 149:
2106        *
2107        *     "Parameter 0 is required except for the sampleinfo message, which
2108        *      has no parameter 0"
2109        */
2110       while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2111              load_payload->src[(inst->mlen - inst->header_size) /
2112                                (dispatch_width / 8) +
2113                                inst->header_size - 1].is_zero()) {
2114          inst->mlen -= dispatch_width / 8;
2115          progress = true;
2116       }
2117    }
2118
2119    if (progress)
2120       invalidate_live_intervals();
2121
2122    return progress;
2123 }
2124
2125 /**
2126  * Optimize sample messages which are followed by the final RT write.
2127  *
2128  * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its
2129  * results sent directly to the framebuffer, bypassing the EU.  Recognize the
2130  * final texturing results copied to the framebuffer write payload and modify
2131  * them to write to the framebuffer directly.
2132  */
2133 bool
2134 fs_visitor::opt_sampler_eot()
2135 {
2136    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2137
2138    if (stage != MESA_SHADER_FRAGMENT)
2139       return false;
2140
2141    if (devinfo->gen < 9 && !devinfo->is_cherryview)
2142       return false;
2143
2144    /* FINISHME: It should be possible to implement this optimization when there
2145     * are multiple drawbuffers.
2146     */
2147    if (key->nr_color_regions != 1)
2148       return false;
2149
2150    /* Look for a texturing instruction immediately before the final FB_WRITE. */
2151    fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2152    assert(fb_write->eot);
2153    assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2154
2155    fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2156
2157    /* There wasn't one; nothing to do. */
2158    if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2159       return false;
2160
2161    /* This optimisation doesn't seem to work for textureGather for some
2162     * reason. I can't find any documentation or known workarounds to indicate
2163     * that this is expected, but considering that it is probably pretty
2164     * unlikely that a shader would directly write out the results from
2165     * textureGather we might as well just disable it.
2166     */
2167    if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2168        tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2169       return false;
2170
2171    /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2172     * It's very likely to be the previous instruction.
2173     */
2174    fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2175    if (load_payload->is_head_sentinel() ||
2176        load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2177       return false;
2178
2179    assert(!tex_inst->eot); /* We can't get here twice */
2180    assert((tex_inst->offset & (0xff << 24)) == 0);
2181
2182    tex_inst->offset |= fb_write->target << 24;
2183    tex_inst->eot = true;
2184    tex_inst->dst = bld.null_reg_ud();
2185    fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2186
2187    /* If a header is present, marking the eot is sufficient. Otherwise, we need
2188     * to create a new LOAD_PAYLOAD command with the same sources and a space
2189     * saved for the header. Using a new destination register not only makes sure
2190     * we have enough space, but it will make sure the dead code eliminator kills
2191     * the instruction that this will replace.
2192     */
2193    if (tex_inst->header_size != 0)
2194       return true;
2195
2196    fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
2197                                  load_payload->sources + 1);
2198    fs_reg *new_sources =
2199       ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2200
2201    new_sources[0] = fs_reg();
2202    for (int i = 0; i < load_payload->sources; i++)
2203       new_sources[i+1] = load_payload->src[i];
2204
2205    /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2206     * requires a lot of information about the sources to appropriately figure
2207     * out the number of registers needed to be used. Given this stage in our
2208     * optimization, we may not have the appropriate GRFs required by
2209     * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2210     * manually emit the instruction.
2211     */
2212    fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2213                                                     load_payload->exec_size,
2214                                                     send_header,
2215                                                     new_sources,
2216                                                     load_payload->sources + 1);
2217
2218    new_load_payload->regs_written = load_payload->regs_written + 1;
2219    new_load_payload->header_size = 1;
2220    tex_inst->mlen++;
2221    tex_inst->header_size = 1;
2222    tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2223    tex_inst->src[0] = send_header;
2224
2225    return true;
2226 }
2227
2228 bool
2229 fs_visitor::opt_register_renaming()
2230 {
2231    bool progress = false;
2232    int depth = 0;
2233
2234    int remap[alloc.count];
2235    memset(remap, -1, sizeof(int) * alloc.count);
2236
2237    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2238       if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2239          depth++;
2240       } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2241                  inst->opcode == BRW_OPCODE_WHILE) {
2242          depth--;
2243       }
2244
2245       /* Rewrite instruction sources. */
2246       for (int i = 0; i < inst->sources; i++) {
2247          if (inst->src[i].file == GRF &&
2248              remap[inst->src[i].reg] != -1 &&
2249              remap[inst->src[i].reg] != inst->src[i].reg) {
2250             inst->src[i].reg = remap[inst->src[i].reg];
2251             progress = true;
2252          }
2253       }
2254
2255       const int dst = inst->dst.reg;
2256
2257       if (depth == 0 &&
2258           inst->dst.file == GRF &&
2259           alloc.sizes[inst->dst.reg] == inst->exec_size / 8 &&
2260           !inst->is_partial_write()) {
2261          if (remap[dst] == -1) {
2262             remap[dst] = dst;
2263          } else {
2264             remap[dst] = alloc.allocate(inst->exec_size / 8);
2265             inst->dst.reg = remap[dst];
2266             progress = true;
2267          }
2268       } else if (inst->dst.file == GRF &&
2269                  remap[dst] != -1 &&
2270                  remap[dst] != dst) {
2271          inst->dst.reg = remap[dst];
2272          progress = true;
2273       }
2274    }
2275
2276    if (progress) {
2277       invalidate_live_intervals();
2278
2279       for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2280          if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2281             delta_xy[i].reg = remap[delta_xy[i].reg];
2282          }
2283       }
2284    }
2285
2286    return progress;
2287 }
2288
2289 /**
2290  * Remove redundant or useless discard jumps.
2291  *
2292  * For example, we can eliminate jumps in the following sequence:
2293  *
2294  * discard-jump       (redundant with the next jump)
2295  * discard-jump       (useless; jumps to the next instruction)
2296  * placeholder-halt
2297  */
2298 bool
2299 fs_visitor::opt_redundant_discard_jumps()
2300 {
2301    bool progress = false;
2302
2303    bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2304
2305    fs_inst *placeholder_halt = NULL;
2306    foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2307       if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2308          placeholder_halt = inst;
2309          break;
2310       }
2311    }
2312
2313    if (!placeholder_halt)
2314       return false;
2315
2316    /* Delete any HALTs immediately before the placeholder halt. */
2317    for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2318         !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2319         prev = (fs_inst *) placeholder_halt->prev) {
2320       prev->remove(last_bblock);
2321       progress = true;
2322    }
2323
2324    if (progress)
2325       invalidate_live_intervals();
2326
2327    return progress;
2328 }
2329
2330 bool
2331 fs_visitor::compute_to_mrf()
2332 {
2333    bool progress = false;
2334    int next_ip = 0;
2335
2336    /* No MRFs on Gen >= 7. */
2337    if (devinfo->gen >= 7)
2338       return false;
2339
2340    calculate_live_intervals();
2341
2342    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2343       int ip = next_ip;
2344       next_ip++;
2345
2346       if (inst->opcode != BRW_OPCODE_MOV ||
2347           inst->is_partial_write() ||
2348           inst->dst.file != MRF || inst->src[0].file != GRF ||
2349           inst->dst.type != inst->src[0].type ||
2350           inst->src[0].abs || inst->src[0].negate ||
2351           !inst->src[0].is_contiguous() ||
2352           inst->src[0].subreg_offset)
2353          continue;
2354
2355       /* Work out which hardware MRF registers are written by this
2356        * instruction.
2357        */
2358       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2359       int mrf_high;
2360       if (inst->dst.reg & BRW_MRF_COMPR4) {
2361          mrf_high = mrf_low + 4;
2362       } else if (inst->exec_size == 16) {
2363          mrf_high = mrf_low + 1;
2364       } else {
2365          mrf_high = mrf_low;
2366       }
2367
2368       /* Can't compute-to-MRF this GRF if someone else was going to
2369        * read it later.
2370        */
2371       if (this->virtual_grf_end[inst->src[0].reg] > ip)
2372          continue;
2373
2374       /* Found a move of a GRF to a MRF.  Let's see if we can go
2375        * rewrite the thing that made this GRF to write into the MRF.
2376        */
2377       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2378          if (scan_inst->dst.file == GRF &&
2379              scan_inst->dst.reg == inst->src[0].reg) {
2380             /* Found the last thing to write our reg we want to turn
2381              * into a compute-to-MRF.
2382              */
2383
2384             /* If this one instruction didn't populate all the
2385              * channels, bail.  We might be able to rewrite everything
2386              * that writes that reg, but it would require smarter
2387              * tracking to delay the rewriting until complete success.
2388              */
2389             if (scan_inst->is_partial_write())
2390                break;
2391
2392             /* Things returning more than one register would need us to
2393              * understand coalescing out more than one MOV at a time.
2394              */
2395             if (scan_inst->regs_written > scan_inst->exec_size / 8)
2396                break;
2397
2398             /* SEND instructions can't have MRF as a destination. */
2399             if (scan_inst->mlen)
2400                break;
2401
2402             if (devinfo->gen == 6) {
2403                /* gen6 math instructions must have the destination be
2404                 * GRF, so no compute-to-MRF for them.
2405                 */
2406                if (scan_inst->is_math()) {
2407                   break;
2408                }
2409             }
2410
2411             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2412                /* Found the creator of our MRF's source value. */
2413                scan_inst->dst.file = MRF;
2414                scan_inst->dst.reg = inst->dst.reg;
2415                scan_inst->saturate |= inst->saturate;
2416                inst->remove(block);
2417                progress = true;
2418             }
2419             break;
2420          }
2421
2422          /* We don't handle control flow here.  Most computation of
2423           * values that end up in MRFs are shortly before the MRF
2424           * write anyway.
2425           */
2426          if (block->start() == scan_inst)
2427             break;
2428
2429          /* You can't read from an MRF, so if someone else reads our
2430           * MRF's source GRF that we wanted to rewrite, that stops us.
2431           */
2432          bool interfered = false;
2433          for (int i = 0; i < scan_inst->sources; i++) {
2434             if (scan_inst->src[i].file == GRF &&
2435                 scan_inst->src[i].reg == inst->src[0].reg &&
2436                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2437                interfered = true;
2438             }
2439          }
2440          if (interfered)
2441             break;
2442
2443          if (scan_inst->dst.file == MRF) {
2444             /* If somebody else writes our MRF here, we can't
2445              * compute-to-MRF before that.
2446              */
2447             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2448             int scan_mrf_high;
2449
2450             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2451                scan_mrf_high = scan_mrf_low + 4;
2452             } else if (scan_inst->exec_size == 16) {
2453                scan_mrf_high = scan_mrf_low + 1;
2454             } else {
2455                scan_mrf_high = scan_mrf_low;
2456             }
2457
2458             if (mrf_low == scan_mrf_low ||
2459                 mrf_low == scan_mrf_high ||
2460                 mrf_high == scan_mrf_low ||
2461                 mrf_high == scan_mrf_high) {
2462                break;
2463             }
2464          }
2465
2466          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2467             /* Found a SEND instruction, which means that there are
2468              * live values in MRFs from base_mrf to base_mrf +
2469              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2470              * above it.
2471              */
2472             if (mrf_low >= scan_inst->base_mrf &&
2473                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2474                break;
2475             }
2476             if (mrf_high >= scan_inst->base_mrf &&
2477                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2478                break;
2479             }
2480          }
2481       }
2482    }
2483
2484    if (progress)
2485       invalidate_live_intervals();
2486
2487    return progress;
2488 }
2489
2490 /**
2491  * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2492  * flow.  We could probably do better here with some form of divergence
2493  * analysis.
2494  */
2495 bool
2496 fs_visitor::eliminate_find_live_channel()
2497 {
2498    bool progress = false;
2499    unsigned depth = 0;
2500
2501    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2502       switch (inst->opcode) {
2503       case BRW_OPCODE_IF:
2504       case BRW_OPCODE_DO:
2505          depth++;
2506          break;
2507
2508       case BRW_OPCODE_ENDIF:
2509       case BRW_OPCODE_WHILE:
2510          depth--;
2511          break;
2512
2513       case FS_OPCODE_DISCARD_JUMP:
2514          /* This can potentially make control flow non-uniform until the end
2515           * of the program.
2516           */
2517          return progress;
2518
2519       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2520          if (depth == 0) {
2521             inst->opcode = BRW_OPCODE_MOV;
2522             inst->src[0] = fs_reg(0);
2523             inst->sources = 1;
2524             inst->force_writemask_all = true;
2525             progress = true;
2526          }
2527          break;
2528
2529       default:
2530          break;
2531       }
2532    }
2533
2534    return progress;
2535 }
2536
2537 /**
2538  * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2539  * instructions to FS_OPCODE_REP_FB_WRITE.
2540  */
2541 void
2542 fs_visitor::emit_repclear_shader()
2543 {
2544    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2545    int base_mrf = 1;
2546    int color_mrf = base_mrf + 2;
2547
2548    fs_inst *mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
2549                                      fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
2550
2551    fs_inst *write;
2552    if (key->nr_color_regions == 1) {
2553       write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2554       write->saturate = key->clamp_fragment_color;
2555       write->base_mrf = color_mrf;
2556       write->target = 0;
2557       write->header_size = 0;
2558       write->mlen = 1;
2559    } else {
2560       assume(key->nr_color_regions > 0);
2561       for (int i = 0; i < key->nr_color_regions; ++i) {
2562          write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2563          write->saturate = key->clamp_fragment_color;
2564          write->base_mrf = base_mrf;
2565          write->target = i;
2566          write->header_size = 2;
2567          write->mlen = 3;
2568       }
2569    }
2570    write->eot = true;
2571
2572    calculate_cfg();
2573
2574    assign_constant_locations();
2575    assign_curb_setup();
2576
2577    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2578    assert(mov->src[0].file == HW_REG);
2579    mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2580 }
2581
2582 /**
2583  * Walks through basic blocks, looking for repeated MRF writes and
2584  * removing the later ones.
2585  */
2586 bool
2587 fs_visitor::remove_duplicate_mrf_writes()
2588 {
2589    fs_inst *last_mrf_move[16];
2590    bool progress = false;
2591
2592    /* Need to update the MRF tracking for compressed instructions. */
2593    if (dispatch_width == 16)
2594       return false;
2595
2596    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2597
2598    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2599       if (inst->is_control_flow()) {
2600          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2601       }
2602
2603       if (inst->opcode == BRW_OPCODE_MOV &&
2604           inst->dst.file == MRF) {
2605          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2606          if (prev_inst && inst->equals(prev_inst)) {
2607             inst->remove(block);
2608             progress = true;
2609             continue;
2610          }
2611       }
2612
2613       /* Clear out the last-write records for MRFs that were overwritten. */
2614       if (inst->dst.file == MRF) {
2615          last_mrf_move[inst->dst.reg] = NULL;
2616       }
2617
2618       if (inst->mlen > 0 && inst->base_mrf != -1) {
2619          /* Found a SEND instruction, which will include two or fewer
2620           * implied MRF writes.  We could do better here.
2621           */
2622          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2623             last_mrf_move[inst->base_mrf + i] = NULL;
2624          }
2625       }
2626
2627       /* Clear out any MRF move records whose sources got overwritten. */
2628       if (inst->dst.file == GRF) {
2629          for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2630             if (last_mrf_move[i] &&
2631                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2632                last_mrf_move[i] = NULL;
2633             }
2634          }
2635       }
2636
2637       if (inst->opcode == BRW_OPCODE_MOV &&
2638           inst->dst.file == MRF &&
2639           inst->src[0].file == GRF &&
2640           !inst->is_partial_write()) {
2641          last_mrf_move[inst->dst.reg] = inst;
2642       }
2643    }
2644
2645    if (progress)
2646       invalidate_live_intervals();
2647
2648    return progress;
2649 }
2650
2651 static void
2652 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2653 {
2654    /* Clear the flag for registers that actually got read (as expected). */
2655    for (int i = 0; i < inst->sources; i++) {
2656       int grf;
2657       if (inst->src[i].file == GRF) {
2658          grf = inst->src[i].reg;
2659       } else if (inst->src[i].file == HW_REG &&
2660                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2661          grf = inst->src[i].fixed_hw_reg.nr;
2662       } else {
2663          continue;
2664       }
2665
2666       if (grf >= first_grf &&
2667           grf < first_grf + grf_len) {
2668          deps[grf - first_grf] = false;
2669          if (inst->exec_size == 16)
2670             deps[grf - first_grf + 1] = false;
2671       }
2672    }
2673 }
2674
2675 /**
2676  * Implements this workaround for the original 965:
2677  *
2678  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2679  *      check for post destination dependencies on this instruction, software
2680  *      must ensure that there is no destination hazard for the case of ‘write
2681  *      followed by a posted write’ shown in the following example.
2682  *
2683  *      1. mov r3 0
2684  *      2. send r3.xy <rest of send instruction>
2685  *      3. mov r2 r3
2686  *
2687  *      Due to no post-destination dependency check on the ‘send’, the above
2688  *      code sequence could have two instructions (1 and 2) in flight at the
2689  *      same time that both consider ‘r3’ as the target of their final writes.
2690  */
2691 void
2692 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2693                                                         fs_inst *inst)
2694 {
2695    int write_len = inst->regs_written;
2696    int first_write_grf = inst->dst.reg;
2697    bool needs_dep[BRW_MAX_MRF];
2698    assert(write_len < (int)sizeof(needs_dep) - 1);
2699
2700    memset(needs_dep, false, sizeof(needs_dep));
2701    memset(needs_dep, true, write_len);
2702
2703    clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2704
2705    /* Walk backwards looking for writes to registers we're writing which
2706     * aren't read since being written.  If we hit the start of the program,
2707     * we assume that there are no outstanding dependencies on entry to the
2708     * program.
2709     */
2710    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2711       /* If we hit control flow, assume that there *are* outstanding
2712        * dependencies, and force their cleanup before our instruction.
2713        */
2714       if (block->start() == scan_inst) {
2715          for (int i = 0; i < write_len; i++) {
2716             if (needs_dep[i])
2717                DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
2718          }
2719          return;
2720       }
2721
2722       /* We insert our reads as late as possible on the assumption that any
2723        * instruction but a MOV that might have left us an outstanding
2724        * dependency has more latency than a MOV.
2725        */
2726       if (scan_inst->dst.file == GRF) {
2727          for (int i = 0; i < scan_inst->regs_written; i++) {
2728             int reg = scan_inst->dst.reg + i;
2729
2730             if (reg >= first_write_grf &&
2731                 reg < first_write_grf + write_len &&
2732                 needs_dep[reg - first_write_grf]) {
2733                DEP_RESOLVE_MOV(bld.at(block, inst), reg);
2734                needs_dep[reg - first_write_grf] = false;
2735                if (scan_inst->exec_size == 16)
2736                   needs_dep[reg - first_write_grf + 1] = false;
2737             }
2738          }
2739       }
2740
2741       /* Clear the flag for registers that actually got read (as expected). */
2742       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2743
2744       /* Continue the loop only if we haven't resolved all the dependencies */
2745       int i;
2746       for (i = 0; i < write_len; i++) {
2747          if (needs_dep[i])
2748             break;
2749       }
2750       if (i == write_len)
2751          return;
2752    }
2753 }
2754
2755 /**
2756  * Implements this workaround for the original 965:
2757  *
2758  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2759  *      used as a destination register until after it has been sourced by an
2760  *      instruction with a different destination register.
2761  */
2762 void
2763 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2764 {
2765    int write_len = inst->regs_written;
2766    int first_write_grf = inst->dst.reg;
2767    bool needs_dep[BRW_MAX_MRF];
2768    assert(write_len < (int)sizeof(needs_dep) - 1);
2769
2770    memset(needs_dep, false, sizeof(needs_dep));
2771    memset(needs_dep, true, write_len);
2772    /* Walk forwards looking for writes to registers we're writing which aren't
2773     * read before being written.
2774     */
2775    foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2776       /* If we hit control flow, force resolve all remaining dependencies. */
2777       if (block->end() == scan_inst) {
2778          for (int i = 0; i < write_len; i++) {
2779             if (needs_dep[i])
2780                DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
2781          }
2782          return;
2783       }
2784
2785       /* Clear the flag for registers that actually got read (as expected). */
2786       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2787
2788       /* We insert our reads as late as possible since they're reading the
2789        * result of a SEND, which has massive latency.
2790        */
2791       if (scan_inst->dst.file == GRF &&
2792           scan_inst->dst.reg >= first_write_grf &&
2793           scan_inst->dst.reg < first_write_grf + write_len &&
2794           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2795          DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
2796          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2797       }
2798
2799       /* Continue the loop only if we haven't resolved all the dependencies */
2800       int i;
2801       for (i = 0; i < write_len; i++) {
2802          if (needs_dep[i])
2803             break;
2804       }
2805       if (i == write_len)
2806          return;
2807    }
2808 }
2809
2810 void
2811 fs_visitor::insert_gen4_send_dependency_workarounds()
2812 {
2813    if (devinfo->gen != 4 || devinfo->is_g4x)
2814       return;
2815
2816    bool progress = false;
2817
2818    /* Note that we're done with register allocation, so GRF fs_regs always
2819     * have a .reg_offset of 0.
2820     */
2821
2822    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2823       if (inst->mlen != 0 && inst->dst.file == GRF) {
2824          insert_gen4_pre_send_dependency_workarounds(block, inst);
2825          insert_gen4_post_send_dependency_workarounds(block, inst);
2826          progress = true;
2827       }
2828    }
2829
2830    if (progress)
2831       invalidate_live_intervals();
2832 }
2833
2834 /**
2835  * Turns the generic expression-style uniform pull constant load instruction
2836  * into a hardware-specific series of instructions for loading a pull
2837  * constant.
2838  *
2839  * The expression style allows the CSE pass before this to optimize out
2840  * repeated loads from the same offset, and gives the pre-register-allocation
2841  * scheduling full flexibility, while the conversion to native instructions
2842  * allows the post-register-allocation scheduler the best information
2843  * possible.
2844  *
2845  * Note that execution masking for setting up pull constant loads is special:
2846  * the channels that need to be written are unrelated to the current execution
2847  * mask, since a later instruction will use one of the result channels as a
2848  * source operand for all 8 or 16 of its channels.
2849  */
2850 void
2851 fs_visitor::lower_uniform_pull_constant_loads()
2852 {
2853    foreach_block_and_inst (block, fs_inst, inst, cfg) {
2854       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2855          continue;
2856
2857       if (devinfo->gen >= 7) {
2858          /* The offset arg before was a vec4-aligned byte offset.  We need to
2859           * turn it into a dword offset.
2860           */
2861          fs_reg const_offset_reg = inst->src[1];
2862          assert(const_offset_reg.file == IMM &&
2863                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2864          const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2865
2866          fs_reg payload, offset;
2867          if (devinfo->gen >= 9) {
2868             /* We have to use a message header on Skylake to get SIMD4x2
2869              * mode.  Reserve space for the register.
2870             */
2871             offset = payload = fs_reg(GRF, alloc.allocate(2));
2872             offset.reg_offset++;
2873             inst->mlen = 2;
2874          } else {
2875             offset = payload = fs_reg(GRF, alloc.allocate(1));
2876             inst->mlen = 1;
2877          }
2878
2879          /* This is actually going to be a MOV, but since only the first dword
2880           * is accessed, we have a special opcode to do just that one.  Note
2881           * that this needs to be an operation that will be considered a def
2882           * by live variable analysis, or register allocation will explode.
2883           */
2884          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2885                                                8, offset, const_offset_reg);
2886          setup->force_writemask_all = true;
2887
2888          setup->ir = inst->ir;
2889          setup->annotation = inst->annotation;
2890          inst->insert_before(block, setup);
2891
2892          /* Similarly, this will only populate the first 4 channels of the
2893           * result register (since we only use smear values from 0-3), but we
2894           * don't tell the optimizer.
2895           */
2896          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2897          inst->src[1] = payload;
2898          inst->base_mrf = -1;
2899
2900          invalidate_live_intervals();
2901       } else {
2902          /* Before register allocation, we didn't tell the scheduler about the
2903           * MRF we use.  We know it's safe to use this MRF because nothing
2904           * else does except for register spill/unspill, which generates and
2905           * uses its MRF within a single IR instruction.
2906           */
2907          inst->base_mrf = 14;
2908          inst->mlen = 1;
2909       }
2910    }
2911 }
2912
2913 bool
2914 fs_visitor::lower_load_payload()
2915 {
2916    bool progress = false;
2917
2918    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2919       if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2920          continue;
2921
2922       assert(inst->dst.file == MRF || inst->dst.file == GRF);
2923       assert(inst->saturate == false);
2924       fs_reg dst = inst->dst;
2925
2926       /* Get rid of COMPR4.  We'll add it back in if we need it */
2927       if (dst.file == MRF)
2928          dst.reg = dst.reg & ~BRW_MRF_COMPR4;
2929
2930       const fs_builder hbld = bld.exec_all().group(8, 0).at(block, inst);
2931
2932       for (uint8_t i = 0; i < inst->header_size; i++) {
2933          if (inst->src[i].file != BAD_FILE) {
2934             fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
2935             fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
2936             hbld.MOV(mov_dst, mov_src);
2937          }
2938          dst = offset(dst, hbld, 1);
2939       }
2940
2941       const fs_builder ibld = bld.exec_all(inst->force_writemask_all)
2942                                  .group(inst->exec_size, inst->force_sechalf)
2943                                  .at(block, inst);
2944
2945       if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
2946           inst->exec_size > 8) {
2947          /* In this case, the payload portion of the LOAD_PAYLOAD isn't
2948           * a straightforward copy.  Instead, the result of the
2949           * LOAD_PAYLOAD is treated as interleaved and the first four
2950           * non-header sources are unpacked as:
2951           *
2952           * m + 0: r0
2953           * m + 1: g0
2954           * m + 2: b0
2955           * m + 3: a0
2956           * m + 4: r1
2957           * m + 5: g1
2958           * m + 6: b1
2959           * m + 7: a1
2960           *
2961           * This is used for gen <= 5 fb writes.
2962           */
2963          assert(inst->exec_size == 16);
2964          assert(inst->header_size + 4 <= inst->sources);
2965          for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
2966             if (inst->src[i].file != BAD_FILE) {
2967                if (devinfo->has_compr4) {
2968                   fs_reg compr4_dst = retype(dst, inst->src[i].type);
2969                   compr4_dst.reg |= BRW_MRF_COMPR4;
2970                   ibld.MOV(compr4_dst, inst->src[i]);
2971                } else {
2972                   /* Platform doesn't have COMPR4.  We have to fake it */
2973                   fs_reg mov_dst = retype(dst, inst->src[i].type);
2974                   ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
2975                   mov_dst.reg += 4;
2976                   ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
2977                }
2978             }
2979
2980             dst.reg++;
2981          }
2982
2983          /* The loop above only ever incremented us through the first set
2984           * of 4 registers.  However, thanks to the magic of COMPR4, we
2985           * actually wrote to the first 8 registers, so we need to take
2986           * that into account now.
2987           */
2988          dst.reg += 4;
2989
2990          /* The COMPR4 code took care of the first 4 sources.  We'll let
2991           * the regular path handle any remaining sources.  Yes, we are
2992           * modifying the instruction but we're about to delete it so
2993           * this really doesn't hurt anything.
2994           */
2995          inst->header_size += 4;
2996       }
2997
2998       for (uint8_t i = inst->header_size; i < inst->sources; i++) {
2999          if (inst->src[i].file != BAD_FILE)
3000             ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
3001          dst = offset(dst, ibld, 1);
3002       }
3003
3004       inst->remove(block);
3005       progress = true;
3006    }
3007
3008    if (progress)
3009       invalidate_live_intervals();
3010
3011    return progress;
3012 }
3013
3014 bool
3015 fs_visitor::lower_integer_multiplication()
3016 {
3017    bool progress = false;
3018
3019    /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3020     * directly, but Cherryview cannot.
3021     */
3022    if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3023       return false;
3024
3025    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3026       if (inst->opcode != BRW_OPCODE_MUL ||
3027           inst->dst.is_accumulator() ||
3028           (inst->dst.type != BRW_REGISTER_TYPE_D &&
3029            inst->dst.type != BRW_REGISTER_TYPE_UD))
3030          continue;
3031
3032       const fs_builder ibld = bld.at(block, inst);
3033
3034       /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3035        * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3036        * src1 are used.
3037        *
3038        * If multiplying by an immediate value that fits in 16-bits, do a
3039        * single MUL instruction with that value in the proper location.
3040        */
3041       if (inst->src[1].file == IMM &&
3042           inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3043          if (devinfo->gen < 7) {
3044             fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3045                        inst->dst.type);
3046             ibld.MOV(imm, inst->src[1]);
3047             ibld.MUL(inst->dst, imm, inst->src[0]);
3048          } else {
3049             ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
3050          }
3051       } else {
3052          /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3053           * do 32-bit integer multiplication in one instruction, but instead
3054           * must do a sequence (which actually calculates a 64-bit result):
3055           *
3056           *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
3057           *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
3058           *    mov(8)  g2<1>D     acc0<8,8,1>D
3059           *
3060           * But on Gen > 6, the ability to use second accumulator register
3061           * (acc1) for non-float data types was removed, preventing a simple
3062           * implementation in SIMD16. A 16-channel result can be calculated by
3063           * executing the three instructions twice in SIMD8, once with quarter
3064           * control of 1Q for the first eight channels and again with 2Q for
3065           * the second eight channels.
3066           *
3067           * Which accumulator register is implicitly accessed (by AccWrEnable
3068           * for instance) is determined by the quarter control. Unfortunately
3069           * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3070           * implicit accumulator access by an instruction with 2Q will access
3071           * acc1 regardless of whether the data type is usable in acc1.
3072           *
3073           * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3074           * integer data types.
3075           *
3076           * Since we only want the low 32-bits of the result, we can do two
3077           * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3078           * adjust the high result and add them (like the mach is doing):
3079           *
3080           *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
3081           *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
3082           *    shl(8)  g9<1>D     g8<8,8,1>D      16D
3083           *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
3084           *
3085           * We avoid the shl instruction by realizing that we only want to add
3086           * the low 16-bits of the "high" result to the high 16-bits of the
3087           * "low" result and using proper regioning on the add:
3088           *
3089           *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
3090           *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
3091           *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
3092           *
3093           * Since it does not use the (single) accumulator register, we can
3094           * schedule multi-component multiplications much better.
3095           */
3096
3097          if (inst->conditional_mod && inst->dst.is_null()) {
3098             inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3099                                inst->dst.type);
3100          }
3101          fs_reg low = inst->dst;
3102          fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3103                      inst->dst.type);
3104
3105          if (devinfo->gen >= 7) {
3106             fs_reg src1_0_w = inst->src[1];
3107             fs_reg src1_1_w = inst->src[1];
3108
3109             if (inst->src[1].file == IMM) {
3110                src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3111                src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3112             } else {
3113                src1_0_w.type = BRW_REGISTER_TYPE_UW;
3114                if (src1_0_w.stride != 0) {
3115                   assert(src1_0_w.stride == 1);
3116                   src1_0_w.stride = 2;
3117                }
3118
3119                src1_1_w.type = BRW_REGISTER_TYPE_UW;
3120                if (src1_1_w.stride != 0) {
3121                   assert(src1_1_w.stride == 1);
3122                   src1_1_w.stride = 2;
3123                }
3124                src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3125             }
3126             ibld.MUL(low, inst->src[0], src1_0_w);
3127             ibld.MUL(high, inst->src[0], src1_1_w);
3128          } else {
3129             fs_reg src0_0_w = inst->src[0];
3130             fs_reg src0_1_w = inst->src[0];
3131
3132             src0_0_w.type = BRW_REGISTER_TYPE_UW;
3133             if (src0_0_w.stride != 0) {
3134                assert(src0_0_w.stride == 1);
3135                src0_0_w.stride = 2;
3136             }
3137
3138             src0_1_w.type = BRW_REGISTER_TYPE_UW;
3139             if (src0_1_w.stride != 0) {
3140                assert(src0_1_w.stride == 1);
3141                src0_1_w.stride = 2;
3142             }
3143             src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3144
3145             ibld.MUL(low, src0_0_w, inst->src[1]);
3146             ibld.MUL(high, src0_1_w, inst->src[1]);
3147          }
3148
3149          fs_reg dst = inst->dst;
3150          dst.type = BRW_REGISTER_TYPE_UW;
3151          dst.subreg_offset = 2;
3152          dst.stride = 2;
3153
3154          high.type = BRW_REGISTER_TYPE_UW;
3155          high.stride = 2;
3156
3157          low.type = BRW_REGISTER_TYPE_UW;
3158          low.subreg_offset = 2;
3159          low.stride = 2;
3160
3161          ibld.ADD(dst, low, high);
3162
3163          if (inst->conditional_mod) {
3164             fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
3165             set_condmod(inst->conditional_mod,
3166                         ibld.MOV(null, inst->dst));
3167          }
3168       }
3169
3170       inst->remove(block);
3171       progress = true;
3172    }
3173
3174    if (progress)
3175       invalidate_live_intervals();
3176
3177    return progress;
3178 }
3179
3180 void
3181 fs_visitor::dump_instructions()
3182 {
3183    dump_instructions(NULL);
3184 }
3185
3186 void
3187 fs_visitor::dump_instructions(const char *name)
3188 {
3189    FILE *file = stderr;
3190    if (name && geteuid() != 0) {
3191       file = fopen(name, "w");
3192       if (!file)
3193          file = stderr;
3194    }
3195
3196    if (cfg) {
3197       calculate_register_pressure();
3198       int ip = 0, max_pressure = 0;
3199       foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3200          max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3201          fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3202          dump_instruction(inst, file);
3203          ip++;
3204       }
3205       fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3206    } else {
3207       int ip = 0;
3208       foreach_in_list(backend_instruction, inst, &instructions) {
3209          fprintf(file, "%4d: ", ip++);
3210          dump_instruction(inst, file);
3211       }
3212    }
3213
3214    if (file != stderr) {
3215       fclose(file);
3216    }
3217 }
3218
3219 void
3220 fs_visitor::dump_instruction(backend_instruction *be_inst)
3221 {
3222    dump_instruction(be_inst, stderr);
3223 }
3224
3225 void
3226 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3227 {
3228    fs_inst *inst = (fs_inst *)be_inst;
3229
3230    if (inst->predicate) {
3231       fprintf(file, "(%cf0.%d) ",
3232              inst->predicate_inverse ? '-' : '+',
3233              inst->flag_subreg);
3234    }
3235
3236    fprintf(file, "%s", brw_instruction_name(inst->opcode));
3237    if (inst->saturate)
3238       fprintf(file, ".sat");
3239    if (inst->conditional_mod) {
3240       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3241       if (!inst->predicate &&
3242           (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3243                               inst->opcode != BRW_OPCODE_IF &&
3244                               inst->opcode != BRW_OPCODE_WHILE))) {
3245          fprintf(file, ".f0.%d", inst->flag_subreg);
3246       }
3247    }
3248    fprintf(file, "(%d) ", inst->exec_size);
3249
3250    if (inst->mlen) {
3251       fprintf(file, "(mlen: %d) ", inst->mlen);
3252    }
3253
3254    switch (inst->dst.file) {
3255    case GRF:
3256       fprintf(file, "vgrf%d", inst->dst.reg);
3257       if (alloc.sizes[inst->dst.reg] != inst->regs_written ||
3258           inst->dst.subreg_offset)
3259          fprintf(file, "+%d.%d",
3260                  inst->dst.reg_offset, inst->dst.subreg_offset);
3261       break;
3262    case MRF:
3263       fprintf(file, "m%d", inst->dst.reg);
3264       break;
3265    case BAD_FILE:
3266       fprintf(file, "(null)");
3267       break;
3268    case UNIFORM:
3269       fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3270       break;
3271    case ATTR:
3272       fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3273       break;
3274    case HW_REG:
3275       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3276          switch (inst->dst.fixed_hw_reg.nr) {
3277          case BRW_ARF_NULL:
3278             fprintf(file, "null");
3279             break;
3280          case BRW_ARF_ADDRESS:
3281             fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3282             break;
3283          case BRW_ARF_ACCUMULATOR:
3284             fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3285             break;
3286          case BRW_ARF_FLAG:
3287             fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3288                              inst->dst.fixed_hw_reg.subnr);
3289             break;
3290          default:
3291             fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3292                                inst->dst.fixed_hw_reg.subnr);
3293             break;
3294          }
3295       } else {
3296          fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3297       }
3298       if (inst->dst.fixed_hw_reg.subnr)
3299          fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3300       break;
3301    default:
3302       fprintf(file, "???");
3303       break;
3304    }
3305    fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3306
3307    for (int i = 0; i < inst->sources; i++) {
3308       if (inst->src[i].negate)
3309          fprintf(file, "-");
3310       if (inst->src[i].abs)
3311          fprintf(file, "|");
3312       switch (inst->src[i].file) {
3313       case GRF:
3314          fprintf(file, "vgrf%d", inst->src[i].reg);
3315          if (alloc.sizes[inst->src[i].reg] != (unsigned)inst->regs_read(i) ||
3316              inst->src[i].subreg_offset)
3317             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3318                     inst->src[i].subreg_offset);
3319          break;
3320       case MRF:
3321          fprintf(file, "***m%d***", inst->src[i].reg);
3322          break;
3323       case ATTR:
3324          fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3325          break;
3326       case UNIFORM:
3327          fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3328          if (inst->src[i].reladdr) {
3329             fprintf(file, "+reladdr");
3330          } else if (inst->src[i].subreg_offset) {
3331             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3332                     inst->src[i].subreg_offset);
3333          }
3334          break;
3335       case BAD_FILE:
3336          fprintf(file, "(null)");
3337          break;
3338       case IMM:
3339          switch (inst->src[i].type) {
3340          case BRW_REGISTER_TYPE_F:
3341             fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3342             break;
3343          case BRW_REGISTER_TYPE_W:
3344          case BRW_REGISTER_TYPE_D:
3345             fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3346             break;
3347          case BRW_REGISTER_TYPE_UW:
3348          case BRW_REGISTER_TYPE_UD:
3349             fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3350             break;
3351          case BRW_REGISTER_TYPE_VF:
3352             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3353                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
3354                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
3355                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3356                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3357             break;
3358          default:
3359             fprintf(file, "???");
3360             break;
3361          }
3362          break;
3363       case HW_REG:
3364          if (inst->src[i].fixed_hw_reg.negate)
3365             fprintf(file, "-");
3366          if (inst->src[i].fixed_hw_reg.abs)
3367             fprintf(file, "|");
3368          if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3369             switch (inst->src[i].fixed_hw_reg.nr) {
3370             case BRW_ARF_NULL:
3371                fprintf(file, "null");
3372                break;
3373             case BRW_ARF_ADDRESS:
3374                fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3375                break;
3376             case BRW_ARF_ACCUMULATOR:
3377                fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3378                break;
3379             case BRW_ARF_FLAG:
3380                fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3381                                 inst->src[i].fixed_hw_reg.subnr);
3382                break;
3383             default:
3384                fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3385                                   inst->src[i].fixed_hw_reg.subnr);
3386                break;
3387             }
3388          } else {
3389             fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3390          }
3391          if (inst->src[i].fixed_hw_reg.subnr)
3392             fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3393          if (inst->src[i].fixed_hw_reg.abs)
3394             fprintf(file, "|");
3395          break;
3396       default:
3397          fprintf(file, "???");
3398          break;
3399       }
3400       if (inst->src[i].abs)
3401          fprintf(file, "|");
3402
3403       if (inst->src[i].file != IMM) {
3404          fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3405       }
3406
3407       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3408          fprintf(file, ", ");
3409    }
3410
3411    fprintf(file, " ");
3412
3413    if (dispatch_width == 16 && inst->exec_size == 8) {
3414       if (inst->force_sechalf)
3415          fprintf(file, "2ndhalf ");
3416       else
3417          fprintf(file, "1sthalf ");
3418    }
3419
3420    fprintf(file, "\n");
3421 }
3422
3423 /**
3424  * Possibly returns an instruction that set up @param reg.
3425  *
3426  * Sometimes we want to take the result of some expression/variable
3427  * dereference tree and rewrite the instruction generating the result
3428  * of the tree.  When processing the tree, we know that the
3429  * instructions generated are all writing temporaries that are dead
3430  * outside of this tree.  So, if we have some instructions that write
3431  * a temporary, we're free to point that temp write somewhere else.
3432  *
3433  * Note that this doesn't guarantee that the instruction generated
3434  * only reg -- it might be the size=4 destination of a texture instruction.
3435  */
3436 fs_inst *
3437 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3438                                            fs_inst *end,
3439                                            const fs_reg &reg)
3440 {
3441    if (end == start ||
3442        end->is_partial_write() ||
3443        reg.reladdr ||
3444        !reg.equals(end->dst)) {
3445       return NULL;
3446    } else {
3447       return end;
3448    }
3449 }
3450
3451 void
3452 fs_visitor::setup_payload_gen6()
3453 {
3454    bool uses_depth =
3455       (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3456    unsigned barycentric_interp_modes =
3457       (stage == MESA_SHADER_FRAGMENT) ?
3458       ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3459
3460    assert(devinfo->gen >= 6);
3461
3462    /* R0-1: masks, pixel X/Y coordinates. */
3463    payload.num_regs = 2;
3464    /* R2: only for 32-pixel dispatch.*/
3465
3466    /* R3-26: barycentric interpolation coordinates.  These appear in the
3467     * same order that they appear in the brw_wm_barycentric_interp_mode
3468     * enum.  Each set of coordinates occupies 2 registers if dispatch width
3469     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
3470     * appear if they were enabled using the "Barycentric Interpolation
3471     * Mode" bits in WM_STATE.
3472     */
3473    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3474       if (barycentric_interp_modes & (1 << i)) {
3475          payload.barycentric_coord_reg[i] = payload.num_regs;
3476          payload.num_regs += 2;
3477          if (dispatch_width == 16) {
3478             payload.num_regs += 2;
3479          }
3480       }
3481    }
3482
3483    /* R27: interpolated depth if uses source depth */
3484    if (uses_depth) {
3485       payload.source_depth_reg = payload.num_regs;
3486       payload.num_regs++;
3487       if (dispatch_width == 16) {
3488          /* R28: interpolated depth if not SIMD8. */
3489          payload.num_regs++;
3490       }
3491    }
3492    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3493    if (uses_depth) {
3494       payload.source_w_reg = payload.num_regs;
3495       payload.num_regs++;
3496       if (dispatch_width == 16) {
3497          /* R30: interpolated W if not SIMD8. */
3498          payload.num_regs++;
3499       }
3500    }
3501
3502    if (stage == MESA_SHADER_FRAGMENT) {
3503       brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3504       brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3505       prog_data->uses_pos_offset = key->compute_pos_offset;
3506       /* R31: MSAA position offsets. */
3507       if (prog_data->uses_pos_offset) {
3508          payload.sample_pos_reg = payload.num_regs;
3509          payload.num_regs++;
3510       }
3511    }
3512
3513    /* R32: MSAA input coverage mask */
3514    if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3515       assert(devinfo->gen >= 7);
3516       payload.sample_mask_in_reg = payload.num_regs;
3517       payload.num_regs++;
3518       if (dispatch_width == 16) {
3519          /* R33: input coverage mask if not SIMD8. */
3520          payload.num_regs++;
3521       }
3522    }
3523
3524    /* R34-: bary for 32-pixel. */
3525    /* R58-59: interp W for 32-pixel. */
3526
3527    if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3528       source_depth_to_render_target = true;
3529    }
3530 }
3531
3532 void
3533 fs_visitor::setup_vs_payload()
3534 {
3535    /* R0: thread header, R1: urb handles */
3536    payload.num_regs = 2;
3537 }
3538
3539 void
3540 fs_visitor::setup_cs_payload()
3541 {
3542    assert(devinfo->gen >= 7);
3543
3544    payload.num_regs = 1;
3545 }
3546
3547 void
3548 fs_visitor::assign_binding_table_offsets()
3549 {
3550    assert(stage == MESA_SHADER_FRAGMENT);
3551    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3552    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3553    uint32_t next_binding_table_offset = 0;
3554
3555    /* If there are no color regions, we still perform an FB write to a null
3556     * renderbuffer, which we place at surface index 0.
3557     */
3558    prog_data->binding_table.render_target_start = next_binding_table_offset;
3559    next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3560
3561    assign_common_binding_table_offsets(next_binding_table_offset);
3562 }
3563
3564 void
3565 fs_visitor::calculate_register_pressure()
3566 {
3567    invalidate_live_intervals();
3568    calculate_live_intervals();
3569
3570    unsigned num_instructions = 0;
3571    foreach_block(block, cfg)
3572       num_instructions += block->instructions.length();
3573
3574    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3575
3576    for (unsigned reg = 0; reg < alloc.count; reg++) {
3577       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3578          regs_live_at_ip[ip] += alloc.sizes[reg];
3579    }
3580 }
3581
3582 void
3583 fs_visitor::optimize()
3584 {
3585    /* bld is the common builder object pointing at the end of the program we
3586     * used to translate it into i965 IR.  For the optimization and lowering
3587     * passes coming next, any code added after the end of the program without
3588     * having explicitly called fs_builder::at() clearly points at a mistake.
3589     * Ideally optimization passes wouldn't be part of the visitor so they
3590     * wouldn't have access to bld at all, but they do, so just in case some
3591     * pass forgets to ask for a location explicitly set it to NULL here to
3592     * make it trip.
3593     */
3594    bld = bld.at(NULL, NULL);
3595
3596    split_virtual_grfs();
3597
3598    move_uniform_array_access_to_pull_constants();
3599    assign_constant_locations();
3600    demote_pull_constants();
3601
3602 #define OPT(pass, args...) ({                                           \
3603       pass_num++;                                                       \
3604       bool this_progress = pass(args);                                  \
3605                                                                         \
3606       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
3607          char filename[64];                                             \
3608          snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,              \
3609                   stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3610                                                                         \
3611          backend_shader::dump_instructions(filename);                   \
3612       }                                                                 \
3613                                                                         \
3614       progress = progress || this_progress;                             \
3615       this_progress;                                                    \
3616    })
3617
3618    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3619       char filename[64];
3620       snprintf(filename, 64, "%s%d-%04d-00-start",
3621                stage_abbrev, dispatch_width,
3622                shader_prog ? shader_prog->Name : 0);
3623
3624       backend_shader::dump_instructions(filename);
3625    }
3626
3627    bool progress;
3628    int iteration = 0;
3629    int pass_num = 0;
3630    do {
3631       progress = false;
3632       pass_num = 0;
3633       iteration++;
3634
3635       OPT(remove_duplicate_mrf_writes);
3636
3637       OPT(opt_algebraic);
3638       OPT(opt_cse);
3639       OPT(opt_copy_propagate);
3640       OPT(opt_peephole_predicated_break);
3641       OPT(opt_cmod_propagation);
3642       OPT(dead_code_eliminate);
3643       OPT(opt_peephole_sel);
3644       OPT(dead_control_flow_eliminate, this);
3645       OPT(opt_register_renaming);
3646       OPT(opt_redundant_discard_jumps);
3647       OPT(opt_saturate_propagation);
3648       OPT(opt_zero_samples);
3649       OPT(register_coalesce);
3650       OPT(compute_to_mrf);
3651       OPT(eliminate_find_live_channel);
3652
3653       OPT(compact_virtual_grfs);
3654    } while (progress);
3655
3656    pass_num = 0;
3657
3658    OPT(opt_sampler_eot);
3659
3660    if (OPT(lower_load_payload)) {
3661       split_virtual_grfs();
3662       OPT(register_coalesce);
3663       OPT(compute_to_mrf);
3664       OPT(dead_code_eliminate);
3665    }
3666
3667    OPT(opt_combine_constants);
3668    OPT(lower_integer_multiplication);
3669
3670    lower_uniform_pull_constant_loads();
3671 }
3672
3673 /**
3674  * Three source instruction must have a GRF/MRF destination register.
3675  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
3676  */
3677 void
3678 fs_visitor::fixup_3src_null_dest()
3679 {
3680    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3681       if (inst->is_3src() && inst->dst.is_null()) {
3682          inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3683                             inst->dst.type);
3684       }
3685    }
3686 }
3687
3688 void
3689 fs_visitor::allocate_registers()
3690 {
3691    bool allocated_without_spills;
3692
3693    static const enum instruction_scheduler_mode pre_modes[] = {
3694       SCHEDULE_PRE,
3695       SCHEDULE_PRE_NON_LIFO,
3696       SCHEDULE_PRE_LIFO,
3697    };
3698
3699    /* Try each scheduling heuristic to see if it can successfully register
3700     * allocate without spilling.  They should be ordered by decreasing
3701     * performance but increasing likelihood of allocating.
3702     */
3703    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3704       schedule_instructions(pre_modes[i]);
3705
3706       if (0) {
3707          assign_regs_trivial();
3708          allocated_without_spills = true;
3709       } else {
3710          allocated_without_spills = assign_regs(false);
3711       }
3712       if (allocated_without_spills)
3713          break;
3714    }
3715
3716    if (!allocated_without_spills) {
3717       /* We assume that any spilling is worse than just dropping back to
3718        * SIMD8.  There's probably actually some intermediate point where
3719        * SIMD16 with a couple of spills is still better.
3720        */
3721       if (dispatch_width == 16) {
3722          fail("Failure to register allocate.  Reduce number of "
3723               "live scalar values to avoid this.");
3724       } else {
3725          compiler->shader_perf_log(log_data,
3726                                    "%s shader triggered register spilling.  "
3727                                    "Try reducing the number of live scalar "
3728                                    "values to improve performance.\n",
3729                                    stage_name);
3730       }
3731
3732       /* Since we're out of heuristics, just go spill registers until we
3733        * get an allocation.
3734        */
3735       while (!assign_regs(true)) {
3736          if (failed)
3737             break;
3738       }
3739    }
3740
3741    /* This must come after all optimization and register allocation, since
3742     * it inserts dead code that happens to have side effects, and it does
3743     * so based on the actual physical registers in use.
3744     */
3745    insert_gen4_send_dependency_workarounds();
3746
3747    if (failed)
3748       return;
3749
3750    if (!allocated_without_spills)
3751       schedule_instructions(SCHEDULE_POST);
3752
3753    if (last_scratch > 0)
3754       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3755 }
3756
3757 bool
3758 fs_visitor::run_vs(gl_clip_plane *clip_planes)
3759 {
3760    assert(stage == MESA_SHADER_VERTEX);
3761
3762    assign_common_binding_table_offsets(0);
3763    setup_vs_payload();
3764
3765    if (shader_time_index >= 0)
3766       emit_shader_time_begin();
3767
3768    emit_nir_code();
3769
3770    if (failed)
3771       return false;
3772
3773    compute_clip_distance(clip_planes);
3774
3775    emit_urb_writes();
3776
3777    if (shader_time_index >= 0)
3778       emit_shader_time_end();
3779
3780    calculate_cfg();
3781
3782    optimize();
3783
3784    assign_curb_setup();
3785    assign_vs_urb_setup();
3786
3787    fixup_3src_null_dest();
3788    allocate_registers();
3789
3790    return !failed;
3791 }
3792
3793 bool
3794 fs_visitor::run_fs(bool do_rep_send)
3795 {
3796    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3797    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3798
3799    assert(stage == MESA_SHADER_FRAGMENT);
3800
3801    sanity_param_count = prog->Parameters->NumParameters;
3802
3803    assign_binding_table_offsets();
3804
3805    if (devinfo->gen >= 6)
3806       setup_payload_gen6();
3807    else
3808       setup_payload_gen4();
3809
3810    if (0) {
3811       emit_dummy_fs();
3812    } else if (do_rep_send) {
3813       assert(dispatch_width == 16);
3814       emit_repclear_shader();
3815    } else {
3816       if (shader_time_index >= 0)
3817          emit_shader_time_begin();
3818
3819       calculate_urb_setup();
3820       if (prog->InputsRead > 0) {
3821          if (devinfo->gen < 6)
3822             emit_interpolation_setup_gen4();
3823          else
3824             emit_interpolation_setup_gen6();
3825       }
3826
3827       /* We handle discards by keeping track of the still-live pixels in f0.1.
3828        * Initialize it with the dispatched pixels.
3829        */
3830       if (wm_prog_data->uses_kill) {
3831          fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3832          discard_init->flag_subreg = 1;
3833       }
3834
3835       /* Generate FS IR for main().  (the visitor only descends into
3836        * functions called "main").
3837        */
3838       emit_nir_code();
3839
3840       if (failed)
3841          return false;
3842
3843       if (wm_prog_data->uses_kill)
3844          bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
3845
3846       if (wm_key->alpha_test_func)
3847          emit_alpha_test();
3848
3849       emit_fb_writes();
3850
3851       if (shader_time_index >= 0)
3852          emit_shader_time_end();
3853
3854       calculate_cfg();
3855
3856       optimize();
3857
3858       assign_curb_setup();
3859       assign_urb_setup();
3860
3861       fixup_3src_null_dest();
3862       allocate_registers();
3863
3864       if (failed)
3865          return false;
3866    }
3867
3868    if (dispatch_width == 8)
3869       wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3870    else
3871       wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3872
3873    /* If any state parameters were appended, then ParameterValues could have
3874     * been realloced, in which case the driver uniform storage set up by
3875     * _mesa_associate_uniform_storage() would point to freed memory.  Make
3876     * sure that didn't happen.
3877     */
3878    assert(sanity_param_count == prog->Parameters->NumParameters);
3879
3880    return !failed;
3881 }
3882
3883 bool
3884 fs_visitor::run_cs()
3885 {
3886    assert(stage == MESA_SHADER_COMPUTE);
3887    assert(shader);
3888
3889    sanity_param_count = prog->Parameters->NumParameters;
3890
3891    assign_common_binding_table_offsets(0);
3892
3893    setup_cs_payload();
3894
3895    if (shader_time_index >= 0)
3896       emit_shader_time_begin();
3897
3898    emit_nir_code();
3899
3900    if (failed)
3901       return false;
3902
3903    emit_cs_terminate();
3904
3905    if (shader_time_index >= 0)
3906       emit_shader_time_end();
3907
3908    calculate_cfg();
3909
3910    optimize();
3911
3912    assign_curb_setup();
3913
3914    fixup_3src_null_dest();
3915    allocate_registers();
3916
3917    if (failed)
3918       return false;
3919
3920    /* If any state parameters were appended, then ParameterValues could have
3921     * been realloced, in which case the driver uniform storage set up by
3922     * _mesa_associate_uniform_storage() would point to freed memory.  Make
3923     * sure that didn't happen.
3924     */
3925    assert(sanity_param_count == prog->Parameters->NumParameters);
3926
3927    return !failed;
3928 }
3929
3930 const unsigned *
3931 brw_wm_fs_emit(struct brw_context *brw,
3932                void *mem_ctx,
3933                const struct brw_wm_prog_key *key,
3934                struct brw_wm_prog_data *prog_data,
3935                struct gl_fragment_program *fp,
3936                struct gl_shader_program *prog,
3937                unsigned *final_assembly_size)
3938 {
3939    bool start_busy = false;
3940    double start_time = 0;
3941
3942    if (unlikely(brw->perf_debug)) {
3943       start_busy = (brw->batch.last_bo &&
3944                     drm_intel_bo_busy(brw->batch.last_bo));
3945       start_time = get_time();
3946    }
3947
3948    struct brw_shader *shader = NULL;
3949    if (prog)
3950       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3951
3952    if (unlikely(INTEL_DEBUG & DEBUG_WM))
3953       brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3954
3955    int st_index8 = -1, st_index16 = -1;
3956    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
3957       st_index8 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS8);
3958       st_index16 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS16);
3959    }
3960
3961    /* Now the main event: Visit the shader IR and generate our FS IR for it.
3962     */
3963    fs_visitor v(brw->intelScreen->compiler, brw,
3964                 mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
3965                 prog, &fp->Base, 8, st_index8);
3966    if (!v.run_fs(false /* do_rep_send */)) {
3967       if (prog) {
3968          prog->LinkStatus = false;
3969          ralloc_strcat(&prog->InfoLog, v.fail_msg);
3970       }
3971
3972       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3973                     v.fail_msg);
3974
3975       return NULL;
3976    }
3977
3978    cfg_t *simd16_cfg = NULL;
3979    fs_visitor v2(brw->intelScreen->compiler, brw,
3980                  mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
3981                  prog, &fp->Base, 16, st_index16);
3982    if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
3983       if (!v.simd16_unsupported) {
3984          /* Try a SIMD16 compile */
3985          v2.import_uniforms(&v);
3986          if (!v2.run_fs(brw->use_rep_send)) {
3987             perf_debug("SIMD16 shader failed to compile: %s", v2.fail_msg);
3988          } else {
3989             simd16_cfg = v2.cfg;
3990          }
3991       }
3992    }
3993
3994    cfg_t *simd8_cfg;
3995    int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3996    if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
3997       simd8_cfg = NULL;
3998       prog_data->no_8 = true;
3999    } else {
4000       simd8_cfg = v.cfg;
4001       prog_data->no_8 = false;
4002    }
4003
4004    fs_generator g(brw->intelScreen->compiler, brw,
4005                   mem_ctx, (void *) key, &prog_data->base,
4006                   &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4007
4008    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4009       char *name;
4010       if (prog)
4011          name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4012                                 prog->Label ? prog->Label : "unnamed",
4013                                 prog->Name);
4014       else
4015          name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4016
4017       g.enable_debug(name);
4018    }
4019
4020    if (simd8_cfg)
4021       g.generate_code(simd8_cfg, 8);
4022    if (simd16_cfg)
4023       prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4024
4025    if (unlikely(brw->perf_debug) && shader) {
4026       if (shader->compiled_once)
4027          brw_wm_debug_recompile(brw, prog, key);
4028       shader->compiled_once = true;
4029
4030       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4031          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4032                     (get_time() - start_time) * 1000);
4033       }
4034    }
4035
4036    return g.get_assembly(final_assembly_size);
4037 }
4038
4039 extern "C" bool
4040 brw_fs_precompile(struct gl_context *ctx,
4041                   struct gl_shader_program *shader_prog,
4042                   struct gl_program *prog)
4043 {
4044    struct brw_context *brw = brw_context(ctx);
4045    struct brw_wm_prog_key key;
4046
4047    struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4048    struct brw_fragment_program *bfp = brw_fragment_program(fp);
4049    bool program_uses_dfdy = fp->UsesDFdy;
4050
4051    memset(&key, 0, sizeof(key));
4052
4053    if (brw->gen < 6) {
4054       if (fp->UsesKill)
4055          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4056
4057       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4058          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4059
4060       /* Just assume depth testing. */
4061       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4062       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4063    }
4064
4065    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4066                                          BRW_FS_VARYING_INPUT_MASK) > 16)
4067       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4068
4069    brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4070
4071    if (fp->Base.InputsRead & VARYING_BIT_POS) {
4072       key.drawable_height = ctx->DrawBuffer->Height;
4073    }
4074
4075    key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4076          ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4077          BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4078
4079    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4080       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4081                           key.nr_color_regions > 1;
4082    }
4083
4084    key.program_string_id = bfp->id;
4085
4086    uint32_t old_prog_offset = brw->wm.base.prog_offset;
4087    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4088
4089    bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4090
4091    brw->wm.base.prog_offset = old_prog_offset;
4092    brw->wm.prog_data = old_prog_data;
4093
4094    return success;
4095 }
4096
4097 void
4098 brw_setup_tex_for_precompile(struct brw_context *brw,
4099                              struct brw_sampler_prog_key_data *tex,
4100                              struct gl_program *prog)
4101 {
4102    const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4103    unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4104    for (unsigned i = 0; i < sampler_count; i++) {
4105       if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4106          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4107          tex->swizzles[i] =
4108             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4109       } else {
4110          /* Color sampler: assume no swizzling. */
4111          tex->swizzles[i] = SWIZZLE_XYZW;
4112       }
4113    }
4114 }