src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 #include <sys/types.h>
  32
  33 #include "util/hash_table.h"
  34 #include "main/macros.h"
  35 #include "main/shaderobj.h"
  36 #include "main/fbobject.h"
  37 #include "program/prog_parameter.h"
  38 #include "program/prog_print.h"
  39 #include "util/register_allocate.h"
  40 #include "program/hash_table.h"
  41 #include "brw_context.h"
  42 #include "brw_eu.h"
  43 #include "brw_wm.h"
  44 #include "brw_fs.h"
  45 #include "brw_cfg.h"
  46 #include "brw_dead_control_flow.h"
  47 #include "main/uniforms.h"
  48 #include "brw_fs_live_variables.h"
  49 #include "glsl/glsl_types.h"
  50 #include "program/sampler.h"
  51
  52 using namespace brw;
  53
  54 void
  55 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
  56               const fs_reg *src, unsigned sources)
  57 {
  58    memset(this, 0, sizeof(*this));
  59
  60    this->src = new fs_reg[MAX2(sources, 3)];
  61    for (unsigned i = 0; i < sources; i++)
  62       this->src[i] = src[i];
  63
  64    this->opcode = opcode;
  65    this->dst = dst;
  66    this->sources = sources;
  67    this->exec_size = exec_size;
  68
  69    assert(dst.file != IMM && dst.file != UNIFORM);
  70
  71    assert(this->exec_size != 0);
  72
  73    this->conditional_mod = BRW_CONDITIONAL_NONE;
  74
  75    /* This will be the case for almost all instructions. */
  76    switch (dst.file) {
  77    case GRF:
  78    case HW_REG:
  79    case MRF:
  80    case ATTR:
  81       this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size),
  82                                         REG_SIZE);
  83       break;
  84    case BAD_FILE:
  85       this->regs_written = 0;
  86       break;
  87    case IMM:
  88    case UNIFORM:
  89       unreachable("Invalid destination register file");
  90    default:
  91       unreachable("Invalid register file");
  92    }
  93
  94    this->writes_accumulator = false;
  95 }
  96
  97 fs_inst::fs_inst()
  98 {
  99    init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
 100 }
 101
 102 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
 103 {
 104    init(opcode, exec_size, reg_undef, NULL, 0);
 105 }
 106
 107 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
 108 {
 109    init(opcode, exec_size, dst, NULL, 0);
 110 }
 111
 112 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 113                  const fs_reg &src0)
 114 {
 115    const fs_reg src[1] = { src0 };
 116    init(opcode, exec_size, dst, src, 1);
 117 }
 118
 119 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 120                  const fs_reg &src0, const fs_reg &src1)
 121 {
 122    const fs_reg src[2] = { src0, src1 };
 123    init(opcode, exec_size, dst, src, 2);
 124 }
 125
 126 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 127                  const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
 128 {
 129    const fs_reg src[3] = { src0, src1, src2 };
 130    init(opcode, exec_size, dst, src, 3);
 131 }
 132
 133 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
 134                  const fs_reg src[], unsigned sources)
 135 {
 136    init(opcode, exec_width, dst, src, sources);
 137 }
 138
 139 fs_inst::fs_inst(const fs_inst &that)
 140 {
 141    memcpy(this, &that, sizeof(that));
 142
 143    this->src = new fs_reg[MAX2(that.sources, 3)];
 144
 145    for (unsigned i = 0; i < that.sources; i++)
 146       this->src[i] = that.src[i];
 147 }
 148
 149 fs_inst::~fs_inst()
 150 {
 151    delete[] this->src;
 152 }
 153
 154 void
 155 fs_inst::resize_sources(uint8_t num_sources)
 156 {
 157    if (this->sources != num_sources) {
 158       fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
 159
 160       for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
 161          src[i] = this->src[i];
 162
 163       delete[] this->src;
 164       this->src = src;
 165       this->sources = num_sources;
 166    }
 167 }
 168
 169 void
 170 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
 171                                        const fs_reg &dst,
 172                                        const fs_reg &surf_index,
 173                                        const fs_reg &varying_offset,
 174                                        uint32_t const_offset)
 175 {
 176    /* We have our constant surface use a pitch of 4 bytes, so our index can
 177     * be any component of a vector, and then we load 4 contiguous
 178     * components starting from that.
 179     *
 180     * We break down the const_offset to a portion added to the variable
 181     * offset and a portion done using reg_offset, which means that if you
 182     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 183     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 184     * CSE can later notice that those loads are all the same and eliminate
 185     * the redundant ones.
 186     */
 187    fs_reg vec4_offset = vgrf(glsl_type::int_type);
 188    bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
 189
 190    int scale = 1;
 191    if (devinfo->gen == 4 && bld.dispatch_width() == 8) {
 192       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 193        * u, v, r) as parameters, or we can just use the SIMD16 message
 194        * consisting of (header, u).  We choose the second, at the cost of a
 195        * longer return length.
 196        */
 197       scale = 2;
 198    }
 199
 200    enum opcode op;
 201    if (devinfo->gen >= 7)
 202       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 203    else
 204       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 205
 206    int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
 207    fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written), dst.type);
 208    fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
 209    inst->regs_written = regs_written;
 210
 211    if (devinfo->gen < 7) {
 212       inst->base_mrf = 13;
 213       inst->header_size = 1;
 214       if (devinfo->gen == 4)
 215          inst->mlen = 3;
 216       else
 217          inst->mlen = 1 + bld.dispatch_width() / 8;
 218    }
 219
 220    bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale));
 221 }
 222
 223 /**
 224  * A helper for MOV generation for fixing up broken hardware SEND dependency
 225  * handling.
 226  */
 227 void
 228 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
 229 {
 230    /* The caller always wants uncompressed to emit the minimal extra
 231     * dependencies, and to avoid having to deal with aligning its regs to 2.
 232     */
 233    const fs_builder ubld = bld.annotate("send dependency resolve")
 234                               .half(0);
 235
 236    ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 237 }
 238
 239 bool
 240 fs_inst::equals(fs_inst *inst) const
 241 {
 242    return (opcode == inst->opcode &&
 243            dst.equals(inst->dst) &&
 244            src[0].equals(inst->src[0]) &&
 245            src[1].equals(inst->src[1]) &&
 246            src[2].equals(inst->src[2]) &&
 247            saturate == inst->saturate &&
 248            predicate == inst->predicate &&
 249            conditional_mod == inst->conditional_mod &&
 250            mlen == inst->mlen &&
 251            base_mrf == inst->base_mrf &&
 252            target == inst->target &&
 253            eot == inst->eot &&
 254            header_size == inst->header_size &&
 255            shadow_compare == inst->shadow_compare &&
 256            exec_size == inst->exec_size &&
 257            offset == inst->offset);
 258 }
 259
 260 bool
 261 fs_inst::overwrites_reg(const fs_reg &reg) const
 262 {
 263    return reg.in_range(dst, regs_written);
 264 }
 265
 266 bool
 267 fs_inst::is_send_from_grf() const
 268 {
 269    switch (opcode) {
 270    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
 271    case SHADER_OPCODE_SHADER_TIME_ADD:
 272    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
 273    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
 274    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
 275    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 276    case SHADER_OPCODE_UNTYPED_ATOMIC:
 277    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 278    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
 279    case SHADER_OPCODE_TYPED_ATOMIC:
 280    case SHADER_OPCODE_TYPED_SURFACE_READ:
 281    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
 282    case SHADER_OPCODE_URB_WRITE_SIMD8:
 283       return true;
 284    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 285       return src[1].file == GRF;
 286    case FS_OPCODE_FB_WRITE:
 287       return src[0].file == GRF;
 288    default:
 289       if (is_tex())
 290          return src[0].file == GRF;
 291
 292       return false;
 293    }
 294 }
 295
 296 bool
 297 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
 298 {
 299    if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
 300       return false;
 301
 302    fs_reg reg = this->src[0];
 303    if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
 304       return false;
 305
 306    if (grf_alloc.sizes[reg.reg] != this->regs_written)
 307       return false;
 308
 309    for (int i = 0; i < this->sources; i++) {
 310       reg.type = this->src[i].type;
 311       if (!this->src[i].equals(reg))
 312          return false;
 313
 314       if (i < this->header_size) {
 315          reg.reg_offset += 1;
 316       } else {
 317          reg.reg_offset += this->exec_size / 8;
 318       }
 319    }
 320
 321    return true;
 322 }
 323
 324 bool
 325 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
 326 {
 327    if (devinfo->gen == 6 && is_math())
 328       return false;
 329
 330    if (is_send_from_grf())
 331       return false;
 332
 333    if (!backend_instruction::can_do_source_mods())
 334       return false;
 335
 336    return true;
 337 }
 338
 339 bool
 340 fs_inst::has_side_effects() const
 341 {
 342    return this->eot || backend_instruction::has_side_effects();
 343 }
 344
 345 void
 346 fs_reg::init()
 347 {
 348    memset(this, 0, sizeof(*this));
 349    stride = 1;
 350 }
 351
 352 /** Generic unset register constructor. */
 353 fs_reg::fs_reg()
 354 {
 355    init();
 356    this->file = BAD_FILE;
 357 }
 358
 359 /** Immediate value constructor. */
 360 fs_reg::fs_reg(float f)
 361 {
 362    init();
 363    this->file = IMM;
 364    this->type = BRW_REGISTER_TYPE_F;
 365    this->stride = 0;
 366    this->fixed_hw_reg.dw1.f = f;
 367 }
 368
 369 /** Immediate value constructor. */
 370 fs_reg::fs_reg(int32_t i)
 371 {
 372    init();
 373    this->file = IMM;
 374    this->type = BRW_REGISTER_TYPE_D;
 375    this->stride = 0;
 376    this->fixed_hw_reg.dw1.d = i;
 377 }
 378
 379 /** Immediate value constructor. */
 380 fs_reg::fs_reg(uint32_t u)
 381 {
 382    init();
 383    this->file = IMM;
 384    this->type = BRW_REGISTER_TYPE_UD;
 385    this->stride = 0;
 386    this->fixed_hw_reg.dw1.ud = u;
 387 }
 388
 389 /** Vector float immediate value constructor. */
 390 fs_reg::fs_reg(uint8_t vf[4])
 391 {
 392    init();
 393    this->file = IMM;
 394    this->type = BRW_REGISTER_TYPE_VF;
 395    memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
 396 }
 397
 398 /** Vector float immediate value constructor. */
 399 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
 400 {
 401    init();
 402    this->file = IMM;
 403    this->type = BRW_REGISTER_TYPE_VF;
 404    this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
 405                                (vf1 <<  8) |
 406                                (vf2 << 16) |
 407                                (vf3 << 24);
 408 }
 409
 410 /** Fixed brw_reg. */
 411 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 412 {
 413    init();
 414    this->file = HW_REG;
 415    this->fixed_hw_reg = fixed_hw_reg;
 416    this->type = fixed_hw_reg.type;
 417 }
 418
 419 bool
 420 fs_reg::equals(const fs_reg &r) const
 421 {
 422    return (file == r.file &&
 423            reg == r.reg &&
 424            reg_offset == r.reg_offset &&
 425            subreg_offset == r.subreg_offset &&
 426            type == r.type &&
 427            negate == r.negate &&
 428            abs == r.abs &&
 429            !reladdr && !r.reladdr &&
 430            memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
 431            stride == r.stride);
 432 }
 433
 434 fs_reg &
 435 fs_reg::set_smear(unsigned subreg)
 436 {
 437    assert(file != HW_REG && file != IMM);
 438    subreg_offset = subreg * type_sz(type);
 439    stride = 0;
 440    return *this;
 441 }
 442
 443 bool
 444 fs_reg::is_contiguous() const
 445 {
 446    return stride == 1;
 447 }
 448
 449 unsigned
 450 fs_reg::component_size(unsigned width) const
 451 {
 452    const unsigned stride = (file != HW_REG ? this->stride :
 453                             fixed_hw_reg.hstride == 0 ? 0 :
 454                             1 << (fixed_hw_reg.hstride - 1));
 455    return MAX2(width * stride, 1) * type_sz(type);
 456 }
 457
 458 int
 459 fs_visitor::type_size(const struct glsl_type *type)
 460 {
 461    unsigned int size, i;
 462
 463    switch (type->base_type) {
 464    case GLSL_TYPE_UINT:
 465    case GLSL_TYPE_INT:
 466    case GLSL_TYPE_FLOAT:
 467    case GLSL_TYPE_BOOL:
 468       return type->components();
 469    case GLSL_TYPE_ARRAY:
 470       return type_size(type->fields.array) * type->length;
 471    case GLSL_TYPE_STRUCT:
 472       size = 0;
 473       for (i = 0; i < type->length; i++) {
 474          size += type_size(type->fields.structure[i].type);
 475       }
 476       return size;
 477    case GLSL_TYPE_SAMPLER:
 478       /* Samplers take up no register space, since they're baked in at
 479        * link time.
 480        */
 481       return 0;
 482    case GLSL_TYPE_ATOMIC_UINT:
 483       return 0;
 484    case GLSL_TYPE_SUBROUTINE:
 485       return 1;
 486    case GLSL_TYPE_IMAGE:
 487    case GLSL_TYPE_VOID:
 488    case GLSL_TYPE_ERROR:
 489    case GLSL_TYPE_INTERFACE:
 490    case GLSL_TYPE_DOUBLE:
 491       unreachable("not reached");
 492    }
 493
 494    return 0;
 495 }
 496
 497 /**
 498  * Create a MOV to read the timestamp register.
 499  *
 500  * The caller is responsible for emitting the MOV.  The return value is
 501  * the destination of the MOV, with extra parameters set.
 502  */
 503 fs_reg
 504 fs_visitor::get_timestamp(const fs_builder &bld)
 505 {
 506    assert(devinfo->gen >= 7);
 507
 508    fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 509                                           BRW_ARF_TIMESTAMP,
 510                                           0),
 511                              BRW_REGISTER_TYPE_UD));
 512
 513    fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
 514
 515    /* We want to read the 3 fields we care about even if it's not enabled in
 516     * the dispatch.
 517     */
 518    bld.group(4, 0).exec_all().MOV(dst, ts);
 519
 520    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 521     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 522     * which is plenty of time for our purposes.  It is identical across the
 523     * EUs, but since it's tracking GPU core speed it will increment at a
 524     * varying rate as render P-states change.
 525     *
 526     * The caller could also check if render P-states have changed (or anything
 527     * else that might disrupt timing) by setting smear to 2 and checking if
 528     * that field is != 0.
 529     */
 530    dst.set_smear(0);
 531
 532    return dst;
 533 }
 534
 535 void
 536 fs_visitor::emit_shader_time_begin()
 537 {
 538    shader_start_time = get_timestamp(bld.annotate("shader time start"));
 539 }
 540
 541 void
 542 fs_visitor::emit_shader_time_end()
 543 {
 544    /* Insert our code just before the final SEND with EOT. */
 545    exec_node *end = this->instructions.get_tail();
 546    assert(end && ((fs_inst *) end)->eot);
 547    const fs_builder ibld = bld.annotate("shader time end")
 548                               .exec_all().at(NULL, end);
 549
 550    fs_reg shader_end_time = get_timestamp(ibld);
 551
 552    /* Check that there weren't any timestamp reset events (assuming these
 553     * were the only two timestamp reads that happened).
 554     */
 555    fs_reg reset = shader_end_time;
 556    reset.set_smear(2);
 557    set_condmod(BRW_CONDITIONAL_Z,
 558                ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
 559    ibld.IF(BRW_PREDICATE_NORMAL);
 560
 561    fs_reg start = shader_start_time;
 562    start.negate = true;
 563    fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
 564    diff.set_smear(0);
 565
 566    const fs_builder cbld = ibld.group(1, 0);
 567    cbld.group(1, 0).ADD(diff, start, shader_end_time);
 568
 569    /* If there were no instructions between the two timestamp gets, the diff
 570     * is 2 cycles.  Remove that overhead, so I can forget about that when
 571     * trying to determine the time taken for single instructions.
 572     */
 573    cbld.ADD(diff, diff, fs_reg(-2u));
 574    SHADER_TIME_ADD(cbld, 0, diff);
 575    SHADER_TIME_ADD(cbld, 1, fs_reg(1u));
 576    ibld.emit(BRW_OPCODE_ELSE);
 577    SHADER_TIME_ADD(cbld, 2, fs_reg(1u));
 578    ibld.emit(BRW_OPCODE_ENDIF);
 579 }
 580
 581 void
 582 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
 583                             int shader_time_subindex,
 584                             fs_reg value)
 585 {
 586    int index = shader_time_index * 3 + shader_time_subindex;
 587    fs_reg offset = fs_reg(index * SHADER_TIME_STRIDE);
 588
 589    fs_reg payload;
 590    if (dispatch_width == 8)
 591       payload = vgrf(glsl_type::uvec2_type);
 592    else
 593       payload = vgrf(glsl_type::uint_type);
 594
 595    bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
 596 }
 597
 598 void
 599 fs_visitor::vfail(const char *format, va_list va)
 600 {
 601    char *msg;
 602
 603    if (failed)
 604       return;
 605
 606    failed = true;
 607
 608    msg = ralloc_vasprintf(mem_ctx, format, va);
 609    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
 610
 611    this->fail_msg = msg;
 612
 613    if (debug_enabled) {
 614       fprintf(stderr, "%s",  msg);
 615    }
 616 }
 617
 618 void
 619 fs_visitor::fail(const char *format, ...)
 620 {
 621    va_list va;
 622
 623    va_start(va, format);
 624    vfail(format, va);
 625    va_end(va);
 626 }
 627
 628 /**
 629  * Mark this program as impossible to compile in SIMD16 mode.
 630  *
 631  * During the SIMD8 compile (which happens first), we can detect and flag
 632  * things that are unsupported in SIMD16 mode, so the compiler can skip
 633  * the SIMD16 compile altogether.
 634  *
 635  * During a SIMD16 compile (if one happens anyway), this just calls fail().
 636  */
 637 void
 638 fs_visitor::no16(const char *msg)
 639 {
 640    if (dispatch_width == 16) {
 641       fail("%s", msg);
 642    } else {
 643       simd16_unsupported = true;
 644
 645       compiler->shader_perf_log(log_data,
 646                                 "SIMD16 shader failed to compile: %s", msg);
 647    }
 648 }
 649
 650 /**
 651  * Returns true if the instruction has a flag that means it won't
 652  * update an entire destination register.
 653  *
 654  * For example, dead code elimination and live variable analysis want to know
 655  * when a write to a variable screens off any preceding values that were in
 656  * it.
 657  */
 658 bool
 659 fs_inst::is_partial_write() const
 660 {
 661    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 662            (this->exec_size * type_sz(this->dst.type)) < 32 ||
 663            !this->dst.is_contiguous());
 664 }
 665
 666 unsigned
 667 fs_inst::components_read(unsigned i) const
 668 {
 669    switch (opcode) {
 670    case FS_OPCODE_LINTERP:
 671       if (i == 0)
 672          return 2;
 673       else
 674          return 1;
 675
 676    case FS_OPCODE_PIXEL_X:
 677    case FS_OPCODE_PIXEL_Y:
 678       assert(i == 0);
 679       return 2;
 680
 681    default:
 682       return 1;
 683    }
 684 }
 685
 686 int
 687 fs_inst::regs_read(int arg) const
 688 {
 689    switch (opcode) {
 690    case FS_OPCODE_FB_WRITE:
 691    case SHADER_OPCODE_URB_WRITE_SIMD8:
 692    case SHADER_OPCODE_UNTYPED_ATOMIC:
 693    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 694    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
 695    case SHADER_OPCODE_TYPED_ATOMIC:
 696    case SHADER_OPCODE_TYPED_SURFACE_READ:
 697    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
 698    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 699       if (arg == 0)
 700          return mlen;
 701       break;
 702
 703    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
 704       /* The payload is actually stored in src1 */
 705       if (arg == 1)
 706          return mlen;
 707       break;
 708
 709    case FS_OPCODE_LINTERP:
 710       if (arg == 1)
 711          return 1;
 712       break;
 713
 714    case SHADER_OPCODE_LOAD_PAYLOAD:
 715       if (arg < this->header_size)
 716          return 1;
 717       break;
 718
 719    case CS_OPCODE_CS_TERMINATE:
 720       return 1;
 721
 722    default:
 723       if (is_tex() && arg == 0 && src[0].file == GRF)
 724          return mlen;
 725       break;
 726    }
 727
 728    switch (src[arg].file) {
 729    case BAD_FILE:
 730       return 0;
 731    case UNIFORM:
 732    case IMM:
 733       return 1;
 734    case GRF:
 735    case HW_REG:
 736       return DIV_ROUND_UP(components_read(arg) *
 737                           src[arg].component_size(exec_size),
 738                           REG_SIZE);
 739    case MRF:
 740       unreachable("MRF registers are not allowed as sources");
 741    default:
 742       unreachable("Invalid register file");
 743    }
 744 }
 745
 746 bool
 747 fs_inst::reads_flag() const
 748 {
 749    return predicate;
 750 }
 751
 752 bool
 753 fs_inst::writes_flag() const
 754 {
 755    return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
 756                                opcode != BRW_OPCODE_IF &&
 757                                opcode != BRW_OPCODE_WHILE)) ||
 758           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 759 }
 760
 761 /**
 762  * Returns how many MRFs an FS opcode will write over.
 763  *
 764  * Note that this is not the 0 or 1 implied writes in an actual gen
 765  * instruction -- the FS opcodes often generate MOVs in addition.
 766  */
 767 int
 768 fs_visitor::implied_mrf_writes(fs_inst *inst)
 769 {
 770    if (inst->mlen == 0)
 771       return 0;
 772
 773    if (inst->base_mrf == -1)
 774       return 0;
 775
 776    switch (inst->opcode) {
 777    case SHADER_OPCODE_RCP:
 778    case SHADER_OPCODE_RSQ:
 779    case SHADER_OPCODE_SQRT:
 780    case SHADER_OPCODE_EXP2:
 781    case SHADER_OPCODE_LOG2:
 782    case SHADER_OPCODE_SIN:
 783    case SHADER_OPCODE_COS:
 784       return 1 * dispatch_width / 8;
 785    case SHADER_OPCODE_POW:
 786    case SHADER_OPCODE_INT_QUOTIENT:
 787    case SHADER_OPCODE_INT_REMAINDER:
 788       return 2 * dispatch_width / 8;
 789    case SHADER_OPCODE_TEX:
 790    case FS_OPCODE_TXB:
 791    case SHADER_OPCODE_TXD:
 792    case SHADER_OPCODE_TXF:
 793    case SHADER_OPCODE_TXF_CMS:
 794    case SHADER_OPCODE_TXF_MCS:
 795    case SHADER_OPCODE_TG4:
 796    case SHADER_OPCODE_TG4_OFFSET:
 797    case SHADER_OPCODE_TXL:
 798    case SHADER_OPCODE_TXS:
 799    case SHADER_OPCODE_LOD:
 800       return 1;
 801    case FS_OPCODE_FB_WRITE:
 802       return 2;
 803    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 804    case SHADER_OPCODE_GEN4_SCRATCH_READ:
 805       return 1;
 806    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 807       return inst->mlen;
 808    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
 809       return inst->mlen;
 810    case SHADER_OPCODE_UNTYPED_ATOMIC:
 811    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 812    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
 813    case SHADER_OPCODE_TYPED_ATOMIC:
 814    case SHADER_OPCODE_TYPED_SURFACE_READ:
 815    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
 816    case SHADER_OPCODE_URB_WRITE_SIMD8:
 817    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
 818    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
 819    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
 820    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 821       return 0;
 822    default:
 823       unreachable("not reached");
 824    }
 825 }
 826
 827 fs_reg
 828 fs_visitor::vgrf(const glsl_type *const type)
 829 {
 830    int reg_width = dispatch_width / 8;
 831    return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
 832                  brw_type_for_base_type(type));
 833 }
 834
 835 /** Fixed HW reg constructor. */
 836 fs_reg::fs_reg(enum register_file file, int reg)
 837 {
 838    init();
 839    this->file = file;
 840    this->reg = reg;
 841    this->type = BRW_REGISTER_TYPE_F;
 842    this->stride = (file == UNIFORM ? 0 : 1);
 843 }
 844
 845 /** Fixed HW reg constructor. */
 846 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
 847 {
 848    init();
 849    this->file = file;
 850    this->reg = reg;
 851    this->type = type;
 852    this->stride = (file == UNIFORM ? 0 : 1);
 853 }
 854
 855 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 856  * This brings in those uniform definitions
 857  */
 858 void
 859 fs_visitor::import_uniforms(fs_visitor *v)
 860 {
 861    this->push_constant_loc = v->push_constant_loc;
 862    this->pull_constant_loc = v->pull_constant_loc;
 863    this->uniforms = v->uniforms;
 864    this->param_size = v->param_size;
 865 }
 866
 867 fs_reg *
 868 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
 869                                          bool origin_upper_left)
 870 {
 871    assert(stage == MESA_SHADER_FRAGMENT);
 872    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 873    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
 874    fs_reg wpos = *reg;
 875    bool flip = !origin_upper_left ^ key->render_to_fbo;
 876
 877    /* gl_FragCoord.x */
 878    if (pixel_center_integer) {
 879       bld.MOV(wpos, this->pixel_x);
 880    } else {
 881       bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
 882    }
 883    wpos = offset(wpos, bld, 1);
 884
 885    /* gl_FragCoord.y */
 886    if (!flip && pixel_center_integer) {
 887       bld.MOV(wpos, this->pixel_y);
 888    } else {
 889       fs_reg pixel_y = this->pixel_y;
 890       float offset = (pixel_center_integer ? 0.0 : 0.5);
 891
 892       if (flip) {
 893          pixel_y.negate = true;
 894          offset += key->drawable_height - 1.0;
 895       }
 896
 897       bld.ADD(wpos, pixel_y, fs_reg(offset));
 898    }
 899    wpos = offset(wpos, bld, 1);
 900
 901    /* gl_FragCoord.z */
 902    if (devinfo->gen >= 6) {
 903       bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
 904    } else {
 905       bld.emit(FS_OPCODE_LINTERP, wpos,
 906            this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 907            interp_reg(VARYING_SLOT_POS, 2));
 908    }
 909    wpos = offset(wpos, bld, 1);
 910
 911    /* gl_FragCoord.w: Already set up in emit_interpolation */
 912    bld.MOV(wpos, this->wpos_w);
 913
 914    return reg;
 915 }
 916
 917 fs_inst *
 918 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
 919                          glsl_interp_qualifier interpolation_mode,
 920                          bool is_centroid, bool is_sample)
 921 {
 922    brw_wm_barycentric_interp_mode barycoord_mode;
 923    if (devinfo->gen >= 6) {
 924       if (is_centroid) {
 925          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 926             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
 927          else
 928             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
 929       } else if (is_sample) {
 930           if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 931             barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
 932          else
 933             barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
 934       } else {
 935          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 936             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
 937          else
 938             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
 939       }
 940    } else {
 941       /* On Ironlake and below, there is only one interpolation mode.
 942        * Centroid interpolation doesn't mean anything on this hardware --
 943        * there is no multisampling.
 944        */
 945       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
 946    }
 947    return bld.emit(FS_OPCODE_LINTERP, attr,
 948                    this->delta_xy[barycoord_mode], interp);
 949 }
 950
 951 void
 952 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
 953                                        const glsl_type *type,
 954                                        glsl_interp_qualifier interpolation_mode,
 955                                        int location, bool mod_centroid,
 956                                        bool mod_sample)
 957 {
 958    attr.type = brw_type_for_base_type(type->get_scalar_type());
 959
 960    assert(stage == MESA_SHADER_FRAGMENT);
 961    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
 962    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 963
 964    unsigned int array_elements;
 965
 966    if (type->is_array()) {
 967       array_elements = type->length;
 968       if (array_elements == 0) {
 969          fail("dereferenced array '%s' has length 0\n", name);
 970       }
 971       type = type->fields.array;
 972    } else {
 973       array_elements = 1;
 974    }
 975
 976    if (interpolation_mode == INTERP_QUALIFIER_NONE) {
 977       bool is_gl_Color =
 978          location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
 979       if (key->flat_shade && is_gl_Color) {
 980          interpolation_mode = INTERP_QUALIFIER_FLAT;
 981       } else {
 982          interpolation_mode = INTERP_QUALIFIER_SMOOTH;
 983       }
 984    }
 985
 986    for (unsigned int i = 0; i < array_elements; i++) {
 987       for (unsigned int j = 0; j < type->matrix_columns; j++) {
 988          if (prog_data->urb_setup[location] == -1) {
 989             /* If there's no incoming setup data for this slot, don't
 990              * emit interpolation for it.
 991              */
 992             attr = offset(attr, bld, type->vector_elements);
 993             location++;
 994             continue;
 995          }
 996
 997          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
 998             /* Constant interpolation (flat shading) case. The SF has
 999              * handed us defined values in only the constant offset
1000              * field of the setup reg.
1001              */
1002             for (unsigned int k = 0; k < type->vector_elements; k++) {
1003                struct brw_reg interp = interp_reg(location, k);
1004                interp = suboffset(interp, 3);
1005                interp.type = attr.type;
1006                bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1007                attr = offset(attr, bld, 1);
1008             }
1009          } else {
1010             /* Smooth/noperspective interpolation case. */
1011             for (unsigned int k = 0; k < type->vector_elements; k++) {
1012                struct brw_reg interp = interp_reg(location, k);
1013                if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1014                   /* Get the pixel/sample mask into f0 so that we know
1015                    * which pixels are lit.  Then, for each channel that is
1016                    * unlit, replace the centroid data with non-centroid
1017                    * data.
1018                    */
1019                   bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1020
1021                   fs_inst *inst;
1022                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1023                                       false, false);
1024                   inst->predicate = BRW_PREDICATE_NORMAL;
1025                   inst->predicate_inverse = true;
1026                   if (devinfo->has_pln)
1027                      inst->no_dd_clear = true;
1028
1029                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1030                                       mod_centroid && !key->persample_shading,
1031                                       mod_sample || key->persample_shading);
1032                   inst->predicate = BRW_PREDICATE_NORMAL;
1033                   inst->predicate_inverse = false;
1034                   if (devinfo->has_pln)
1035                      inst->no_dd_check = true;
1036
1037                } else {
1038                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
1039                                mod_centroid && !key->persample_shading,
1040                                mod_sample || key->persample_shading);
1041                }
1042                if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1043                   bld.MUL(attr, attr, this->pixel_w);
1044                }
1045                attr = offset(attr, bld, 1);
1046             }
1047
1048          }
1049          location++;
1050       }
1051    }
1052 }
1053
1054 fs_reg *
1055 fs_visitor::emit_frontfacing_interpolation()
1056 {
1057    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1058
1059    if (devinfo->gen >= 6) {
1060       /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1061        * a boolean result from this (~0/true or 0/false).
1062        *
1063        * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1064        * this task in only one instruction:
1065        *    - a negation source modifier will flip the bit; and
1066        *    - a W -> D type conversion will sign extend the bit into the high
1067        *      word of the destination.
1068        *
1069        * An ASR 15 fills the low word of the destination.
1070        */
1071       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1072       g0.negate = true;
1073
1074       bld.ASR(*reg, g0, fs_reg(15));
1075    } else {
1076       /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1077        * a boolean result from this (1/true or 0/false).
1078        *
1079        * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1080        * the negation source modifier to flip it. Unfortunately the SHR
1081        * instruction only operates on UD (or D with an abs source modifier)
1082        * sources without negation.
1083        *
1084        * Instead, use ASR (which will give ~0/true or 0/false).
1085        */
1086       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1087       g1_6.negate = true;
1088
1089       bld.ASR(*reg, g1_6, fs_reg(31));
1090    }
1091
1092    return reg;
1093 }
1094
1095 void
1096 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1097 {
1098    assert(stage == MESA_SHADER_FRAGMENT);
1099    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1100    assert(dst.type == BRW_REGISTER_TYPE_F);
1101
1102    if (key->compute_pos_offset) {
1103       /* Convert int_sample_pos to floating point */
1104       bld.MOV(dst, int_sample_pos);
1105       /* Scale to the range [0, 1] */
1106       bld.MUL(dst, dst, fs_reg(1 / 16.0f));
1107    }
1108    else {
1109       /* From ARB_sample_shading specification:
1110        * "When rendering to a non-multisample buffer, or if multisample
1111        *  rasterization is disabled, gl_SamplePosition will always be
1112        *  (0.5, 0.5).
1113        */
1114       bld.MOV(dst, fs_reg(0.5f));
1115    }
1116 }
1117
1118 fs_reg *
1119 fs_visitor::emit_samplepos_setup()
1120 {
1121    assert(devinfo->gen >= 6);
1122
1123    const fs_builder abld = bld.annotate("compute sample position");
1124    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1125    fs_reg pos = *reg;
1126    fs_reg int_sample_x = vgrf(glsl_type::int_type);
1127    fs_reg int_sample_y = vgrf(glsl_type::int_type);
1128
1129    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1130     * mode will be enabled.
1131     *
1132     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1133     * R31.1:0         Position Offset X/Y for Slot[3:0]
1134     * R31.3:2         Position Offset X/Y for Slot[7:4]
1135     * .....
1136     *
1137     * The X, Y sample positions come in as bytes in  thread payload. So, read
1138     * the positions using vstride=16, width=8, hstride=2.
1139     */
1140    struct brw_reg sample_pos_reg =
1141       stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1142                     BRW_REGISTER_TYPE_B), 16, 8, 2);
1143
1144    if (dispatch_width == 8) {
1145       abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
1146    } else {
1147       abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
1148       abld.half(1).MOV(half(int_sample_x, 1),
1149                        fs_reg(suboffset(sample_pos_reg, 16)));
1150    }
1151    /* Compute gl_SamplePosition.x */
1152    compute_sample_position(pos, int_sample_x);
1153    pos = offset(pos, abld, 1);
1154    if (dispatch_width == 8) {
1155       abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
1156    } else {
1157       abld.half(0).MOV(half(int_sample_y, 0),
1158                        fs_reg(suboffset(sample_pos_reg, 1)));
1159       abld.half(1).MOV(half(int_sample_y, 1),
1160                        fs_reg(suboffset(sample_pos_reg, 17)));
1161    }
1162    /* Compute gl_SamplePosition.y */
1163    compute_sample_position(pos, int_sample_y);
1164    return reg;
1165 }
1166
1167 fs_reg *
1168 fs_visitor::emit_sampleid_setup()
1169 {
1170    assert(stage == MESA_SHADER_FRAGMENT);
1171    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1172    assert(devinfo->gen >= 6);
1173
1174    const fs_builder abld = bld.annotate("compute sample id");
1175    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1176
1177    if (key->compute_sample_id) {
1178       fs_reg t1 = vgrf(glsl_type::int_type);
1179       fs_reg t2 = vgrf(glsl_type::int_type);
1180       t2.type = BRW_REGISTER_TYPE_UW;
1181
1182       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1183        * 8x multisampling, subspan 0 will represent sample N (where N
1184        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1185        * 7. We can find the value of N by looking at R0.0 bits 7:6
1186        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1187        * (since samples are always delivered in pairs). That is, we
1188        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1189        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1190        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1191        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1192        * populating a temporary variable with the sequence (0, 1, 2, 3),
1193        * and then reading from it using vstride=1, width=4, hstride=0.
1194        * These computations hold good for 4x multisampling as well.
1195        *
1196        * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1197        * the first four slots are sample 0 of subspan 0; the next four
1198        * are sample 1 of subspan 0; the third group is sample 0 of
1199        * subspan 1, and finally sample 1 of subspan 1.
1200        */
1201       abld.exec_all()
1202           .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1203                fs_reg(0xc0));
1204       abld.exec_all().SHR(t1, t1, fs_reg(5));
1205
1206       /* This works for both SIMD8 and SIMD16 */
1207       abld.exec_all()
1208           .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
1209
1210       /* This special instruction takes care of setting vstride=1,
1211        * width=4, hstride=0 of t2 during an ADD instruction.
1212        */
1213       abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1214    } else {
1215       /* As per GL_ARB_sample_shading specification:
1216        * "When rendering to a non-multisample buffer, or if multisample
1217        *  rasterization is disabled, gl_SampleID will always be zero."
1218        */
1219       abld.MOV(*reg, fs_reg(0));
1220    }
1221
1222    return reg;
1223 }
1224
1225 void
1226 fs_visitor::resolve_source_modifiers(fs_reg *src)
1227 {
1228    if (!src->abs && !src->negate)
1229       return;
1230
1231    fs_reg temp = bld.vgrf(src->type);
1232    bld.MOV(temp, *src);
1233    *src = temp;
1234 }
1235
1236 void
1237 fs_visitor::emit_discard_jump()
1238 {
1239    assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1240
1241    /* For performance, after a discard, jump to the end of the
1242     * shader if all relevant channels have been discarded.
1243     */
1244    fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
1245    discard_jump->flag_subreg = 1;
1246
1247    discard_jump->predicate = (dispatch_width == 8)
1248                              ? BRW_PREDICATE_ALIGN1_ANY8H
1249                              : BRW_PREDICATE_ALIGN1_ANY16H;
1250    discard_jump->predicate_inverse = true;
1251 }
1252
1253 void
1254 fs_visitor::assign_curb_setup()
1255 {
1256    if (dispatch_width == 8) {
1257       prog_data->dispatch_grf_start_reg = payload.num_regs;
1258    } else {
1259       if (stage == MESA_SHADER_FRAGMENT) {
1260          brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1261          prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1262       } else if (stage == MESA_SHADER_COMPUTE) {
1263          brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1264          prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1265       } else {
1266          unreachable("Unsupported shader type!");
1267       }
1268    }
1269
1270    prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1271
1272    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1273    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1274       for (unsigned int i = 0; i < inst->sources; i++) {
1275          if (inst->src[i].file == UNIFORM) {
1276             int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1277             int constant_nr;
1278             if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1279                constant_nr = push_constant_loc[uniform_nr];
1280             } else {
1281                /* Section 5.11 of the OpenGL 4.1 spec says:
1282                 * "Out-of-bounds reads return undefined values, which include
1283                 *  values from other variables of the active program or zero."
1284                 * Just return the first push constant.
1285                 */
1286                constant_nr = 0;
1287             }
1288
1289             struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1290                                                   constant_nr / 8,
1291                                                   constant_nr % 8);
1292
1293             assert(inst->src[i].stride == 0);
1294             inst->src[i].file = HW_REG;
1295             inst->src[i].fixed_hw_reg = byte_offset(
1296                retype(brw_reg, inst->src[i].type),
1297                inst->src[i].subreg_offset);
1298          }
1299       }
1300    }
1301 }
1302
1303 void
1304 fs_visitor::calculate_urb_setup()
1305 {
1306    assert(stage == MESA_SHADER_FRAGMENT);
1307    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1308    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1309
1310    memset(prog_data->urb_setup, -1,
1311           sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1312
1313    int urb_next = 0;
1314    /* Figure out where each of the incoming setup attributes lands. */
1315    if (devinfo->gen >= 6) {
1316       if (_mesa_bitcount_64(prog->InputsRead &
1317                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1318          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1319           * first 16 varying inputs, so we can put them wherever we want.
1320           * Just put them in order.
1321           *
1322           * This is useful because it means that (a) inputs not used by the
1323           * fragment shader won't take up valuable register space, and (b) we
1324           * won't have to recompile the fragment shader if it gets paired with
1325           * a different vertex (or geometry) shader.
1326           */
1327          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1328             if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1329                 BITFIELD64_BIT(i)) {
1330                prog_data->urb_setup[i] = urb_next++;
1331             }
1332          }
1333       } else {
1334          /* We have enough input varyings that the SF/SBE pipeline stage can't
1335           * arbitrarily rearrange them to suit our whim; we have to put them
1336           * in an order that matches the output of the previous pipeline stage
1337           * (geometry or vertex shader).
1338           */
1339          struct brw_vue_map prev_stage_vue_map;
1340          brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1341                              key->input_slots_valid);
1342          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1343          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1344          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1345               slot++) {
1346             int varying = prev_stage_vue_map.slot_to_varying[slot];
1347             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1348              * unused.
1349              */
1350             if (varying != BRW_VARYING_SLOT_COUNT &&
1351                 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1352                  BITFIELD64_BIT(varying))) {
1353                prog_data->urb_setup[varying] = slot - first_slot;
1354             }
1355          }
1356          urb_next = prev_stage_vue_map.num_slots - first_slot;
1357       }
1358    } else {
1359       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1360       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1361          /* Point size is packed into the header, not as a general attribute */
1362          if (i == VARYING_SLOT_PSIZ)
1363             continue;
1364
1365          if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1366             /* The back color slot is skipped when the front color is
1367              * also written to.  In addition, some slots can be
1368              * written in the vertex shader and not read in the
1369              * fragment shader.  So the register number must always be
1370              * incremented, mapped or not.
1371              */
1372             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1373                prog_data->urb_setup[i] = urb_next;
1374             urb_next++;
1375          }
1376       }
1377
1378       /*
1379        * It's a FS only attribute, and we did interpolation for this attribute
1380        * in SF thread. So, count it here, too.
1381        *
1382        * See compile_sf_prog() for more info.
1383        */
1384       if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1385          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1386    }
1387
1388    prog_data->num_varying_inputs = urb_next;
1389 }
1390
1391 void
1392 fs_visitor::assign_urb_setup()
1393 {
1394    assert(stage == MESA_SHADER_FRAGMENT);
1395    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1396
1397    int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1398
1399    /* Offset all the urb_setup[] index by the actual position of the
1400     * setup regs, now that the location of the constants has been chosen.
1401     */
1402    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1403       if (inst->opcode == FS_OPCODE_LINTERP) {
1404          assert(inst->src[1].file == HW_REG);
1405          inst->src[1].fixed_hw_reg.nr += urb_start;
1406       }
1407
1408       if (inst->opcode == FS_OPCODE_CINTERP) {
1409          assert(inst->src[0].file == HW_REG);
1410          inst->src[0].fixed_hw_reg.nr += urb_start;
1411       }
1412    }
1413
1414    /* Each attribute is 4 setup channels, each of which is half a reg. */
1415    this->first_non_payload_grf =
1416       urb_start + prog_data->num_varying_inputs * 2;
1417 }
1418
1419 void
1420 fs_visitor::assign_vs_urb_setup()
1421 {
1422    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1423    int grf, count, slot, channel, attr;
1424
1425    assert(stage == MESA_SHADER_VERTEX);
1426    count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1427    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1428       count++;
1429
1430    /* Each attribute is 4 regs. */
1431    this->first_non_payload_grf =
1432       payload.num_regs + prog_data->curb_read_length + count * 4;
1433
1434    unsigned vue_entries =
1435       MAX2(count, vs_prog_data->base.vue_map.num_slots);
1436
1437    vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1438    vs_prog_data->base.urb_read_length = (count + 1) / 2;
1439
1440    assert(vs_prog_data->base.urb_read_length <= 15);
1441
1442    /* Rewrite all ATTR file references to the hw grf that they land in. */
1443    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1444       for (int i = 0; i < inst->sources; i++) {
1445          if (inst->src[i].file == ATTR) {
1446
1447             if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1448                slot = count - 1;
1449             } else {
1450                /* Attributes come in in a contiguous block, ordered by their
1451                 * gl_vert_attrib value.  That means we can compute the slot
1452                 * number for an attribute by masking out the enabled
1453                 * attributes before it and counting the bits.
1454                 */
1455                attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1456                slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1457                                         BITFIELD64_MASK(attr));
1458             }
1459
1460             channel = inst->src[i].reg_offset & 3;
1461
1462             grf = payload.num_regs +
1463                prog_data->curb_read_length +
1464                slot * 4 + channel;
1465
1466             inst->src[i].file = HW_REG;
1467             inst->src[i].fixed_hw_reg =
1468                retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1469          }
1470       }
1471    }
1472 }
1473
1474 /**
1475  * Split large virtual GRFs into separate components if we can.
1476  *
1477  * This is mostly duplicated with what brw_fs_vector_splitting does,
1478  * but that's really conservative because it's afraid of doing
1479  * splitting that doesn't result in real progress after the rest of
1480  * the optimization phases, which would cause infinite looping in
1481  * optimization.  We can do it once here, safely.  This also has the
1482  * opportunity to split interpolated values, or maybe even uniforms,
1483  * which we don't have at the IR level.
1484  *
1485  * We want to split, because virtual GRFs are what we register
1486  * allocate and spill (due to contiguousness requirements for some
1487  * instructions), and they're what we naturally generate in the
1488  * codegen process, but most virtual GRFs don't actually need to be
1489  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1490  * live intervals and better dead code elimination and coalescing.
1491  */
1492 void
1493 fs_visitor::split_virtual_grfs()
1494 {
1495    int num_vars = this->alloc.count;
1496
1497    /* Count the total number of registers */
1498    int reg_count = 0;
1499    int vgrf_to_reg[num_vars];
1500    for (int i = 0; i < num_vars; i++) {
1501       vgrf_to_reg[i] = reg_count;
1502       reg_count += alloc.sizes[i];
1503    }
1504
1505    /* An array of "split points".  For each register slot, this indicates
1506     * if this slot can be separated from the previous slot.  Every time an
1507     * instruction uses multiple elements of a register (as a source or
1508     * destination), we mark the used slots as inseparable.  Then we go
1509     * through and split the registers into the smallest pieces we can.
1510     */
1511    bool split_points[reg_count];
1512    memset(split_points, 0, sizeof(split_points));
1513
1514    /* Mark all used registers as fully splittable */
1515    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1516       if (inst->dst.file == GRF) {
1517          int reg = vgrf_to_reg[inst->dst.reg];
1518          for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1519             split_points[reg + j] = true;
1520       }
1521
1522       for (int i = 0; i < inst->sources; i++) {
1523          if (inst->src[i].file == GRF) {
1524             int reg = vgrf_to_reg[inst->src[i].reg];
1525             for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1526                split_points[reg + j] = true;
1527          }
1528       }
1529    }
1530
1531    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1532       if (inst->dst.file == GRF) {
1533          int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1534          for (int j = 1; j < inst->regs_written; j++)
1535             split_points[reg + j] = false;
1536       }
1537       for (int i = 0; i < inst->sources; i++) {
1538          if (inst->src[i].file == GRF) {
1539             int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1540             for (int j = 1; j < inst->regs_read(i); j++)
1541                split_points[reg + j] = false;
1542          }
1543       }
1544    }
1545
1546    int new_virtual_grf[reg_count];
1547    int new_reg_offset[reg_count];
1548
1549    int reg = 0;
1550    for (int i = 0; i < num_vars; i++) {
1551       /* The first one should always be 0 as a quick sanity check. */
1552       assert(split_points[reg] == false);
1553
1554       /* j = 0 case */
1555       new_reg_offset[reg] = 0;
1556       reg++;
1557       int offset = 1;
1558
1559       /* j > 0 case */
1560       for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1561          /* If this is a split point, reset the offset to 0 and allocate a
1562           * new virtual GRF for the previous offset many registers
1563           */
1564          if (split_points[reg]) {
1565             assert(offset <= MAX_VGRF_SIZE);
1566             int grf = alloc.allocate(offset);
1567             for (int k = reg - offset; k < reg; k++)
1568                new_virtual_grf[k] = grf;
1569             offset = 0;
1570          }
1571          new_reg_offset[reg] = offset;
1572          offset++;
1573          reg++;
1574       }
1575
1576       /* The last one gets the original register number */
1577       assert(offset <= MAX_VGRF_SIZE);
1578       alloc.sizes[i] = offset;
1579       for (int k = reg - offset; k < reg; k++)
1580          new_virtual_grf[k] = i;
1581    }
1582    assert(reg == reg_count);
1583
1584    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1585       if (inst->dst.file == GRF) {
1586          reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1587          inst->dst.reg = new_virtual_grf[reg];
1588          inst->dst.reg_offset = new_reg_offset[reg];
1589          assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1590       }
1591       for (int i = 0; i < inst->sources; i++) {
1592          if (inst->src[i].file == GRF) {
1593             reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1594             inst->src[i].reg = new_virtual_grf[reg];
1595             inst->src[i].reg_offset = new_reg_offset[reg];
1596             assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1597          }
1598       }
1599    }
1600    invalidate_live_intervals();
1601 }
1602
1603 /**
1604  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1605  *
1606  * During code generation, we create tons of temporary variables, many of
1607  * which get immediately killed and are never used again.  Yet, in later
1608  * optimization and analysis passes, such as compute_live_intervals, we need
1609  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1610  * overhead.
1611  */
1612 bool
1613 fs_visitor::compact_virtual_grfs()
1614 {
1615    bool progress = false;
1616    int remap_table[this->alloc.count];
1617    memset(remap_table, -1, sizeof(remap_table));
1618
1619    /* Mark which virtual GRFs are used. */
1620    foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1621       if (inst->dst.file == GRF)
1622          remap_table[inst->dst.reg] = 0;
1623
1624       for (int i = 0; i < inst->sources; i++) {
1625          if (inst->src[i].file == GRF)
1626             remap_table[inst->src[i].reg] = 0;
1627       }
1628    }
1629
1630    /* Compact the GRF arrays. */
1631    int new_index = 0;
1632    for (unsigned i = 0; i < this->alloc.count; i++) {
1633       if (remap_table[i] == -1) {
1634          /* We just found an unused register.  This means that we are
1635           * actually going to compact something.
1636           */
1637          progress = true;
1638       } else {
1639          remap_table[i] = new_index;
1640          alloc.sizes[new_index] = alloc.sizes[i];
1641          invalidate_live_intervals();
1642          ++new_index;
1643       }
1644    }
1645
1646    this->alloc.count = new_index;
1647
1648    /* Patch all the instructions to use the newly renumbered registers */
1649    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1650       if (inst->dst.file == GRF)
1651          inst->dst.reg = remap_table[inst->dst.reg];
1652
1653       for (int i = 0; i < inst->sources; i++) {
1654          if (inst->src[i].file == GRF)
1655             inst->src[i].reg = remap_table[inst->src[i].reg];
1656       }
1657    }
1658
1659    /* Patch all the references to delta_xy, since they're used in register
1660     * allocation.  If they're unused, switch them to BAD_FILE so we don't
1661     * think some random VGRF is delta_xy.
1662     */
1663    for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
1664       if (delta_xy[i].file == GRF) {
1665          if (remap_table[delta_xy[i].reg] != -1) {
1666             delta_xy[i].reg = remap_table[delta_xy[i].reg];
1667          } else {
1668             delta_xy[i].file = BAD_FILE;
1669          }
1670       }
1671    }
1672
1673    return progress;
1674 }
1675
1676 /*
1677  * Implements array access of uniforms by inserting a
1678  * PULL_CONSTANT_LOAD instruction.
1679  *
1680  * Unlike temporary GRF array access (where we don't support it due to
1681  * the difficulty of doing relative addressing on instruction
1682  * destinations), we could potentially do array access of uniforms
1683  * that were loaded in GRF space as push constants.  In real-world
1684  * usage we've seen, though, the arrays being used are always larger
1685  * than we could load as push constants, so just always move all
1686  * uniform array access out to a pull constant buffer.
1687  */
1688 void
1689 fs_visitor::move_uniform_array_access_to_pull_constants()
1690 {
1691    if (dispatch_width != 8)
1692       return;
1693
1694    pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1695    memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
1696
1697    /* Walk through and find array access of uniforms.  Put a copy of that
1698     * uniform in the pull constant buffer.
1699     *
1700     * Note that we don't move constant-indexed accesses to arrays.  No
1701     * testing has been done of the performance impact of this choice.
1702     */
1703    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
1704       for (int i = 0 ; i < inst->sources; i++) {
1705          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1706             continue;
1707
1708          int uniform = inst->src[i].reg;
1709
1710          /* If this array isn't already present in the pull constant buffer,
1711           * add it.
1712           */
1713          if (pull_constant_loc[uniform] == -1) {
1714             const gl_constant_value **values = &stage_prog_data->param[uniform];
1715
1716             assert(param_size[uniform]);
1717
1718             for (int j = 0; j < param_size[uniform]; j++) {
1719                pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1720
1721                stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1722                   values[j];
1723             }
1724          }
1725       }
1726    }
1727 }
1728
1729 /**
1730  * Assign UNIFORM file registers to either push constants or pull constants.
1731  *
1732  * We allow a fragment shader to have more than the specified minimum
1733  * maximum number of fragment shader uniform components (64).  If
1734  * there are too many of these, they'd fill up all of register space.
1735  * So, this will push some of them out to the pull constant buffer and
1736  * update the program to load them.
1737  */
1738 void
1739 fs_visitor::assign_constant_locations()
1740 {
1741    /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1742    if (dispatch_width != 8)
1743       return;
1744
1745    /* Find which UNIFORM registers are still in use. */
1746    bool is_live[uniforms];
1747    for (unsigned int i = 0; i < uniforms; i++) {
1748       is_live[i] = false;
1749    }
1750
1751    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1752       for (int i = 0; i < inst->sources; i++) {
1753          if (inst->src[i].file != UNIFORM)
1754             continue;
1755
1756          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1757          if (constant_nr >= 0 && constant_nr < (int) uniforms)
1758             is_live[constant_nr] = true;
1759       }
1760    }
1761
1762    /* Only allow 16 registers (128 uniform components) as push constants.
1763     *
1764     * Just demote the end of the list.  We could probably do better
1765     * here, demoting things that are rarely used in the program first.
1766     *
1767     * If changing this value, note the limitation about total_regs in
1768     * brw_curbe.c.
1769     */
1770    unsigned int max_push_components = 16 * 8;
1771    unsigned int num_push_constants = 0;
1772
1773    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1774
1775    for (unsigned int i = 0; i < uniforms; i++) {
1776       if (!is_live[i] || pull_constant_loc[i] != -1) {
1777          /* This UNIFORM register is either dead, or has already been demoted
1778           * to a pull const.  Mark it as no longer living in the param[] array.
1779           */
1780          push_constant_loc[i] = -1;
1781          continue;
1782       }
1783
1784       if (num_push_constants < max_push_components) {
1785          /* Retain as a push constant.  Record the location in the params[]
1786           * array.
1787           */
1788          push_constant_loc[i] = num_push_constants++;
1789       } else {
1790          /* Demote to a pull constant. */
1791          push_constant_loc[i] = -1;
1792
1793          int pull_index = stage_prog_data->nr_pull_params++;
1794          stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1795          pull_constant_loc[i] = pull_index;
1796       }
1797    }
1798
1799    stage_prog_data->nr_params = num_push_constants;
1800
1801    /* Up until now, the param[] array has been indexed by reg + reg_offset
1802     * of UNIFORM registers.  Condense it to only contain the uniforms we
1803     * chose to upload as push constants.
1804     */
1805    for (unsigned int i = 0; i < uniforms; i++) {
1806       int remapped = push_constant_loc[i];
1807
1808       if (remapped == -1)
1809          continue;
1810
1811       assert(remapped <= (int)i);
1812       stage_prog_data->param[remapped] = stage_prog_data->param[i];
1813    }
1814 }
1815
1816 /**
1817  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1818  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1819  */
1820 void
1821 fs_visitor::demote_pull_constants()
1822 {
1823    foreach_block_and_inst (block, fs_inst, inst, cfg) {
1824       for (int i = 0; i < inst->sources; i++) {
1825          if (inst->src[i].file != UNIFORM)
1826             continue;
1827
1828          int pull_index;
1829          unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
1830          if (location >= uniforms) /* Out of bounds access */
1831             pull_index = -1;
1832          else
1833             pull_index = pull_constant_loc[location];
1834
1835          if (pull_index == -1)
1836             continue;
1837
1838          /* Set up the annotation tracking for new generated instructions. */
1839          const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
1840                                     .at(block, inst);
1841          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1842          fs_reg dst = vgrf(glsl_type::float_type);
1843
1844          assert(inst->src[i].stride == 0);
1845
1846          /* Generate a pull load into dst. */
1847          if (inst->src[i].reladdr) {
1848             VARYING_PULL_CONSTANT_LOAD(ibld, dst,
1849                                        surf_index,
1850                                        *inst->src[i].reladdr,
1851                                        pull_index);
1852             inst->src[i].reladdr = NULL;
1853             inst->src[i].stride = 1;
1854          } else {
1855             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1856             ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1857                       dst, surf_index, offset);
1858             inst->src[i].set_smear(pull_index & 3);
1859          }
1860
1861          /* Rewrite the instruction to use the temporary VGRF. */
1862          inst->src[i].file = GRF;
1863          inst->src[i].reg = dst.reg;
1864          inst->src[i].reg_offset = 0;
1865       }
1866    }
1867    invalidate_live_intervals();
1868 }
1869
1870 bool
1871 fs_visitor::opt_algebraic()
1872 {
1873    bool progress = false;
1874
1875    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1876       switch (inst->opcode) {
1877       case BRW_OPCODE_MOV:
1878          if (inst->src[0].file != IMM)
1879             break;
1880
1881          if (inst->saturate) {
1882             if (inst->dst.type != inst->src[0].type)
1883                assert(!"unimplemented: saturate mixed types");
1884
1885             if (brw_saturate_immediate(inst->dst.type,
1886                                        &inst->src[0].fixed_hw_reg)) {
1887                inst->saturate = false;
1888                progress = true;
1889             }
1890          }
1891          break;
1892
1893       case BRW_OPCODE_MUL:
1894          if (inst->src[1].file != IMM)
1895             continue;
1896
1897          /* a * 1.0 = a */
1898          if (inst->src[1].is_one()) {
1899             inst->opcode = BRW_OPCODE_MOV;
1900             inst->src[1] = reg_undef;
1901             progress = true;
1902             break;
1903          }
1904
1905          /* a * -1.0 = -a */
1906          if (inst->src[1].is_negative_one()) {
1907             inst->opcode = BRW_OPCODE_MOV;
1908             inst->src[0].negate = !inst->src[0].negate;
1909             inst->src[1] = reg_undef;
1910             progress = true;
1911             break;
1912          }
1913
1914          /* a * 0.0 = 0.0 */
1915          if (inst->src[1].is_zero()) {
1916             inst->opcode = BRW_OPCODE_MOV;
1917             inst->src[0] = inst->src[1];
1918             inst->src[1] = reg_undef;
1919             progress = true;
1920             break;
1921          }
1922
1923          if (inst->src[0].file == IMM) {
1924             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
1925             inst->opcode = BRW_OPCODE_MOV;
1926             inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
1927             inst->src[1] = reg_undef;
1928             progress = true;
1929             break;
1930          }
1931          break;
1932       case BRW_OPCODE_ADD:
1933          if (inst->src[1].file != IMM)
1934             continue;
1935
1936          /* a + 0.0 = a */
1937          if (inst->src[1].is_zero()) {
1938             inst->opcode = BRW_OPCODE_MOV;
1939             inst->src[1] = reg_undef;
1940             progress = true;
1941             break;
1942          }
1943
1944          if (inst->src[0].file == IMM) {
1945             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
1946             inst->opcode = BRW_OPCODE_MOV;
1947             inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
1948             inst->src[1] = reg_undef;
1949             progress = true;
1950             break;
1951          }
1952          break;
1953       case BRW_OPCODE_OR:
1954          if (inst->src[0].equals(inst->src[1])) {
1955             inst->opcode = BRW_OPCODE_MOV;
1956             inst->src[1] = reg_undef;
1957             progress = true;
1958             break;
1959          }
1960          break;
1961       case BRW_OPCODE_LRP:
1962          if (inst->src[1].equals(inst->src[2])) {
1963             inst->opcode = BRW_OPCODE_MOV;
1964             inst->src[0] = inst->src[1];
1965             inst->src[1] = reg_undef;
1966             inst->src[2] = reg_undef;
1967             progress = true;
1968             break;
1969          }
1970          break;
1971       case BRW_OPCODE_CMP:
1972          if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
1973              inst->src[0].abs &&
1974              inst->src[0].negate &&
1975              inst->src[1].is_zero()) {
1976             inst->src[0].abs = false;
1977             inst->src[0].negate = false;
1978             inst->conditional_mod = BRW_CONDITIONAL_Z;
1979             progress = true;
1980             break;
1981          }
1982          break;
1983       case BRW_OPCODE_SEL:
1984          if (inst->src[0].equals(inst->src[1])) {
1985             inst->opcode = BRW_OPCODE_MOV;
1986             inst->src[1] = reg_undef;
1987             inst->predicate = BRW_PREDICATE_NONE;
1988             inst->predicate_inverse = false;
1989             progress = true;
1990          } else if (inst->saturate && inst->src[1].file == IMM) {
1991             switch (inst->conditional_mod) {
1992             case BRW_CONDITIONAL_LE:
1993             case BRW_CONDITIONAL_L:
1994                switch (inst->src[1].type) {
1995                case BRW_REGISTER_TYPE_F:
1996                   if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
1997                      inst->opcode = BRW_OPCODE_MOV;
1998                      inst->src[1] = reg_undef;
1999                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2000                      progress = true;
2001                   }
2002                   break;
2003                default:
2004                   break;
2005                }
2006                break;
2007             case BRW_CONDITIONAL_GE:
2008             case BRW_CONDITIONAL_G:
2009                switch (inst->src[1].type) {
2010                case BRW_REGISTER_TYPE_F:
2011                   if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2012                      inst->opcode = BRW_OPCODE_MOV;
2013                      inst->src[1] = reg_undef;
2014                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2015                      progress = true;
2016                   }
2017                   break;
2018                default:
2019                   break;
2020                }
2021             default:
2022                break;
2023             }
2024          }
2025          break;
2026       case BRW_OPCODE_MAD:
2027          if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2028             inst->opcode = BRW_OPCODE_MOV;
2029             inst->src[1] = reg_undef;
2030             inst->src[2] = reg_undef;
2031             progress = true;
2032          } else if (inst->src[0].is_zero()) {
2033             inst->opcode = BRW_OPCODE_MUL;
2034             inst->src[0] = inst->src[2];
2035             inst->src[2] = reg_undef;
2036             progress = true;
2037          } else if (inst->src[1].is_one()) {
2038             inst->opcode = BRW_OPCODE_ADD;
2039             inst->src[1] = inst->src[2];
2040             inst->src[2] = reg_undef;
2041             progress = true;
2042          } else if (inst->src[2].is_one()) {
2043             inst->opcode = BRW_OPCODE_ADD;
2044             inst->src[2] = reg_undef;
2045             progress = true;
2046          } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2047             inst->opcode = BRW_OPCODE_ADD;
2048             inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2049             inst->src[2] = reg_undef;
2050             progress = true;
2051          }
2052          break;
2053       case SHADER_OPCODE_RCP: {
2054          fs_inst *prev = (fs_inst *)inst->prev;
2055          if (prev->opcode == SHADER_OPCODE_SQRT) {
2056             if (inst->src[0].equals(prev->dst)) {
2057                inst->opcode = SHADER_OPCODE_RSQ;
2058                inst->src[0] = prev->src[0];
2059                progress = true;
2060             }
2061          }
2062          break;
2063       }
2064       case SHADER_OPCODE_BROADCAST:
2065          if (is_uniform(inst->src[0])) {
2066             inst->opcode = BRW_OPCODE_MOV;
2067             inst->sources = 1;
2068             inst->force_writemask_all = true;
2069             progress = true;
2070          } else if (inst->src[1].file == IMM) {
2071             inst->opcode = BRW_OPCODE_MOV;
2072             inst->src[0] = component(inst->src[0],
2073                                      inst->src[1].fixed_hw_reg.dw1.ud);
2074             inst->sources = 1;
2075             inst->force_writemask_all = true;
2076             progress = true;
2077          }
2078          break;
2079
2080       default:
2081          break;
2082       }
2083
2084       /* Swap if src[0] is immediate. */
2085       if (progress && inst->is_commutative()) {
2086          if (inst->src[0].file == IMM) {
2087             fs_reg tmp = inst->src[1];
2088             inst->src[1] = inst->src[0];
2089             inst->src[0] = tmp;
2090          }
2091       }
2092    }
2093    return progress;
2094 }
2095
2096 /**
2097  * Optimize sample messages that have constant zero values for the trailing
2098  * texture coordinates. We can just reduce the message length for these
2099  * instructions instead of reserving a register for it. Trailing parameters
2100  * that aren't sent default to zero anyway. This will cause the dead code
2101  * eliminator to remove the MOV instruction that would otherwise be emitted to
2102  * set up the zero value.
2103  */
2104 bool
2105 fs_visitor::opt_zero_samples()
2106 {
2107    /* Gen4 infers the texturing opcode based on the message length so we can't
2108     * change it.
2109     */
2110    if (devinfo->gen < 5)
2111       return false;
2112
2113    bool progress = false;
2114
2115    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2116       if (!inst->is_tex())
2117          continue;
2118
2119       fs_inst *load_payload = (fs_inst *) inst->prev;
2120
2121       if (load_payload->is_head_sentinel() ||
2122           load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2123          continue;
2124
2125       /* We don't want to remove the message header or the first parameter.
2126        * Removing the first parameter is not allowed, see the Haswell PRM
2127        * volume 7, page 149:
2128        *
2129        *     "Parameter 0 is required except for the sampleinfo message, which
2130        *      has no parameter 0"
2131        */
2132       while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2133              load_payload->src[(inst->mlen - inst->header_size) /
2134                                (dispatch_width / 8) +
2135                                inst->header_size - 1].is_zero()) {
2136          inst->mlen -= dispatch_width / 8;
2137          progress = true;
2138       }
2139    }
2140
2141    if (progress)
2142       invalidate_live_intervals();
2143
2144    return progress;
2145 }
2146
2147 /**
2148  * Optimize sample messages which are followed by the final RT write.
2149  *
2150  * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its
2151  * results sent directly to the framebuffer, bypassing the EU.  Recognize the
2152  * final texturing results copied to the framebuffer write payload and modify
2153  * them to write to the framebuffer directly.
2154  */
2155 bool
2156 fs_visitor::opt_sampler_eot()
2157 {
2158    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2159
2160    if (stage != MESA_SHADER_FRAGMENT)
2161       return false;
2162
2163    if (devinfo->gen < 9 && !devinfo->is_cherryview)
2164       return false;
2165
2166    /* FINISHME: It should be possible to implement this optimization when there
2167     * are multiple drawbuffers.
2168     */
2169    if (key->nr_color_regions != 1)
2170       return false;
2171
2172    /* Look for a texturing instruction immediately before the final FB_WRITE. */
2173    fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2174    assert(fb_write->eot);
2175    assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2176
2177    fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2178
2179    /* There wasn't one; nothing to do. */
2180    if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2181       return false;
2182
2183    /* This optimisation doesn't seem to work for textureGather for some
2184     * reason. I can't find any documentation or known workarounds to indicate
2185     * that this is expected, but considering that it is probably pretty
2186     * unlikely that a shader would directly write out the results from
2187     * textureGather we might as well just disable it.
2188     */
2189    if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2190        tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2191       return false;
2192
2193    /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2194     * It's very likely to be the previous instruction.
2195     */
2196    fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2197    if (load_payload->is_head_sentinel() ||
2198        load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2199       return false;
2200
2201    assert(!tex_inst->eot); /* We can't get here twice */
2202    assert((tex_inst->offset & (0xff << 24)) == 0);
2203
2204    tex_inst->offset |= fb_write->target << 24;
2205    tex_inst->eot = true;
2206    tex_inst->dst = bld.null_reg_ud();
2207    fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2208
2209    /* If a header is present, marking the eot is sufficient. Otherwise, we need
2210     * to create a new LOAD_PAYLOAD command with the same sources and a space
2211     * saved for the header. Using a new destination register not only makes sure
2212     * we have enough space, but it will make sure the dead code eliminator kills
2213     * the instruction that this will replace.
2214     */
2215    if (tex_inst->header_size != 0)
2216       return true;
2217
2218    fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
2219                                  load_payload->sources + 1);
2220    fs_reg *new_sources =
2221       ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2222
2223    new_sources[0] = fs_reg();
2224    for (int i = 0; i < load_payload->sources; i++)
2225       new_sources[i+1] = load_payload->src[i];
2226
2227    /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2228     * requires a lot of information about the sources to appropriately figure
2229     * out the number of registers needed to be used. Given this stage in our
2230     * optimization, we may not have the appropriate GRFs required by
2231     * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2232     * manually emit the instruction.
2233     */
2234    fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2235                                                     load_payload->exec_size,
2236                                                     send_header,
2237                                                     new_sources,
2238                                                     load_payload->sources + 1);
2239
2240    new_load_payload->regs_written = load_payload->regs_written + 1;
2241    new_load_payload->header_size = 1;
2242    tex_inst->mlen++;
2243    tex_inst->header_size = 1;
2244    tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2245    tex_inst->src[0] = send_header;
2246
2247    return true;
2248 }
2249
2250 bool
2251 fs_visitor::opt_register_renaming()
2252 {
2253    bool progress = false;
2254    int depth = 0;
2255
2256    int remap[alloc.count];
2257    memset(remap, -1, sizeof(int) * alloc.count);
2258
2259    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2260       if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2261          depth++;
2262       } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2263                  inst->opcode == BRW_OPCODE_WHILE) {
2264          depth--;
2265       }
2266
2267       /* Rewrite instruction sources. */
2268       for (int i = 0; i < inst->sources; i++) {
2269          if (inst->src[i].file == GRF &&
2270              remap[inst->src[i].reg] != -1 &&
2271              remap[inst->src[i].reg] != inst->src[i].reg) {
2272             inst->src[i].reg = remap[inst->src[i].reg];
2273             progress = true;
2274          }
2275       }
2276
2277       const int dst = inst->dst.reg;
2278
2279       if (depth == 0 &&
2280           inst->dst.file == GRF &&
2281           alloc.sizes[inst->dst.reg] == inst->exec_size / 8 &&
2282           !inst->is_partial_write()) {
2283          if (remap[dst] == -1) {
2284             remap[dst] = dst;
2285          } else {
2286             remap[dst] = alloc.allocate(inst->exec_size / 8);
2287             inst->dst.reg = remap[dst];
2288             progress = true;
2289          }
2290       } else if (inst->dst.file == GRF &&
2291                  remap[dst] != -1 &&
2292                  remap[dst] != dst) {
2293          inst->dst.reg = remap[dst];
2294          progress = true;
2295       }
2296    }
2297
2298    if (progress) {
2299       invalidate_live_intervals();
2300
2301       for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2302          if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2303             delta_xy[i].reg = remap[delta_xy[i].reg];
2304          }
2305       }
2306    }
2307
2308    return progress;
2309 }
2310
2311 /**
2312  * Remove redundant or useless discard jumps.
2313  *
2314  * For example, we can eliminate jumps in the following sequence:
2315  *
2316  * discard-jump       (redundant with the next jump)
2317  * discard-jump       (useless; jumps to the next instruction)
2318  * placeholder-halt
2319  */
2320 bool
2321 fs_visitor::opt_redundant_discard_jumps()
2322 {
2323    bool progress = false;
2324
2325    bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2326
2327    fs_inst *placeholder_halt = NULL;
2328    foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2329       if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2330          placeholder_halt = inst;
2331          break;
2332       }
2333    }
2334
2335    if (!placeholder_halt)
2336       return false;
2337
2338    /* Delete any HALTs immediately before the placeholder halt. */
2339    for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2340         !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2341         prev = (fs_inst *) placeholder_halt->prev) {
2342       prev->remove(last_bblock);
2343       progress = true;
2344    }
2345
2346    if (progress)
2347       invalidate_live_intervals();
2348
2349    return progress;
2350 }
2351
2352 bool
2353 fs_visitor::compute_to_mrf()
2354 {
2355    bool progress = false;
2356    int next_ip = 0;
2357
2358    /* No MRFs on Gen >= 7. */
2359    if (devinfo->gen >= 7)
2360       return false;
2361
2362    calculate_live_intervals();
2363
2364    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2365       int ip = next_ip;
2366       next_ip++;
2367
2368       if (inst->opcode != BRW_OPCODE_MOV ||
2369           inst->is_partial_write() ||
2370           inst->dst.file != MRF || inst->src[0].file != GRF ||
2371           inst->dst.type != inst->src[0].type ||
2372           inst->src[0].abs || inst->src[0].negate ||
2373           !inst->src[0].is_contiguous() ||
2374           inst->src[0].subreg_offset)
2375          continue;
2376
2377       /* Work out which hardware MRF registers are written by this
2378        * instruction.
2379        */
2380       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2381       int mrf_high;
2382       if (inst->dst.reg & BRW_MRF_COMPR4) {
2383          mrf_high = mrf_low + 4;
2384       } else if (inst->exec_size == 16) {
2385          mrf_high = mrf_low + 1;
2386       } else {
2387          mrf_high = mrf_low;
2388       }
2389
2390       /* Can't compute-to-MRF this GRF if someone else was going to
2391        * read it later.
2392        */
2393       if (this->virtual_grf_end[inst->src[0].reg] > ip)
2394          continue;
2395
2396       /* Found a move of a GRF to a MRF.  Let's see if we can go
2397        * rewrite the thing that made this GRF to write into the MRF.
2398        */
2399       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2400          if (scan_inst->dst.file == GRF &&
2401              scan_inst->dst.reg == inst->src[0].reg) {
2402             /* Found the last thing to write our reg we want to turn
2403              * into a compute-to-MRF.
2404              */
2405
2406             /* If this one instruction didn't populate all the
2407              * channels, bail.  We might be able to rewrite everything
2408              * that writes that reg, but it would require smarter
2409              * tracking to delay the rewriting until complete success.
2410              */
2411             if (scan_inst->is_partial_write())
2412                break;
2413
2414             /* Things returning more than one register would need us to
2415              * understand coalescing out more than one MOV at a time.
2416              */
2417             if (scan_inst->regs_written > scan_inst->exec_size / 8)
2418                break;
2419
2420             /* SEND instructions can't have MRF as a destination. */
2421             if (scan_inst->mlen)
2422                break;
2423
2424             if (devinfo->gen == 6) {
2425                /* gen6 math instructions must have the destination be
2426                 * GRF, so no compute-to-MRF for them.
2427                 */
2428                if (scan_inst->is_math()) {
2429                   break;
2430                }
2431             }
2432
2433             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2434                /* Found the creator of our MRF's source value. */
2435                scan_inst->dst.file = MRF;
2436                scan_inst->dst.reg = inst->dst.reg;
2437                scan_inst->saturate |= inst->saturate;
2438                inst->remove(block);
2439                progress = true;
2440             }
2441             break;
2442          }
2443
2444          /* We don't handle control flow here.  Most computation of
2445           * values that end up in MRFs are shortly before the MRF
2446           * write anyway.
2447           */
2448          if (block->start() == scan_inst)
2449             break;
2450
2451          /* You can't read from an MRF, so if someone else reads our
2452           * MRF's source GRF that we wanted to rewrite, that stops us.
2453           */
2454          bool interfered = false;
2455          for (int i = 0; i < scan_inst->sources; i++) {
2456             if (scan_inst->src[i].file == GRF &&
2457                 scan_inst->src[i].reg == inst->src[0].reg &&
2458                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2459                interfered = true;
2460             }
2461          }
2462          if (interfered)
2463             break;
2464
2465          if (scan_inst->dst.file == MRF) {
2466             /* If somebody else writes our MRF here, we can't
2467              * compute-to-MRF before that.
2468              */
2469             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2470             int scan_mrf_high;
2471
2472             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2473                scan_mrf_high = scan_mrf_low + 4;
2474             } else if (scan_inst->exec_size == 16) {
2475                scan_mrf_high = scan_mrf_low + 1;
2476             } else {
2477                scan_mrf_high = scan_mrf_low;
2478             }
2479
2480             if (mrf_low == scan_mrf_low ||
2481                 mrf_low == scan_mrf_high ||
2482                 mrf_high == scan_mrf_low ||
2483                 mrf_high == scan_mrf_high) {
2484                break;
2485             }
2486          }
2487
2488          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2489             /* Found a SEND instruction, which means that there are
2490              * live values in MRFs from base_mrf to base_mrf +
2491              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2492              * above it.
2493              */
2494             if (mrf_low >= scan_inst->base_mrf &&
2495                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2496                break;
2497             }
2498             if (mrf_high >= scan_inst->base_mrf &&
2499                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2500                break;
2501             }
2502          }
2503       }
2504    }
2505
2506    if (progress)
2507       invalidate_live_intervals();
2508
2509    return progress;
2510 }
2511
2512 /**
2513  * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2514  * flow.  We could probably do better here with some form of divergence
2515  * analysis.
2516  */
2517 bool
2518 fs_visitor::eliminate_find_live_channel()
2519 {
2520    bool progress = false;
2521    unsigned depth = 0;
2522
2523    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2524       switch (inst->opcode) {
2525       case BRW_OPCODE_IF:
2526       case BRW_OPCODE_DO:
2527          depth++;
2528          break;
2529
2530       case BRW_OPCODE_ENDIF:
2531       case BRW_OPCODE_WHILE:
2532          depth--;
2533          break;
2534
2535       case FS_OPCODE_DISCARD_JUMP:
2536          /* This can potentially make control flow non-uniform until the end
2537           * of the program.
2538           */
2539          return progress;
2540
2541       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2542          if (depth == 0) {
2543             inst->opcode = BRW_OPCODE_MOV;
2544             inst->src[0] = fs_reg(0);
2545             inst->sources = 1;
2546             inst->force_writemask_all = true;
2547             progress = true;
2548          }
2549          break;
2550
2551       default:
2552          break;
2553       }
2554    }
2555
2556    return progress;
2557 }
2558
2559 /**
2560  * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2561  * instructions to FS_OPCODE_REP_FB_WRITE.
2562  */
2563 void
2564 fs_visitor::emit_repclear_shader()
2565 {
2566    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2567    int base_mrf = 1;
2568    int color_mrf = base_mrf + 2;
2569
2570    fs_inst *mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
2571                                      fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
2572
2573    fs_inst *write;
2574    if (key->nr_color_regions == 1) {
2575       write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2576       write->saturate = key->clamp_fragment_color;
2577       write->base_mrf = color_mrf;
2578       write->target = 0;
2579       write->header_size = 0;
2580       write->mlen = 1;
2581    } else {
2582       assume(key->nr_color_regions > 0);
2583       for (int i = 0; i < key->nr_color_regions; ++i) {
2584          write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2585          write->saturate = key->clamp_fragment_color;
2586          write->base_mrf = base_mrf;
2587          write->target = i;
2588          write->header_size = 2;
2589          write->mlen = 3;
2590       }
2591    }
2592    write->eot = true;
2593
2594    calculate_cfg();
2595
2596    assign_constant_locations();
2597    assign_curb_setup();
2598
2599    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2600    assert(mov->src[0].file == HW_REG);
2601    mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2602 }
2603
2604 /**
2605  * Walks through basic blocks, looking for repeated MRF writes and
2606  * removing the later ones.
2607  */
2608 bool
2609 fs_visitor::remove_duplicate_mrf_writes()
2610 {
2611    fs_inst *last_mrf_move[16];
2612    bool progress = false;
2613
2614    /* Need to update the MRF tracking for compressed instructions. */
2615    if (dispatch_width == 16)
2616       return false;
2617
2618    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2619
2620    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2621       if (inst->is_control_flow()) {
2622          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2623       }
2624
2625       if (inst->opcode == BRW_OPCODE_MOV &&
2626           inst->dst.file == MRF) {
2627          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2628          if (prev_inst && inst->equals(prev_inst)) {
2629             inst->remove(block);
2630             progress = true;
2631             continue;
2632          }
2633       }
2634
2635       /* Clear out the last-write records for MRFs that were overwritten. */
2636       if (inst->dst.file == MRF) {
2637          last_mrf_move[inst->dst.reg] = NULL;
2638       }
2639
2640       if (inst->mlen > 0 && inst->base_mrf != -1) {
2641          /* Found a SEND instruction, which will include two or fewer
2642           * implied MRF writes.  We could do better here.
2643           */
2644          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2645             last_mrf_move[inst->base_mrf + i] = NULL;
2646          }
2647       }
2648
2649       /* Clear out any MRF move records whose sources got overwritten. */
2650       if (inst->dst.file == GRF) {
2651          for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2652             if (last_mrf_move[i] &&
2653                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2654                last_mrf_move[i] = NULL;
2655             }
2656          }
2657       }
2658
2659       if (inst->opcode == BRW_OPCODE_MOV &&
2660           inst->dst.file == MRF &&
2661           inst->src[0].file == GRF &&
2662           !inst->is_partial_write()) {
2663          last_mrf_move[inst->dst.reg] = inst;
2664       }
2665    }
2666
2667    if (progress)
2668       invalidate_live_intervals();
2669
2670    return progress;
2671 }
2672
2673 static void
2674 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2675 {
2676    /* Clear the flag for registers that actually got read (as expected). */
2677    for (int i = 0; i < inst->sources; i++) {
2678       int grf;
2679       if (inst->src[i].file == GRF) {
2680          grf = inst->src[i].reg;
2681       } else if (inst->src[i].file == HW_REG &&
2682                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2683          grf = inst->src[i].fixed_hw_reg.nr;
2684       } else {
2685          continue;
2686       }
2687
2688       if (grf >= first_grf &&
2689           grf < first_grf + grf_len) {
2690          deps[grf - first_grf] = false;
2691          if (inst->exec_size == 16)
2692             deps[grf - first_grf + 1] = false;
2693       }
2694    }
2695 }
2696
2697 /**
2698  * Implements this workaround for the original 965:
2699  *
2700  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2701  *      check for post destination dependencies on this instruction, software
2702  *      must ensure that there is no destination hazard for the case of ‘write
2703  *      followed by a posted write’ shown in the following example.
2704  *
2705  *      1. mov r3 0
2706  *      2. send r3.xy <rest of send instruction>
2707  *      3. mov r2 r3
2708  *
2709  *      Due to no post-destination dependency check on the ‘send’, the above
2710  *      code sequence could have two instructions (1 and 2) in flight at the
2711  *      same time that both consider ‘r3’ as the target of their final writes.
2712  */
2713 void
2714 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2715                                                         fs_inst *inst)
2716 {
2717    int write_len = inst->regs_written;
2718    int first_write_grf = inst->dst.reg;
2719    bool needs_dep[BRW_MAX_MRF];
2720    assert(write_len < (int)sizeof(needs_dep) - 1);
2721
2722    memset(needs_dep, false, sizeof(needs_dep));
2723    memset(needs_dep, true, write_len);
2724
2725    clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2726
2727    /* Walk backwards looking for writes to registers we're writing which
2728     * aren't read since being written.  If we hit the start of the program,
2729     * we assume that there are no outstanding dependencies on entry to the
2730     * program.
2731     */
2732    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2733       /* If we hit control flow, assume that there *are* outstanding
2734        * dependencies, and force their cleanup before our instruction.
2735        */
2736       if (block->start() == scan_inst) {
2737          for (int i = 0; i < write_len; i++) {
2738             if (needs_dep[i])
2739                DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
2740          }
2741          return;
2742       }
2743
2744       /* We insert our reads as late as possible on the assumption that any
2745        * instruction but a MOV that might have left us an outstanding
2746        * dependency has more latency than a MOV.
2747        */
2748       if (scan_inst->dst.file == GRF) {
2749          for (int i = 0; i < scan_inst->regs_written; i++) {
2750             int reg = scan_inst->dst.reg + i;
2751
2752             if (reg >= first_write_grf &&
2753                 reg < first_write_grf + write_len &&
2754                 needs_dep[reg - first_write_grf]) {
2755                DEP_RESOLVE_MOV(bld.at(block, inst), reg);
2756                needs_dep[reg - first_write_grf] = false;
2757                if (scan_inst->exec_size == 16)
2758                   needs_dep[reg - first_write_grf + 1] = false;
2759             }
2760          }
2761       }
2762
2763       /* Clear the flag for registers that actually got read (as expected). */
2764       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2765
2766       /* Continue the loop only if we haven't resolved all the dependencies */
2767       int i;
2768       for (i = 0; i < write_len; i++) {
2769          if (needs_dep[i])
2770             break;
2771       }
2772       if (i == write_len)
2773          return;
2774    }
2775 }
2776
2777 /**
2778  * Implements this workaround for the original 965:
2779  *
2780  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2781  *      used as a destination register until after it has been sourced by an
2782  *      instruction with a different destination register.
2783  */
2784 void
2785 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2786 {
2787    int write_len = inst->regs_written;
2788    int first_write_grf = inst->dst.reg;
2789    bool needs_dep[BRW_MAX_MRF];
2790    assert(write_len < (int)sizeof(needs_dep) - 1);
2791
2792    memset(needs_dep, false, sizeof(needs_dep));
2793    memset(needs_dep, true, write_len);
2794    /* Walk forwards looking for writes to registers we're writing which aren't
2795     * read before being written.
2796     */
2797    foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2798       /* If we hit control flow, force resolve all remaining dependencies. */
2799       if (block->end() == scan_inst) {
2800          for (int i = 0; i < write_len; i++) {
2801             if (needs_dep[i])
2802                DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
2803          }
2804          return;
2805       }
2806
2807       /* Clear the flag for registers that actually got read (as expected). */
2808       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2809
2810       /* We insert our reads as late as possible since they're reading the
2811        * result of a SEND, which has massive latency.
2812        */
2813       if (scan_inst->dst.file == GRF &&
2814           scan_inst->dst.reg >= first_write_grf &&
2815           scan_inst->dst.reg < first_write_grf + write_len &&
2816           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2817          DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
2818          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2819       }
2820
2821       /* Continue the loop only if we haven't resolved all the dependencies */
2822       int i;
2823       for (i = 0; i < write_len; i++) {
2824          if (needs_dep[i])
2825             break;
2826       }
2827       if (i == write_len)
2828          return;
2829    }
2830 }
2831
2832 void
2833 fs_visitor::insert_gen4_send_dependency_workarounds()
2834 {
2835    if (devinfo->gen != 4 || devinfo->is_g4x)
2836       return;
2837
2838    bool progress = false;
2839
2840    /* Note that we're done with register allocation, so GRF fs_regs always
2841     * have a .reg_offset of 0.
2842     */
2843
2844    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2845       if (inst->mlen != 0 && inst->dst.file == GRF) {
2846          insert_gen4_pre_send_dependency_workarounds(block, inst);
2847          insert_gen4_post_send_dependency_workarounds(block, inst);
2848          progress = true;
2849       }
2850    }
2851
2852    if (progress)
2853       invalidate_live_intervals();
2854 }
2855
2856 /**
2857  * Turns the generic expression-style uniform pull constant load instruction
2858  * into a hardware-specific series of instructions for loading a pull
2859  * constant.
2860  *
2861  * The expression style allows the CSE pass before this to optimize out
2862  * repeated loads from the same offset, and gives the pre-register-allocation
2863  * scheduling full flexibility, while the conversion to native instructions
2864  * allows the post-register-allocation scheduler the best information
2865  * possible.
2866  *
2867  * Note that execution masking for setting up pull constant loads is special:
2868  * the channels that need to be written are unrelated to the current execution
2869  * mask, since a later instruction will use one of the result channels as a
2870  * source operand for all 8 or 16 of its channels.
2871  */
2872 void
2873 fs_visitor::lower_uniform_pull_constant_loads()
2874 {
2875    foreach_block_and_inst (block, fs_inst, inst, cfg) {
2876       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2877          continue;
2878
2879       if (devinfo->gen >= 7) {
2880          /* The offset arg before was a vec4-aligned byte offset.  We need to
2881           * turn it into a dword offset.
2882           */
2883          fs_reg const_offset_reg = inst->src[1];
2884          assert(const_offset_reg.file == IMM &&
2885                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2886          const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2887
2888          fs_reg payload, offset;
2889          if (devinfo->gen >= 9) {
2890             /* We have to use a message header on Skylake to get SIMD4x2
2891              * mode.  Reserve space for the register.
2892             */
2893             offset = payload = fs_reg(GRF, alloc.allocate(2));
2894             offset.reg_offset++;
2895             inst->mlen = 2;
2896          } else {
2897             offset = payload = fs_reg(GRF, alloc.allocate(1));
2898             inst->mlen = 1;
2899          }
2900
2901          /* This is actually going to be a MOV, but since only the first dword
2902           * is accessed, we have a special opcode to do just that one.  Note
2903           * that this needs to be an operation that will be considered a def
2904           * by live variable analysis, or register allocation will explode.
2905           */
2906          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2907                                                8, offset, const_offset_reg);
2908          setup->force_writemask_all = true;
2909
2910          setup->ir = inst->ir;
2911          setup->annotation = inst->annotation;
2912          inst->insert_before(block, setup);
2913
2914          /* Similarly, this will only populate the first 4 channels of the
2915           * result register (since we only use smear values from 0-3), but we
2916           * don't tell the optimizer.
2917           */
2918          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2919          inst->src[1] = payload;
2920          inst->base_mrf = -1;
2921
2922          invalidate_live_intervals();
2923       } else {
2924          /* Before register allocation, we didn't tell the scheduler about the
2925           * MRF we use.  We know it's safe to use this MRF because nothing
2926           * else does except for register spill/unspill, which generates and
2927           * uses its MRF within a single IR instruction.
2928           */
2929          inst->base_mrf = 14;
2930          inst->mlen = 1;
2931       }
2932    }
2933 }
2934
2935 bool
2936 fs_visitor::lower_load_payload()
2937 {
2938    bool progress = false;
2939
2940    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2941       if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2942          continue;
2943
2944       assert(inst->dst.file == MRF || inst->dst.file == GRF);
2945       assert(inst->saturate == false);
2946       fs_reg dst = inst->dst;
2947
2948       /* Get rid of COMPR4.  We'll add it back in if we need it */
2949       if (dst.file == MRF)
2950          dst.reg = dst.reg & ~BRW_MRF_COMPR4;
2951
2952       const fs_builder hbld = bld.exec_all().group(8, 0).at(block, inst);
2953
2954       for (uint8_t i = 0; i < inst->header_size; i++) {
2955          if (inst->src[i].file != BAD_FILE) {
2956             fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
2957             fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
2958             hbld.MOV(mov_dst, mov_src);
2959          }
2960          dst = offset(dst, hbld, 1);
2961       }
2962
2963       const fs_builder ibld = bld.exec_all(inst->force_writemask_all)
2964                                  .group(inst->exec_size, inst->force_sechalf)
2965                                  .at(block, inst);
2966
2967       if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
2968           inst->exec_size > 8) {
2969          /* In this case, the payload portion of the LOAD_PAYLOAD isn't
2970           * a straightforward copy.  Instead, the result of the
2971           * LOAD_PAYLOAD is treated as interleaved and the first four
2972           * non-header sources are unpacked as:
2973           *
2974           * m + 0: r0
2975           * m + 1: g0
2976           * m + 2: b0
2977           * m + 3: a0
2978           * m + 4: r1
2979           * m + 5: g1
2980           * m + 6: b1
2981           * m + 7: a1
2982           *
2983           * This is used for gen <= 5 fb writes.
2984           */
2985          assert(inst->exec_size == 16);
2986          assert(inst->header_size + 4 <= inst->sources);
2987          for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
2988             if (inst->src[i].file != BAD_FILE) {
2989                if (devinfo->has_compr4) {
2990                   fs_reg compr4_dst = retype(dst, inst->src[i].type);
2991                   compr4_dst.reg |= BRW_MRF_COMPR4;
2992                   ibld.MOV(compr4_dst, inst->src[i]);
2993                } else {
2994                   /* Platform doesn't have COMPR4.  We have to fake it */
2995                   fs_reg mov_dst = retype(dst, inst->src[i].type);
2996                   ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
2997                   mov_dst.reg += 4;
2998                   ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
2999                }
3000             }
3001
3002             dst.reg++;
3003          }
3004
3005          /* The loop above only ever incremented us through the first set
3006           * of 4 registers.  However, thanks to the magic of COMPR4, we
3007           * actually wrote to the first 8 registers, so we need to take
3008           * that into account now.
3009           */
3010          dst.reg += 4;
3011
3012          /* The COMPR4 code took care of the first 4 sources.  We'll let
3013           * the regular path handle any remaining sources.  Yes, we are
3014           * modifying the instruction but we're about to delete it so
3015           * this really doesn't hurt anything.
3016           */
3017          inst->header_size += 4;
3018       }
3019
3020       for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3021          if (inst->src[i].file != BAD_FILE)
3022             ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
3023          dst = offset(dst, ibld, 1);
3024       }
3025
3026       inst->remove(block);
3027       progress = true;
3028    }
3029
3030    if (progress)
3031       invalidate_live_intervals();
3032
3033    return progress;
3034 }
3035
3036 bool
3037 fs_visitor::lower_integer_multiplication()
3038 {
3039    bool progress = false;
3040
3041    /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3042     * directly, but Cherryview cannot.
3043     */
3044    if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3045       return false;
3046
3047    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3048       if (inst->opcode != BRW_OPCODE_MUL ||
3049           inst->dst.is_accumulator() ||
3050           (inst->dst.type != BRW_REGISTER_TYPE_D &&
3051            inst->dst.type != BRW_REGISTER_TYPE_UD))
3052          continue;
3053
3054       const fs_builder ibld = bld.at(block, inst);
3055
3056       /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3057        * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3058        * src1 are used.
3059        *
3060        * If multiplying by an immediate value that fits in 16-bits, do a
3061        * single MUL instruction with that value in the proper location.
3062        */
3063       if (inst->src[1].file == IMM &&
3064           inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3065          if (devinfo->gen < 7) {
3066             fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3067                        inst->dst.type);
3068             ibld.MOV(imm, inst->src[1]);
3069             ibld.MUL(inst->dst, imm, inst->src[0]);
3070          } else {
3071             ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
3072          }
3073       } else {
3074          /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3075           * do 32-bit integer multiplication in one instruction, but instead
3076           * must do a sequence (which actually calculates a 64-bit result):
3077           *
3078           *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
3079           *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
3080           *    mov(8)  g2<1>D     acc0<8,8,1>D
3081           *
3082           * But on Gen > 6, the ability to use second accumulator register
3083           * (acc1) for non-float data types was removed, preventing a simple
3084           * implementation in SIMD16. A 16-channel result can be calculated by
3085           * executing the three instructions twice in SIMD8, once with quarter
3086           * control of 1Q for the first eight channels and again with 2Q for
3087           * the second eight channels.
3088           *
3089           * Which accumulator register is implicitly accessed (by AccWrEnable
3090           * for instance) is determined by the quarter control. Unfortunately
3091           * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3092           * implicit accumulator access by an instruction with 2Q will access
3093           * acc1 regardless of whether the data type is usable in acc1.
3094           *
3095           * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3096           * integer data types.
3097           *
3098           * Since we only want the low 32-bits of the result, we can do two
3099           * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3100           * adjust the high result and add them (like the mach is doing):
3101           *
3102           *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
3103           *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
3104           *    shl(8)  g9<1>D     g8<8,8,1>D      16D
3105           *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
3106           *
3107           * We avoid the shl instruction by realizing that we only want to add
3108           * the low 16-bits of the "high" result to the high 16-bits of the
3109           * "low" result and using proper regioning on the add:
3110           *
3111           *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
3112           *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
3113           *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
3114           *
3115           * Since it does not use the (single) accumulator register, we can
3116           * schedule multi-component multiplications much better.
3117           */
3118
3119          if (inst->conditional_mod && inst->dst.is_null()) {
3120             inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3121                                inst->dst.type);
3122          }
3123          fs_reg low = inst->dst;
3124          fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3125                      inst->dst.type);
3126
3127          if (devinfo->gen >= 7) {
3128             fs_reg src1_0_w = inst->src[1];
3129             fs_reg src1_1_w = inst->src[1];
3130
3131             if (inst->src[1].file == IMM) {
3132                src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3133                src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3134             } else {
3135                src1_0_w.type = BRW_REGISTER_TYPE_UW;
3136                if (src1_0_w.stride != 0) {
3137                   assert(src1_0_w.stride == 1);
3138                   src1_0_w.stride = 2;
3139                }
3140
3141                src1_1_w.type = BRW_REGISTER_TYPE_UW;
3142                if (src1_1_w.stride != 0) {
3143                   assert(src1_1_w.stride == 1);
3144                   src1_1_w.stride = 2;
3145                }
3146                src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3147             }
3148             ibld.MUL(low, inst->src[0], src1_0_w);
3149             ibld.MUL(high, inst->src[0], src1_1_w);
3150          } else {
3151             fs_reg src0_0_w = inst->src[0];
3152             fs_reg src0_1_w = inst->src[0];
3153
3154             src0_0_w.type = BRW_REGISTER_TYPE_UW;
3155             if (src0_0_w.stride != 0) {
3156                assert(src0_0_w.stride == 1);
3157                src0_0_w.stride = 2;
3158             }
3159
3160             src0_1_w.type = BRW_REGISTER_TYPE_UW;
3161             if (src0_1_w.stride != 0) {
3162                assert(src0_1_w.stride == 1);
3163                src0_1_w.stride = 2;
3164             }
3165             src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3166
3167             ibld.MUL(low, src0_0_w, inst->src[1]);
3168             ibld.MUL(high, src0_1_w, inst->src[1]);
3169          }
3170
3171          fs_reg dst = inst->dst;
3172          dst.type = BRW_REGISTER_TYPE_UW;
3173          dst.subreg_offset = 2;
3174          dst.stride = 2;
3175
3176          high.type = BRW_REGISTER_TYPE_UW;
3177          high.stride = 2;
3178
3179          low.type = BRW_REGISTER_TYPE_UW;
3180          low.subreg_offset = 2;
3181          low.stride = 2;
3182
3183          ibld.ADD(dst, low, high);
3184
3185          if (inst->conditional_mod) {
3186             fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
3187             set_condmod(inst->conditional_mod,
3188                         ibld.MOV(null, inst->dst));
3189          }
3190       }
3191
3192       inst->remove(block);
3193       progress = true;
3194    }
3195
3196    if (progress)
3197       invalidate_live_intervals();
3198
3199    return progress;
3200 }
3201
3202 bool
3203 fs_visitor::lower_logical_sends()
3204 {
3205    bool progress = false;
3206
3207    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3208       const fs_builder ibld = bld.exec_all(inst->force_writemask_all)
3209                                  .group(inst->exec_size, inst->force_sechalf)
3210                                  .at(block, inst);
3211
3212       switch (inst->opcode) {
3213       default:
3214          continue;
3215       }
3216
3217       progress = true;
3218    }
3219
3220    if (progress)
3221       invalidate_live_intervals();
3222
3223    return progress;
3224 }
3225
3226 /**
3227  * Get the closest native SIMD width supported by the hardware for instruction
3228  * \p inst.  The instruction will be left untouched by
3229  * fs_visitor::lower_simd_width() if the returned value is equal to the
3230  * original execution size.
3231  */
3232 static unsigned
3233 get_lowered_simd_width(const struct brw_device_info *devinfo,
3234                        const fs_inst *inst)
3235 {
3236    switch (inst->opcode) {
3237    default:
3238       return inst->exec_size;
3239    }
3240 }
3241
3242 /**
3243  * The \p rows array of registers represents a \p num_rows by \p num_columns
3244  * matrix in row-major order, write it in column-major order into the register
3245  * passed as destination.  \p stride gives the separation between matrix
3246  * elements in the input in fs_builder::dispatch_width() units.
3247  */
3248 static void
3249 emit_transpose(const fs_builder &bld,
3250                const fs_reg &dst, const fs_reg *rows,
3251                unsigned num_rows, unsigned num_columns, unsigned stride)
3252 {
3253    fs_reg *const components = new fs_reg[num_rows * num_columns];
3254
3255    for (unsigned i = 0; i < num_columns; ++i) {
3256       for (unsigned j = 0; j < num_rows; ++j)
3257          components[num_rows * i + j] = offset(rows[j], bld, stride * i);
3258    }
3259
3260    bld.LOAD_PAYLOAD(dst, components, num_rows * num_columns, 0);
3261
3262    delete[] components;
3263 }
3264
3265 bool
3266 fs_visitor::lower_simd_width()
3267 {
3268    bool progress = false;
3269
3270    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3271       const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
3272
3273       if (lower_width != inst->exec_size) {
3274          /* Builder matching the original instruction. */
3275          const fs_builder ibld = bld.at(block, inst)
3276                                     .exec_all(inst->force_writemask_all)
3277                                     .group(inst->exec_size, inst->force_sechalf);
3278
3279          /* Split the copies in chunks of the execution width of either the
3280           * original or the lowered instruction, whichever is lower.
3281           */
3282          const unsigned copy_width = MIN2(lower_width, inst->exec_size);
3283          const unsigned n = inst->exec_size / copy_width;
3284          const unsigned dst_size = inst->regs_written * REG_SIZE /
3285             inst->dst.component_size(inst->exec_size);
3286          fs_reg dsts[4];
3287
3288          assert(n > 0 && n <= ARRAY_SIZE(dsts) &&
3289                 !inst->writes_accumulator && !inst->mlen);
3290
3291          for (unsigned i = 0; i < n; i++) {
3292             /* Emit a copy of the original instruction with the lowered width.
3293              * If the EOT flag was set throw it away except for the last
3294              * instruction to avoid killing the thread prematurely.
3295              */
3296             fs_inst split_inst = *inst;
3297             split_inst.exec_size = lower_width;
3298             split_inst.eot = inst->eot && i == n - 1;
3299
3300             /* Set exec_all if the lowered width is higher than the original
3301              * to avoid breaking the compiler invariant that no control
3302              * flow-masked instruction is wider than the shader's
3303              * dispatch_width.  Then transform the sources and destination and
3304              * emit the lowered instruction.
3305              */
3306             const fs_builder lbld = ibld.exec_all(lower_width > inst->exec_size)
3307                                         .group(lower_width, i);
3308
3309             for (unsigned j = 0; j < inst->sources; j++) {
3310                if (inst->src[j].file != BAD_FILE &&
3311                    !is_uniform(inst->src[j])) {
3312                   /* Get the i-th copy_width-wide chunk of the source. */
3313                   const fs_reg src = horiz_offset(inst->src[j], copy_width * i);
3314                   const unsigned src_size = inst->components_read(j);
3315
3316                   /* Use a trivial transposition to copy one every n
3317                    * copy_width-wide components of the register into a
3318                    * temporary passed as source to the lowered instruction.
3319                    */
3320                   split_inst.src[j] = lbld.vgrf(inst->src[j].type, src_size);
3321                   emit_transpose(lbld.group(copy_width, 0),
3322                                  split_inst.src[j], &src, 1, src_size, n);
3323                }
3324             }
3325
3326             if (inst->regs_written) {
3327                /* Allocate enough space to hold the result of the lowered
3328                 * instruction and fix up the number of registers written.
3329                 */
3330                split_inst.dst = dsts[i] =
3331                   lbld.vgrf(inst->dst.type, dst_size);
3332                split_inst.regs_written =
3333                   DIV_ROUND_UP(inst->regs_written * lower_width,
3334                                inst->exec_size);
3335             }
3336
3337             lbld.emit(split_inst);
3338          }
3339
3340          if (inst->regs_written) {
3341             /* Distance between useful channels in the temporaries, skipping
3342              * garbage if the lowered instruction is wider than the original.
3343              */
3344             const unsigned m = lower_width / copy_width;
3345
3346             /* Interleave the components of the result from the lowered
3347              * instructions.  We need to set exec_all() when copying more than
3348              * one half per component, because LOAD_PAYLOAD (in terms of which
3349              * emit_transpose is implemented) can only use the same channel
3350              * enable signals for all of its non-header sources.
3351              */
3352             emit_transpose(ibld.exec_all(inst->exec_size > copy_width)
3353                                .group(copy_width, 0),
3354                            inst->dst, dsts, n, dst_size, m);
3355          }
3356
3357          inst->remove(block);
3358          progress = true;
3359       }
3360    }
3361
3362    if (progress)
3363       invalidate_live_intervals();
3364
3365    return progress;
3366 }
3367
3368 void
3369 fs_visitor::dump_instructions()
3370 {
3371    dump_instructions(NULL);
3372 }
3373
3374 void
3375 fs_visitor::dump_instructions(const char *name)
3376 {
3377    FILE *file = stderr;
3378    if (name && geteuid() != 0) {
3379       file = fopen(name, "w");
3380       if (!file)
3381          file = stderr;
3382    }
3383
3384    if (cfg) {
3385       calculate_register_pressure();
3386       int ip = 0, max_pressure = 0;
3387       foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3388          max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3389          fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3390          dump_instruction(inst, file);
3391          ip++;
3392       }
3393       fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3394    } else {
3395       int ip = 0;
3396       foreach_in_list(backend_instruction, inst, &instructions) {
3397          fprintf(file, "%4d: ", ip++);
3398          dump_instruction(inst, file);
3399       }
3400    }
3401
3402    if (file != stderr) {
3403       fclose(file);
3404    }
3405 }
3406
3407 void
3408 fs_visitor::dump_instruction(backend_instruction *be_inst)
3409 {
3410    dump_instruction(be_inst, stderr);
3411 }
3412
3413 void
3414 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3415 {
3416    fs_inst *inst = (fs_inst *)be_inst;
3417
3418    if (inst->predicate) {
3419       fprintf(file, "(%cf0.%d) ",
3420              inst->predicate_inverse ? '-' : '+',
3421              inst->flag_subreg);
3422    }
3423
3424    fprintf(file, "%s", brw_instruction_name(inst->opcode));
3425    if (inst->saturate)
3426       fprintf(file, ".sat");
3427    if (inst->conditional_mod) {
3428       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3429       if (!inst->predicate &&
3430           (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3431                               inst->opcode != BRW_OPCODE_IF &&
3432                               inst->opcode != BRW_OPCODE_WHILE))) {
3433          fprintf(file, ".f0.%d", inst->flag_subreg);
3434       }
3435    }
3436    fprintf(file, "(%d) ", inst->exec_size);
3437
3438    if (inst->mlen) {
3439       fprintf(file, "(mlen: %d) ", inst->mlen);
3440    }
3441
3442    switch (inst->dst.file) {
3443    case GRF:
3444       fprintf(file, "vgrf%d", inst->dst.reg);
3445       if (alloc.sizes[inst->dst.reg] != inst->regs_written ||
3446           inst->dst.subreg_offset)
3447          fprintf(file, "+%d.%d",
3448                  inst->dst.reg_offset, inst->dst.subreg_offset);
3449       break;
3450    case MRF:
3451       fprintf(file, "m%d", inst->dst.reg);
3452       break;
3453    case BAD_FILE:
3454       fprintf(file, "(null)");
3455       break;
3456    case UNIFORM:
3457       fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3458       break;
3459    case ATTR:
3460       fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3461       break;
3462    case HW_REG:
3463       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3464          switch (inst->dst.fixed_hw_reg.nr) {
3465          case BRW_ARF_NULL:
3466             fprintf(file, "null");
3467             break;
3468          case BRW_ARF_ADDRESS:
3469             fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3470             break;
3471          case BRW_ARF_ACCUMULATOR:
3472             fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3473             break;
3474          case BRW_ARF_FLAG:
3475             fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3476                              inst->dst.fixed_hw_reg.subnr);
3477             break;
3478          default:
3479             fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3480                                inst->dst.fixed_hw_reg.subnr);
3481             break;
3482          }
3483       } else {
3484          fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3485       }
3486       if (inst->dst.fixed_hw_reg.subnr)
3487          fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3488       break;
3489    default:
3490       fprintf(file, "???");
3491       break;
3492    }
3493    fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3494
3495    for (int i = 0; i < inst->sources; i++) {
3496       if (inst->src[i].negate)
3497          fprintf(file, "-");
3498       if (inst->src[i].abs)
3499          fprintf(file, "|");
3500       switch (inst->src[i].file) {
3501       case GRF:
3502          fprintf(file, "vgrf%d", inst->src[i].reg);
3503          if (alloc.sizes[inst->src[i].reg] != (unsigned)inst->regs_read(i) ||
3504              inst->src[i].subreg_offset)
3505             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3506                     inst->src[i].subreg_offset);
3507          break;
3508       case MRF:
3509          fprintf(file, "***m%d***", inst->src[i].reg);
3510          break;
3511       case ATTR:
3512          fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3513          break;
3514       case UNIFORM:
3515          fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3516          if (inst->src[i].reladdr) {
3517             fprintf(file, "+reladdr");
3518          } else if (inst->src[i].subreg_offset) {
3519             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3520                     inst->src[i].subreg_offset);
3521          }
3522          break;
3523       case BAD_FILE:
3524          fprintf(file, "(null)");
3525          break;
3526       case IMM:
3527          switch (inst->src[i].type) {
3528          case BRW_REGISTER_TYPE_F:
3529             fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3530             break;
3531          case BRW_REGISTER_TYPE_W:
3532          case BRW_REGISTER_TYPE_D:
3533             fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3534             break;
3535          case BRW_REGISTER_TYPE_UW:
3536          case BRW_REGISTER_TYPE_UD:
3537             fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3538             break;
3539          case BRW_REGISTER_TYPE_VF:
3540             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3541                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
3542                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
3543                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3544                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3545             break;
3546          default:
3547             fprintf(file, "???");
3548             break;
3549          }
3550          break;
3551       case HW_REG:
3552          if (inst->src[i].fixed_hw_reg.negate)
3553             fprintf(file, "-");
3554          if (inst->src[i].fixed_hw_reg.abs)
3555             fprintf(file, "|");
3556          if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3557             switch (inst->src[i].fixed_hw_reg.nr) {
3558             case BRW_ARF_NULL:
3559                fprintf(file, "null");
3560                break;
3561             case BRW_ARF_ADDRESS:
3562                fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3563                break;
3564             case BRW_ARF_ACCUMULATOR:
3565                fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3566                break;
3567             case BRW_ARF_FLAG:
3568                fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3569                                 inst->src[i].fixed_hw_reg.subnr);
3570                break;
3571             default:
3572                fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3573                                   inst->src[i].fixed_hw_reg.subnr);
3574                break;
3575             }
3576          } else {
3577             fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3578          }
3579          if (inst->src[i].fixed_hw_reg.subnr)
3580             fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3581          if (inst->src[i].fixed_hw_reg.abs)
3582             fprintf(file, "|");
3583          break;
3584       default:
3585          fprintf(file, "???");
3586          break;
3587       }
3588       if (inst->src[i].abs)
3589          fprintf(file, "|");
3590
3591       if (inst->src[i].file != IMM) {
3592          fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3593       }
3594
3595       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3596          fprintf(file, ", ");
3597    }
3598
3599    fprintf(file, " ");
3600
3601    if (dispatch_width == 16 && inst->exec_size == 8) {
3602       if (inst->force_sechalf)
3603          fprintf(file, "2ndhalf ");
3604       else
3605          fprintf(file, "1sthalf ");
3606    }
3607
3608    fprintf(file, "\n");
3609 }
3610
3611 /**
3612  * Possibly returns an instruction that set up @param reg.
3613  *
3614  * Sometimes we want to take the result of some expression/variable
3615  * dereference tree and rewrite the instruction generating the result
3616  * of the tree.  When processing the tree, we know that the
3617  * instructions generated are all writing temporaries that are dead
3618  * outside of this tree.  So, if we have some instructions that write
3619  * a temporary, we're free to point that temp write somewhere else.
3620  *
3621  * Note that this doesn't guarantee that the instruction generated
3622  * only reg -- it might be the size=4 destination of a texture instruction.
3623  */
3624 fs_inst *
3625 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3626                                            fs_inst *end,
3627                                            const fs_reg &reg)
3628 {
3629    if (end == start ||
3630        end->is_partial_write() ||
3631        reg.reladdr ||
3632        !reg.equals(end->dst)) {
3633       return NULL;
3634    } else {
3635       return end;
3636    }
3637 }
3638
3639 void
3640 fs_visitor::setup_payload_gen6()
3641 {
3642    bool uses_depth =
3643       (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3644    unsigned barycentric_interp_modes =
3645       (stage == MESA_SHADER_FRAGMENT) ?
3646       ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3647
3648    assert(devinfo->gen >= 6);
3649
3650    /* R0-1: masks, pixel X/Y coordinates. */
3651    payload.num_regs = 2;
3652    /* R2: only for 32-pixel dispatch.*/
3653
3654    /* R3-26: barycentric interpolation coordinates.  These appear in the
3655     * same order that they appear in the brw_wm_barycentric_interp_mode
3656     * enum.  Each set of coordinates occupies 2 registers if dispatch width
3657     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
3658     * appear if they were enabled using the "Barycentric Interpolation
3659     * Mode" bits in WM_STATE.
3660     */
3661    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3662       if (barycentric_interp_modes & (1 << i)) {
3663          payload.barycentric_coord_reg[i] = payload.num_regs;
3664          payload.num_regs += 2;
3665          if (dispatch_width == 16) {
3666             payload.num_regs += 2;
3667          }
3668       }
3669    }
3670
3671    /* R27: interpolated depth if uses source depth */
3672    if (uses_depth) {
3673       payload.source_depth_reg = payload.num_regs;
3674       payload.num_regs++;
3675       if (dispatch_width == 16) {
3676          /* R28: interpolated depth if not SIMD8. */
3677          payload.num_regs++;
3678       }
3679    }
3680    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3681    if (uses_depth) {
3682       payload.source_w_reg = payload.num_regs;
3683       payload.num_regs++;
3684       if (dispatch_width == 16) {
3685          /* R30: interpolated W if not SIMD8. */
3686          payload.num_regs++;
3687       }
3688    }
3689
3690    if (stage == MESA_SHADER_FRAGMENT) {
3691       brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3692       brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3693       prog_data->uses_pos_offset = key->compute_pos_offset;
3694       /* R31: MSAA position offsets. */
3695       if (prog_data->uses_pos_offset) {
3696          payload.sample_pos_reg = payload.num_regs;
3697          payload.num_regs++;
3698       }
3699    }
3700
3701    /* R32: MSAA input coverage mask */
3702    if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3703       assert(devinfo->gen >= 7);
3704       payload.sample_mask_in_reg = payload.num_regs;
3705       payload.num_regs++;
3706       if (dispatch_width == 16) {
3707          /* R33: input coverage mask if not SIMD8. */
3708          payload.num_regs++;
3709       }
3710    }
3711
3712    /* R34-: bary for 32-pixel. */
3713    /* R58-59: interp W for 32-pixel. */
3714
3715    if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3716       source_depth_to_render_target = true;
3717    }
3718 }
3719
3720 void
3721 fs_visitor::setup_vs_payload()
3722 {
3723    /* R0: thread header, R1: urb handles */
3724    payload.num_regs = 2;
3725 }
3726
3727 void
3728 fs_visitor::setup_cs_payload()
3729 {
3730    assert(devinfo->gen >= 7);
3731
3732    payload.num_regs = 1;
3733 }
3734
3735 void
3736 fs_visitor::assign_binding_table_offsets()
3737 {
3738    assert(stage == MESA_SHADER_FRAGMENT);
3739    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3740    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3741    uint32_t next_binding_table_offset = 0;
3742
3743    /* If there are no color regions, we still perform an FB write to a null
3744     * renderbuffer, which we place at surface index 0.
3745     */
3746    prog_data->binding_table.render_target_start = next_binding_table_offset;
3747    next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3748
3749    assign_common_binding_table_offsets(next_binding_table_offset);
3750 }
3751
3752 void
3753 fs_visitor::calculate_register_pressure()
3754 {
3755    invalidate_live_intervals();
3756    calculate_live_intervals();
3757
3758    unsigned num_instructions = 0;
3759    foreach_block(block, cfg)
3760       num_instructions += block->instructions.length();
3761
3762    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3763
3764    for (unsigned reg = 0; reg < alloc.count; reg++) {
3765       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3766          regs_live_at_ip[ip] += alloc.sizes[reg];
3767    }
3768 }
3769
3770 void
3771 fs_visitor::optimize()
3772 {
3773    /* bld is the common builder object pointing at the end of the program we
3774     * used to translate it into i965 IR.  For the optimization and lowering
3775     * passes coming next, any code added after the end of the program without
3776     * having explicitly called fs_builder::at() clearly points at a mistake.
3777     * Ideally optimization passes wouldn't be part of the visitor so they
3778     * wouldn't have access to bld at all, but they do, so just in case some
3779     * pass forgets to ask for a location explicitly set it to NULL here to
3780     * make it trip.
3781     */
3782    bld = bld.at(NULL, NULL);
3783
3784    split_virtual_grfs();
3785
3786    move_uniform_array_access_to_pull_constants();
3787    assign_constant_locations();
3788    demote_pull_constants();
3789
3790 #define OPT(pass, args...) ({                                           \
3791       pass_num++;                                                       \
3792       bool this_progress = pass(args);                                  \
3793                                                                         \
3794       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
3795          char filename[64];                                             \
3796          snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,              \
3797                   stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3798                                                                         \
3799          backend_shader::dump_instructions(filename);                   \
3800       }                                                                 \
3801                                                                         \
3802       progress = progress || this_progress;                             \
3803       this_progress;                                                    \
3804    })
3805
3806    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3807       char filename[64];
3808       snprintf(filename, 64, "%s%d-%04d-00-start",
3809                stage_abbrev, dispatch_width,
3810                shader_prog ? shader_prog->Name : 0);
3811
3812       backend_shader::dump_instructions(filename);
3813    }
3814
3815    bool progress = false;
3816    int iteration = 0;
3817    int pass_num = 0;
3818
3819    OPT(lower_simd_width);
3820    OPT(lower_logical_sends);
3821
3822    do {
3823       progress = false;
3824       pass_num = 0;
3825       iteration++;
3826
3827       OPT(remove_duplicate_mrf_writes);
3828
3829       OPT(opt_algebraic);
3830       OPT(opt_cse);
3831       OPT(opt_copy_propagate);
3832       OPT(opt_peephole_predicated_break);
3833       OPT(opt_cmod_propagation);
3834       OPT(dead_code_eliminate);
3835       OPT(opt_peephole_sel);
3836       OPT(dead_control_flow_eliminate, this);
3837       OPT(opt_register_renaming);
3838       OPT(opt_redundant_discard_jumps);
3839       OPT(opt_saturate_propagation);
3840       OPT(opt_zero_samples);
3841       OPT(register_coalesce);
3842       OPT(compute_to_mrf);
3843       OPT(eliminate_find_live_channel);
3844
3845       OPT(compact_virtual_grfs);
3846    } while (progress);
3847
3848    pass_num = 0;
3849
3850    OPT(opt_sampler_eot);
3851
3852    if (OPT(lower_load_payload)) {
3853       split_virtual_grfs();
3854       OPT(register_coalesce);
3855       OPT(compute_to_mrf);
3856       OPT(dead_code_eliminate);
3857    }
3858
3859    OPT(opt_combine_constants);
3860    OPT(lower_integer_multiplication);
3861
3862    lower_uniform_pull_constant_loads();
3863 }
3864
3865 /**
3866  * Three source instruction must have a GRF/MRF destination register.
3867  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
3868  */
3869 void
3870 fs_visitor::fixup_3src_null_dest()
3871 {
3872    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3873       if (inst->is_3src() && inst->dst.is_null()) {
3874          inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3875                             inst->dst.type);
3876       }
3877    }
3878 }
3879
3880 void
3881 fs_visitor::allocate_registers()
3882 {
3883    bool allocated_without_spills;
3884
3885    static const enum instruction_scheduler_mode pre_modes[] = {
3886       SCHEDULE_PRE,
3887       SCHEDULE_PRE_NON_LIFO,
3888       SCHEDULE_PRE_LIFO,
3889    };
3890
3891    /* Try each scheduling heuristic to see if it can successfully register
3892     * allocate without spilling.  They should be ordered by decreasing
3893     * performance but increasing likelihood of allocating.
3894     */
3895    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3896       schedule_instructions(pre_modes[i]);
3897
3898       if (0) {
3899          assign_regs_trivial();
3900          allocated_without_spills = true;
3901       } else {
3902          allocated_without_spills = assign_regs(false);
3903       }
3904       if (allocated_without_spills)
3905          break;
3906    }
3907
3908    if (!allocated_without_spills) {
3909       /* We assume that any spilling is worse than just dropping back to
3910        * SIMD8.  There's probably actually some intermediate point where
3911        * SIMD16 with a couple of spills is still better.
3912        */
3913       if (dispatch_width == 16) {
3914          fail("Failure to register allocate.  Reduce number of "
3915               "live scalar values to avoid this.");
3916       } else {
3917          compiler->shader_perf_log(log_data,
3918                                    "%s shader triggered register spilling.  "
3919                                    "Try reducing the number of live scalar "
3920                                    "values to improve performance.\n",
3921                                    stage_name);
3922       }
3923
3924       /* Since we're out of heuristics, just go spill registers until we
3925        * get an allocation.
3926        */
3927       while (!assign_regs(true)) {
3928          if (failed)
3929             break;
3930       }
3931    }
3932
3933    /* This must come after all optimization and register allocation, since
3934     * it inserts dead code that happens to have side effects, and it does
3935     * so based on the actual physical registers in use.
3936     */
3937    insert_gen4_send_dependency_workarounds();
3938
3939    if (failed)
3940       return;
3941
3942    if (!allocated_without_spills)
3943       schedule_instructions(SCHEDULE_POST);
3944
3945    if (last_scratch > 0)
3946       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3947 }
3948
3949 bool
3950 fs_visitor::run_vs(gl_clip_plane *clip_planes)
3951 {
3952    assert(stage == MESA_SHADER_VERTEX);
3953
3954    assign_common_binding_table_offsets(0);
3955    setup_vs_payload();
3956
3957    if (shader_time_index >= 0)
3958       emit_shader_time_begin();
3959
3960    emit_nir_code();
3961
3962    if (failed)
3963       return false;
3964
3965    compute_clip_distance(clip_planes);
3966
3967    emit_urb_writes();
3968
3969    if (shader_time_index >= 0)
3970       emit_shader_time_end();
3971
3972    calculate_cfg();
3973
3974    optimize();
3975
3976    assign_curb_setup();
3977    assign_vs_urb_setup();
3978
3979    fixup_3src_null_dest();
3980    allocate_registers();
3981
3982    return !failed;
3983 }
3984
3985 bool
3986 fs_visitor::run_fs(bool do_rep_send)
3987 {
3988    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3989    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3990
3991    assert(stage == MESA_SHADER_FRAGMENT);
3992
3993    sanity_param_count = prog->Parameters->NumParameters;
3994
3995    assign_binding_table_offsets();
3996
3997    if (devinfo->gen >= 6)
3998       setup_payload_gen6();
3999    else
4000       setup_payload_gen4();
4001
4002    if (0) {
4003       emit_dummy_fs();
4004    } else if (do_rep_send) {
4005       assert(dispatch_width == 16);
4006       emit_repclear_shader();
4007    } else {
4008       if (shader_time_index >= 0)
4009          emit_shader_time_begin();
4010
4011       calculate_urb_setup();
4012       if (prog->InputsRead > 0) {
4013          if (devinfo->gen < 6)
4014             emit_interpolation_setup_gen4();
4015          else
4016             emit_interpolation_setup_gen6();
4017       }
4018
4019       /* We handle discards by keeping track of the still-live pixels in f0.1.
4020        * Initialize it with the dispatched pixels.
4021        */
4022       if (wm_prog_data->uses_kill) {
4023          fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4024          discard_init->flag_subreg = 1;
4025       }
4026
4027       /* Generate FS IR for main().  (the visitor only descends into
4028        * functions called "main").
4029        */
4030       emit_nir_code();
4031
4032       if (failed)
4033          return false;
4034
4035       if (wm_prog_data->uses_kill)
4036          bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
4037
4038       if (wm_key->alpha_test_func)
4039          emit_alpha_test();
4040
4041       emit_fb_writes();
4042
4043       if (shader_time_index >= 0)
4044          emit_shader_time_end();
4045
4046       calculate_cfg();
4047
4048       optimize();
4049
4050       assign_curb_setup();
4051       assign_urb_setup();
4052
4053       fixup_3src_null_dest();
4054       allocate_registers();
4055
4056       if (failed)
4057          return false;
4058    }
4059
4060    if (dispatch_width == 8)
4061       wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4062    else
4063       wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4064
4065    /* If any state parameters were appended, then ParameterValues could have
4066     * been realloced, in which case the driver uniform storage set up by
4067     * _mesa_associate_uniform_storage() would point to freed memory.  Make
4068     * sure that didn't happen.
4069     */
4070    assert(sanity_param_count == prog->Parameters->NumParameters);
4071
4072    return !failed;
4073 }
4074
4075 bool
4076 fs_visitor::run_cs()
4077 {
4078    assert(stage == MESA_SHADER_COMPUTE);
4079    assert(shader);
4080
4081    sanity_param_count = prog->Parameters->NumParameters;
4082
4083    assign_common_binding_table_offsets(0);
4084
4085    setup_cs_payload();
4086
4087    if (shader_time_index >= 0)
4088       emit_shader_time_begin();
4089
4090    emit_nir_code();
4091
4092    if (failed)
4093       return false;
4094
4095    emit_cs_terminate();
4096
4097    if (shader_time_index >= 0)
4098       emit_shader_time_end();
4099
4100    calculate_cfg();
4101
4102    optimize();
4103
4104    assign_curb_setup();
4105
4106    fixup_3src_null_dest();
4107    allocate_registers();
4108
4109    if (failed)
4110       return false;
4111
4112    /* If any state parameters were appended, then ParameterValues could have
4113     * been realloced, in which case the driver uniform storage set up by
4114     * _mesa_associate_uniform_storage() would point to freed memory.  Make
4115     * sure that didn't happen.
4116     */
4117    assert(sanity_param_count == prog->Parameters->NumParameters);
4118
4119    return !failed;
4120 }
4121
4122 const unsigned *
4123 brw_wm_fs_emit(struct brw_context *brw,
4124                void *mem_ctx,
4125                const struct brw_wm_prog_key *key,
4126                struct brw_wm_prog_data *prog_data,
4127                struct gl_fragment_program *fp,
4128                struct gl_shader_program *prog,
4129                unsigned *final_assembly_size)
4130 {
4131    bool start_busy = false;
4132    double start_time = 0;
4133
4134    if (unlikely(brw->perf_debug)) {
4135       start_busy = (brw->batch.last_bo &&
4136                     drm_intel_bo_busy(brw->batch.last_bo));
4137       start_time = get_time();
4138    }
4139
4140    struct brw_shader *shader = NULL;
4141    if (prog)
4142       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4143
4144    if (unlikely(INTEL_DEBUG & DEBUG_WM))
4145       brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4146
4147    int st_index8 = -1, st_index16 = -1;
4148    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
4149       st_index8 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS8);
4150       st_index16 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS16);
4151    }
4152
4153    /* Now the main event: Visit the shader IR and generate our FS IR for it.
4154     */
4155    fs_visitor v(brw->intelScreen->compiler, brw,
4156                 mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4157                 prog, &fp->Base, 8, st_index8);
4158    if (!v.run_fs(false /* do_rep_send */)) {
4159       if (prog) {
4160          prog->LinkStatus = false;
4161          ralloc_strcat(&prog->InfoLog, v.fail_msg);
4162       }
4163
4164       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4165                     v.fail_msg);
4166
4167       return NULL;
4168    }
4169
4170    cfg_t *simd16_cfg = NULL;
4171    fs_visitor v2(brw->intelScreen->compiler, brw,
4172                  mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4173                  prog, &fp->Base, 16, st_index16);
4174    if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4175       if (!v.simd16_unsupported) {
4176          /* Try a SIMD16 compile */
4177          v2.import_uniforms(&v);
4178          if (!v2.run_fs(brw->use_rep_send)) {
4179             perf_debug("SIMD16 shader failed to compile: %s", v2.fail_msg);
4180          } else {
4181             simd16_cfg = v2.cfg;
4182          }
4183       }
4184    }
4185
4186    cfg_t *simd8_cfg;
4187    int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4188    if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4189       simd8_cfg = NULL;
4190       prog_data->no_8 = true;
4191    } else {
4192       simd8_cfg = v.cfg;
4193       prog_data->no_8 = false;
4194    }
4195
4196    fs_generator g(brw->intelScreen->compiler, brw,
4197                   mem_ctx, (void *) key, &prog_data->base,
4198                   &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4199
4200    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4201       char *name;
4202       if (prog)
4203          name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4204                                 prog->Label ? prog->Label : "unnamed",
4205                                 prog->Name);
4206       else
4207          name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4208
4209       g.enable_debug(name);
4210    }
4211
4212    if (simd8_cfg)
4213       g.generate_code(simd8_cfg, 8);
4214    if (simd16_cfg)
4215       prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4216
4217    if (unlikely(brw->perf_debug) && shader) {
4218       if (shader->compiled_once)
4219          brw_wm_debug_recompile(brw, prog, key);
4220       shader->compiled_once = true;
4221
4222       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4223          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4224                     (get_time() - start_time) * 1000);
4225       }
4226    }
4227
4228    return g.get_assembly(final_assembly_size);
4229 }
4230
4231 extern "C" bool
4232 brw_fs_precompile(struct gl_context *ctx,
4233                   struct gl_shader_program *shader_prog,
4234                   struct gl_program *prog)
4235 {
4236    struct brw_context *brw = brw_context(ctx);
4237    struct brw_wm_prog_key key;
4238
4239    struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4240    struct brw_fragment_program *bfp = brw_fragment_program(fp);
4241    bool program_uses_dfdy = fp->UsesDFdy;
4242
4243    memset(&key, 0, sizeof(key));
4244
4245    if (brw->gen < 6) {
4246       if (fp->UsesKill)
4247          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4248
4249       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4250          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4251
4252       /* Just assume depth testing. */
4253       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4254       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4255    }
4256
4257    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4258                                          BRW_FS_VARYING_INPUT_MASK) > 16)
4259       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4260
4261    brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4262
4263    if (fp->Base.InputsRead & VARYING_BIT_POS) {
4264       key.drawable_height = ctx->DrawBuffer->Height;
4265    }
4266
4267    key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4268          ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4269          BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4270
4271    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4272       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4273                           key.nr_color_regions > 1;
4274    }
4275
4276    key.program_string_id = bfp->id;
4277
4278    uint32_t old_prog_offset = brw->wm.base.prog_offset;
4279    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4280
4281    bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4282
4283    brw->wm.base.prog_offset = old_prog_offset;
4284    brw->wm.prog_data = old_prog_data;
4285
4286    return success;
4287 }
4288
4289 void
4290 brw_setup_tex_for_precompile(struct brw_context *brw,
4291                              struct brw_sampler_prog_key_data *tex,
4292                              struct gl_program *prog)
4293 {
4294    const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4295    unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4296    for (unsigned i = 0; i < sampler_count; i++) {
4297       if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4298          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4299          tex->swizzles[i] =
4300             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4301       } else {
4302          /* Color sampler: assume no swizzling. */
4303          tex->swizzles[i] = SWIZZLE_XYZW;
4304       }
4305    }
4306 }